From bb9e76f47c374f9a84503b9a4b92b16070f4c23f Mon Sep 17 00:00:00 2001
From: vedithal-amd
Date: Tue, 18 Nov 2025 10:04:28 -0500
Subject: [PATCH 01/10] [rocprofiler-compute] Bump version and update changelog ahead of ROCm 7.2 release (#1908)

---
 projects/rocprofiler-compute/CHANGELOG.md | 22 +++++++++++-----------
 projects/rocprofiler-compute/VERSION | 2 +-
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md
index 1e13dd242d7..d9ba6781148 100644
--- a/projects/rocprofiler-compute/CHANGELOG.md
+++ b/projects/rocprofiler-compute/CHANGELOG.md
@@ -4,30 +4,30 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.

 ## Unreleased

+## ROCm Compute Profiler 3.4.0 for ROCm 7.2.0
+
 ### Added

-* Add `--list-blocks ` option to general options to list available IP blocks on specified arch (similar to `--list-metrics`), cannot be used with `--block`.
+* `--list-blocks ` option to general options. It lists the available IP blocks on the specified arch (similar to `--list-metrics`) but cannot be used with `--block`.

-* Added `config_delta/gfx950_diff.yaml` to analysis config yamls to track the revision between a gfx9 architecture against the latest supported architecture gfx950
+* `config_delta/gfx950_diff.yaml` to analysis config YAMLs to track the differences between gfx9xx GPUs and the latest supported gfx950 GPUs.

 * Analysis db features
-  * Add support for per kernel metrics analysis.
-  * Add support for dispatch timeline analysis.
-  * Show duration as median in addition to mean in kernel view.
+  * Adds support for per-kernel metrics analysis.
+  * Adds support for dispatch timeline analysis.
+  * Shows duration as median in addition to mean in kernel view.

 ### Changed

-* `-b/--block` accepts block alias(es) (See block aliases using command-line option `--list-blocks `).
+* `-b/--block` accepts block alias(es). List the available block aliases with the command-line option `--list-blocks `.

-* analysis configs yamls are now managed with the new config management workflow in `tools/config_management/`
+* Analysis config YAMLs are now managed with the new config management workflow in `tools/config_management/`.

 * `amdsmi` python API is used instead of `amd-smi` CLI to query GPU specifications.

-
 ### Removed
-* Removed `database` mode from `rocprofiler-compute`. This is to move our focus from grafana
-  and mongodb integration to other visualization methods such as:
-  * Analysis DB based Visualizer (upcoming)
+
+* Removed `database` mode (Grafana and MongoDB integration) from ROCm Compute Profiler in favor of other visualization methods, such as the upcoming Analysis DB-based Visualizer.
   * Plotly server based standalone GUI
   * Commandline based Textual User Interface

diff --git a/projects/rocprofiler-compute/VERSION b/projects/rocprofiler-compute/VERSION
index bea438e9ade..18091983f59 100644
--- a/projects/rocprofiler-compute/VERSION
+++ b/projects/rocprofiler-compute/VERSION
@@ -1 +1 @@
-3.3.1
+3.4.0

From 7b5042811bbb4d03228d749439e7162e608578ad Mon Sep 17 00:00:00 2001
From: ywang103-amd
Date: Tue, 18 Nov 2025 11:32:21 -0500
Subject: [PATCH 02/10] roll back json file processing logic for pc sampling (#1835)

* roll back json file processing logic for pc sampling

* format cmake files

* Revert "format cmake files"

This reverts commit e64df65a8f30abcb6738e3a0d7ffd4270bd1d302.
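For readers skimming the hunk below, here is a minimal, self-contained sketch of the record layout this rollback restores. The sample `grouped_data` values are invented for illustration; only the tuple ordering, the stall-reason filtering, and the sort key mirror the `parser.py` change that follows.

```python
# Illustrative data only: PC-sampling counts grouped by
# (code_object_id, code_object_offset, inst_index), each mapping to
# [total_count, count_issued, count_stalled, stall_reason_dict, dispatch_ids].
grouped_data = {
    (1, 0x44, 1): [4, 4, 0, {"MEM": 0}, [2]],
    (1, 0x40, 0): [10, 8, 2, {"MEM": 2, "ALU": 0}, [3, 1]],
}

sorted_counts = sorted(
    [
        (
            code_object_id,
            code_object_offset,
            inst_index,
            info[0],  # total_count
            info[1],  # count_issued
            info[2],  # count_stalled
            # Drop zero-valued stall reasons, sort the rest by count, descending.
            sorted(
                ((k, v) for k, v in info[3].items() if v > 0),
                key=lambda item: item[1],
                reverse=True,
            ),
            sorted(info[4]),  # sorted dispatch_ids list
        )
        for (code_object_id, code_object_offset, inst_index), info in grouped_data.items()
    ],
    key=lambda x: (x[0], x[1], x[2]),
)
print(sorted_counts[0])  # -> (1, 64, 0, 10, 8, 2, [('MEM', 2)], [1, 3])
```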
---
 .../rocprofiler-compute/src/utils/parser.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/projects/rocprofiler-compute/src/utils/parser.py b/projects/rocprofiler-compute/src/utils/parser.py
index 6b59b34d6a5..6e9499acd31 100755
--- a/projects/rocprofiler-compute/src/utils/parser.py
+++ b/projects/rocprofiler-compute/src/utils/parser.py
@@ -1320,27 +1320,27 @@ def search_pc_sampling_record(
         return None

     # Convert to sorted list of tuples:
-    # (code_object_id, inst_index, code_object_offset, count, count_issued,
-    # count_stalled, stall_reason)
     sorted_counts = sorted(
         [
             (
                 code_object_id,
-                info[3],  # inst_index
-                offset,
-                info[0],  # count
+                code_object_offset,
+                inst_index,
+                info[0],  # total_count
                 info[1],  # count_issued
                 info[2],  # count_stalled
-                # For info[4] (stall_reason dict), remove the zero entries,
-                # sorting the remaining items by their values in descending order
                 sorted(
-                    ((k, v) for k, v in info[4].items() if v > 0),
+                    ((k, v) for k, v in info[3].items() if v > 0),
                     key=lambda item: item[1],
                     reverse=True,
                 ),  # sorted stall reasons
                 sorted(info[4]),  # sorted dispatch_ids list
             )
-            for (code_object_id, offset), info in grouped_data.items()
+            for (
+                code_object_id,
+                code_object_offset,
+                inst_index,
+            ), info in grouped_data.items()
         ],
         key=lambda x: (x[0], x[1], x[2]),
     )

From 5fc2009e9d55b0652abf05751c878ac012ded2ce Mon Sep 17 00:00:00 2001
From: cfallows-amd
Date: Tue, 18 Nov 2025 14:51:05 -0500
Subject: [PATCH 03/10] Add tencentos to roofline binary detection (#1830)

Force tencentos to use the rhel-based bin, since tencentos is branched off
of centos, which is a branch of fedora. Verified that rocprof-compute
correctly selects the bin to use, and that the roofline benchmark values
look similar between runs on rhel and tencentos4 docker images on the same
system.
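The one-line change below matches distro identifiers in an `id_list`. As a rough illustration only, here is a hypothetical sketch of how such a list could be assembled from `/etc/os-release`; the helper name and parsing details are assumptions, not the actual `utils.py` implementation.

```python
# Hypothetical helper (assumed, for illustration): collect ID and ID_LIKE
# tokens from /etc/os-release, e.g. ['tencentos', 'rhel', 'centos'].
from pathlib import Path


def read_os_release_ids(path: str = "/etc/os-release") -> list[str]:
    tokens: list[str] = []
    for line in Path(path).read_text().splitlines():
        if line.startswith(("ID=", "ID_LIKE=")):
            _, _, value = line.partition("=")
            tokens.extend(value.strip().strip('"').split())
    return [t.lower() for t in tokens]


if __name__ == "__main__":
    id_list = read_os_release_ids()
    # Mirrors the change below: tencentos joins fedora in selecting the
    # RHEL-based roofline binary ("platform:el8").
    if ("fedora" in id_list) or ("tencentos" in id_list):
        print("Using the RHEL-based roofline binary (platform:el8)")
```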
Signed-off-by: Carrie Fallows
---
 projects/rocprofiler-compute/src/utils/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/projects/rocprofiler-compute/src/utils/utils.py b/projects/rocprofiler-compute/src/utils/utils.py
index 2362f58a5f6..dd819a4973c 100644
--- a/projects/rocprofiler-compute/src/utils/utils.py
+++ b/projects/rocprofiler-compute/src/utils/utils.py
@@ -1200,7 +1200,7 @@ def detect_roofline(mspec: Any) -> dict[str, str]:  # noqa: ANN401
     elif "debian" in id_list:
         distro = "22.04"

-    elif "fedora" in id_list:
+    elif ("fedora" in id_list) or ("tencentos" in id_list):
         distro = "platform:el8"

     elif "suse" in id_list:

From 659775a2fd58d2081bb07380694dc5643d070bfe Mon Sep 17 00:00:00 2001
From: abchoudh-amd
Date: Wed, 19 Nov 2025 15:36:08 +0530
Subject: [PATCH 04/10] Split roofline tests and fix None outputs (#1913)

* Split roofline tests

* Use N/A for missing values

* Test eval_expression for no valid data

* Fixed tests

* Updated Changelog for N/A

* Fixed platform specific test failure
---
 projects/rocprofiler-compute/CHANGELOG.md | 3 +
 projects/rocprofiler-compute/CMakeLists.txt | 27 +-
 projects/rocprofiler-compute/pyproject.toml | 3 +-
 .../rocprofiler-compute/src/utils/parser.py | 16 +-
 projects/rocprofiler-compute/src/utils/tty.py | 2 +-
 .../tests/test_analyze_commands.py | 38 +-
 .../tests/test_profile_general.py | 359 +++++++++++++++++-
 7 files changed, 413 insertions(+), 35 deletions(-)

diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md
index d9ba6781148..755e2e34c9b 100644
--- a/projects/rocprofiler-compute/CHANGELOG.md
+++ b/projects/rocprofiler-compute/CHANGELOG.md
@@ -25,6 +25,9 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.

 * `amdsmi` python API is used instead of `amd-smi` CLI to query GPU specifications.

+* Empty cells are replaced with `N/A` for unavailable metrics in analysis.
+
+
 ### Removed

 * Removed `database` mode (Grafana and MongoDB integration) from ROCm Compute Profiler in favor of other visualization methods, such as the upcoming Analysis DB-based Visualizer.
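A small sketch of the `N/A` convention this patch introduces, mirroring the `to_avg()` and `eval_expression()` changes in `src/utils/parser.py` further down; the standalone function here is illustrative rather than the actual parser code.

```python
# Illustrative sketch (assumed function name): metric cells holding "N/A"
# (or an empty string) are treated as NaN when averaging, mirroring the
# to_avg() change below.
import numpy as np


def cell_to_float(a):
    if isinstance(a, str):
        if not a or a == "N/A":
            return np.nan
        return float(a)
    return float(a)


print(cell_to_float("N/A"))  # nan
print(cell_to_float(""))     # nan
print(cell_to_float("3.5"))  # 3.5
```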
diff --git a/projects/rocprofiler-compute/CMakeLists.txt b/projects/rocprofiler-compute/CMakeLists.txt index 971ecaa789e..47a7c976985 100644 --- a/projects/rocprofiler-compute/CMakeLists.txt +++ b/projects/rocprofiler-compute/CMakeLists.txt @@ -283,10 +283,19 @@ add_test( ) add_test( - NAME test_profile_roofline + NAME test_profile_roofline_1 COMMAND - ${Python3_EXECUTABLE} -m pytest -m roofline - --junitxml=tests/test_profile_roofline.xml ${COV_OPTION} + ${Python3_EXECUTABLE} -m pytest -m roofline_1 + --junitxml=tests/test_profile_roofline_1.xml ${COV_OPTION} + ${PROJECT_SOURCE_DIR}/tests/test_profile_general.py + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} +) + +add_test( + NAME test_profile_roofline_2 + COMMAND + ${Python3_EXECUTABLE} -m pytest -m roofline_2 + --junitxml=tests/test_profile_roofline_2.xml ${COV_OPTION} ${PROJECT_SOURCE_DIR}/tests/test_profile_general.py WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} ) @@ -335,8 +344,13 @@ set_tests_properties( test_profile_sort test_profile_misc test_profile_path - test_profile_roofline - PROPERTIES LABELS "profile" RESOURCE_GROUPS gpus:1 + test_profile_roofline_1 + test_profile_roofline_2 + test_profile_section + test_profile_pc_sampling + test_profile_sets_func + test_profile_live_attach_detach + PROPERTIES LABELS "profile" RESOURCE_GROUPS gpus:1 TIMEOUT 1800 ) # --------------------------- @@ -437,7 +451,8 @@ if(${ENABLE_COVERAGE}) test_profile_sort test_profile_misc test_profile_path - test_profile_roofline + test_profile_roofline_1 + test_profile_roofline_2 test_profile_section test_profile_sets_func test_analyze_commands diff --git a/projects/rocprofiler-compute/pyproject.toml b/projects/rocprofiler-compute/pyproject.toml index 2f76c714789..713c67928f1 100644 --- a/projects/rocprofiler-compute/pyproject.toml +++ b/projects/rocprofiler-compute/pyproject.toml @@ -103,7 +103,8 @@ markers = [ "sets_perf", "pc_sampling", "live_attach_detach", - "roofline", + "roofline_1", + "roofline_2", "path", "sci_notion", ] diff --git a/projects/rocprofiler-compute/src/utils/parser.py b/projects/rocprofiler-compute/src/utils/parser.py index 6e9499acd31..a2e67e906b0 100755 --- a/projects/rocprofiler-compute/src/utils/parser.py +++ b/projects/rocprofiler-compute/src/utils/parser.py @@ -166,7 +166,7 @@ def to_avg( else: return float(a) elif isinstance(a, str): - if not a: + if not a or a == "N/A": return np.nan return float(a) else: @@ -347,29 +347,27 @@ def eval_expression(self, expr: str) -> Union[str, float, int]: ) if eval_result is None or np.isnan(eval_result).any(): - return "" + return "N/A" else: return eval_result except (TypeError, NameError, KeyError) as exception: if "empirical_peak" in str(exception): - console_warning( - f"Missing empirical peak data: {exception}. Using empty value." - ) - return "" + console_warning(f"Missing empirical peak data: {exception}.") + return "N/A" else: console_warning(f"Failed to evaluate expression '{expr}': {exception}.") - return "" + return "N/A" except AttributeError as attribute_error: if str(attribute_error) == "'NoneType' object has no attribute 'get'": console_warning( f"Failed to evaluate expression '{expr}': {attribute_error}." 
                )
-                return ""
+                return "N/A"
             else:
                 console_error("analysis", str(attribute_error))
-                return ""
+                return "N/A"


 def build_eval_string(equation: str, coll_level: str, config: dict) -> str:
diff --git a/projects/rocprofiler-compute/src/utils/tty.py b/projects/rocprofiler-compute/src/utils/tty.py
index 76587b2ef01..396abc13b79 100644
--- a/projects/rocprofiler-compute/src/utils/tty.py
+++ b/projects/rocprofiler-compute/src/utils/tty.py
@@ -351,7 +351,7 @@ def process_table_data(
             # Base run - just add the rounded values
             cur_df_copy = copy.deepcopy(cur_df)
             cur_df_copy[header] = [
-                (round(float(x), args.decimal) if x != "" else x)
+                (round(float(x), args.decimal) if x != "N/A" else x)
                 for x in base_df[header]
             ]
             result_df = pd.concat([result_df, cur_df_copy[header]], axis=1)
diff --git a/projects/rocprofiler-compute/tests/test_analyze_commands.py b/projects/rocprofiler-compute/tests/test_analyze_commands.py
index f9e134872ed..e726848bab3 100644
--- a/projects/rocprofiler-compute/tests/test_analyze_commands.py
+++ b/projects/rocprofiler-compute/tests/test_analyze_commands.py
@@ -26,7 +26,7 @@
 import os
 import shutil
 from pathlib import Path
-from unittest.mock import Mock
+from unittest.mock import Mock, patch

 import pandas as pd
 import pytest
@@ -1379,6 +1379,42 @@ def test_update_functions_coverage():
     assert result[0].isupper()


+def test_metric_evaluation_no_valid_data():
+    """Test metric evaluation with no valid data"""
+    import numpy as np
+
+    from utils.parser import MetricEvaluator
+
+    metric_evaluator = MetricEvaluator({}, {}, {})
+    with patch("builtins.eval") as mock_eval, patch("builtins.compile"):
+        # Test when eval returns None
+        mock_eval.return_value = None
+        assert metric_evaluator.eval_expression("Mock Metric") == "N/A"
+
+        # Test when eval returns NaN
+        mock_eval.return_value = np.nan
+        assert metric_evaluator.eval_expression("Mock Metric") == "N/A"
+
+        # Test when eval raises an exception
+        mock_eval.side_effect = TypeError("Mock exception")
+        assert metric_evaluator.eval_expression("Mock Metric") == "N/A"
+
+        mock_eval.side_effect = NameError("empirical_peak")
+        assert metric_evaluator.eval_expression("Mock Metric") == "N/A"
+
+        mock_eval.side_effect = KeyError("Some KeyError")
+        assert metric_evaluator.eval_expression("Mock Metric") == "N/A"
+
+        with patch("sys.exit"):
+            mock_eval.side_effect = AttributeError("Some AttributeError")
+            assert metric_evaluator.eval_expression("Mock Metric") == "N/A"
+
+            mock_eval.side_effect = AttributeError(
+                "'NoneType' object has no attribute 'get'"
+            )
+            assert metric_evaluator.eval_expression("Mock Metric") == "N/A"
+
+
 @pytest.fixture
 def sample_time_data():
     return pd.DataFrame({
diff --git a/projects/rocprofiler-compute/tests/test_profile_general.py b/projects/rocprofiler-compute/tests/test_profile_general.py
index 1150b16a92d..9a936224aad 100644
--- a/projects/rocprofiler-compute/tests/test_profile_general.py
+++ b/projects/rocprofiler-compute/tests/test_profile_general.py
@@ -586,8 +586,116 @@ def test_path_rocpd(
     test_utils.clean_output_dir(config["cleanup"], workload_dir)


-@pytest.mark.roofline
-def test_roof_kernel_names(binary_handler_profile_rocprof_compute):
+@pytest.mark.path
+def test_path_csv(
+    binary_handler_profile_rocprof_compute, binary_handler_analyze_rocprof_compute
+):
+    workload_dir = test_utils.get_output_dir()
+    options = ["--format-rocprof-output", "csv"]
+    binary_handler_profile_rocprof_compute(config, workload_dir, options)
+
+    file_dict = test_utils.check_csv_files(workload_dir, num_devices, num_kernels)
+
all_csvs_mi100 = sorted([ + "SQC_DCACHE_INFLIGHT_LEVEL.csv", + "SQC_ICACHE_INFLIGHT_LEVEL.csv", + "SQ_IFETCH_LEVEL.csv", + "SQ_INST_LEVEL_LDS.csv", + "SQ_LEVEL_WAVES.csv", + "pmc_perf.csv", + "pmc_perf_0.csv", + "pmc_perf_1.csv", + "pmc_perf_2.csv", + "pmc_perf_3.csv", + "pmc_perf_4.csv", + "pmc_perf_5.csv", + "pmc_perf_6.csv", + "sysinfo.csv", + ]) + all_csvs_mi200 = sorted([ + "SQC_DCACHE_INFLIGHT_LEVEL.csv", + "SQC_ICACHE_INFLIGHT_LEVEL.csv", + "SQ_IFETCH_LEVEL.csv", + "SQ_INST_LEVEL_LDS.csv", + "SQ_INST_LEVEL_SMEM.csv", + "SQ_INST_LEVEL_VMEM.csv", + "SQ_LEVEL_WAVES.csv", + "pmc_perf.csv", + "pmc_perf_0.csv", + "pmc_perf_1.csv", + "pmc_perf_2.csv", + "pmc_perf_3.csv", + "pmc_perf_4.csv", + "pmc_perf_5.csv", + "sysinfo.csv", + ]) + all_csvs_mi300 = sorted([ + "SQC_DCACHE_INFLIGHT_LEVEL.csv", + "SQC_ICACHE_INFLIGHT_LEVEL.csv", + "SQ_IFETCH_LEVEL.csv", + "SQ_INST_LEVEL_LDS.csv", + "SQ_INST_LEVEL_SMEM.csv", + "SQ_INST_LEVEL_VMEM.csv", + "SQ_LEVEL_WAVES.csv", + "pmc_perf.csv", + "pmc_perf_0.csv", + "pmc_perf_1.csv", + "pmc_perf_2.csv", + "pmc_perf_3.csv", + "pmc_perf_4.csv", + "pmc_perf_5.csv", + "sysinfo.csv", + ]) + all_csvs_mi350 = sorted([ + "SQC_DCACHE_INFLIGHT_LEVEL.csv", + "SQC_ICACHE_INFLIGHT_LEVEL.csv", + "SQ_IFETCH_LEVEL.csv", + "SQ_INST_LEVEL_LDS.csv", + "SQ_INST_LEVEL_SMEM.csv", + "SQ_INST_LEVEL_VMEM.csv", + "SQ_LEVEL_WAVES.csv", + "pmc_perf.csv", + "pmc_perf_0.csv", + "pmc_perf_1.csv", + "pmc_perf_2.csv", + "pmc_perf_3.csv", + "pmc_perf_4.csv", + "pmc_perf_5.csv", + "pmc_perf_6.csv", + "pmc_perf_7.csv", + "pmc_perf_8.csv", + "pmc_perf_9.csv", + "pmc_perf_10.csv", + "pmc_perf_11.csv", + "pmc_perf_12.csv", + "sysinfo.csv", + ]) + + if soc == "MI100": + assert sorted(list(file_dict.keys())) == all_csvs_mi100 + elif soc == "MI200": + assert sorted(list(file_dict.keys())) == all_csvs_mi200 + elif "MI300" in soc: + assert sorted(list(file_dict.keys())) == all_csvs_mi300 + elif "MI350" in soc: + assert sorted(list(file_dict.keys())) == all_csvs_mi350 + else: + print(f"This test is not supported for {soc}") + assert 0 + + validate(inspect.stack()[0][3], workload_dir, file_dict) + + test_utils.clean_output_dir(config["cleanup"], workload_dir) + + +@pytest.mark.roofline_1 +def test_roof_basic_validation(binary_handler_profile_rocprof_compute): + """ + Test basic roofline PDF generation with full validation pipeline. + This test runs the complete validation flow including counter logging + and metric comparison (if enabled in config). Validates that roofline PDFs + are generated with the integrated multi-subplot layout (roofline plot + + plot points table + kernel names table). 
+ """ if soc in ("MI100"): # roofline is not supported on MI100 assert True @@ -615,7 +723,7 @@ def test_roof_kernel_names(binary_handler_profile_rocprof_compute): test_utils.clean_output_dir(config["cleanup"], workload_dir) -@pytest.mark.roofline +@pytest.mark.roofline_1 def test_roof_multiple_data_types(binary_handler_profile_rocprof_compute): """Test roofline with multiple data types""" if soc in ("MI100"): @@ -653,7 +761,7 @@ def test_roof_multiple_data_types(binary_handler_profile_rocprof_compute): test_utils.clean_output_dir(config["cleanup"], workload_dir) -@pytest.mark.roofline +@pytest.mark.roofline_1 def test_roof_invalid_data_type(binary_handler_profile_rocprof_compute): """Test roofline with invalid data type""" if soc in ("MI100"): @@ -683,7 +791,7 @@ def test_roof_invalid_data_type(binary_handler_profile_rocprof_compute): test_utils.clean_output_dir(config["cleanup"], workload_dir) -@pytest.mark.roofline +@pytest.mark.roofline_1 def test_roof_file_validation(binary_handler_profile_rocprof_compute): """Test file validation paths in roofline""" if soc in ("MI100"): @@ -712,7 +820,7 @@ def test_roof_file_validation(binary_handler_profile_rocprof_compute): test_utils.clean_output_dir(config["cleanup"], workload_dir) -@pytest.mark.roofline +@pytest.mark.roofline_1 def test_roof_rocpd(binary_handler_profile_rocprof_compute): if soc == "MI100": pytest.skip("Roofline not supported on MI100") @@ -793,7 +901,7 @@ def check_cols(table_name, orm_obj): test_utils.clean_output_dir(config["cleanup"], workload_dir) -@pytest.mark.roofline +@pytest.mark.roofline_1 def test_roofline_workload_dir_not_set_error(): """ Test roof_setup() error: "Workload directory is not set. Cannot perform setup." @@ -852,7 +960,7 @@ def __init__(self): pytest.skip("Could not import roofline module for direct testing") -@pytest.mark.roofline +@pytest.mark.roofline_1 def test_roof_workload_dir_validation(binary_handler_profile_rocprof_compute): if soc in ("MI100"): assert True @@ -876,7 +984,7 @@ def test_roof_workload_dir_validation(binary_handler_profile_rocprof_compute): test_utils.clean_output_dir(config["cleanup"], workload_dir) -@pytest.mark.roofline +@pytest.mark.roofline_1 def test_roofline_empty_kernel_names_handling(binary_handler_profile_rocprof_compute): """ Test empirical_roofline() when num_kernels == 0 @@ -903,7 +1011,7 @@ def test_roofline_empty_kernel_names_handling(binary_handler_profile_rocprof_com test_utils.clean_output_dir(config["cleanup"], workload_dir) -@pytest.mark.roofline +@pytest.mark.roofline_1 def test_roofline_kernel_filter(binary_handler_profile_rocprof_compute): """ Test roofline multi-attempt profiling with `--kernel` @@ -942,7 +1050,7 @@ def test_roofline_kernel_filter(binary_handler_profile_rocprof_compute): test_utils.clean_output_dir(config["cleanup"], workload_dir) -@pytest.mark.roofline +@pytest.mark.roofline_1 def test_roofline_unsupported_datatype_error(binary_handler_profile_rocprof_compute): """ Test datatype validation error in empirical_roofline() @@ -968,7 +1076,7 @@ def test_roofline_unsupported_datatype_error(binary_handler_profile_rocprof_comp test_utils.clean_output_dir(config["cleanup"], workload_dir) -@pytest.mark.roofline +@pytest.mark.roofline_2 def test_roof_plot_modes(binary_handler_profile_rocprof_compute): if soc in ("MI100"): assert True @@ -1015,7 +1123,7 @@ def test_roof_plot_modes(binary_handler_profile_rocprof_compute): test_utils.clean_output_dir(config["cleanup"], workload_dir) -@pytest.mark.roofline +@pytest.mark.roofline_2 def 
test_roof_cli_plot_generation(binary_handler_profile_rocprof_compute): if soc in ("MI100"): assert True @@ -1041,7 +1149,7 @@ def test_roof_cli_plot_generation(binary_handler_profile_rocprof_compute): pytest.skip("plotext not available for CLI testing") -@pytest.mark.roofline +@pytest.mark.roofline_2 def test_roof_error_handling(binary_handler_profile_rocprof_compute): if soc in ("MI100"): assert True @@ -1061,7 +1169,7 @@ def test_roof_error_handling(binary_handler_profile_rocprof_compute): test_utils.clean_output_dir(config["cleanup"], workload_dir) -@pytest.mark.roofline +@pytest.mark.roofline_2 def test_roofline_missing_file_handling(binary_handler_profile_rocprof_compute): """ Test handling of missing roofline.csv file @@ -1115,7 +1223,7 @@ def __init__(self): pytest.skip("Could not import roofline module for direct testing") -@pytest.mark.roofline +@pytest.mark.roofline_2 def test_roofline_invalid_datatype_cli(binary_handler_profile_rocprof_compute): """ Test CLI plot generation with invalid datatype @@ -1167,7 +1275,7 @@ def __init__(self): pytest.skip("Could not import roofline module for direct testing") -@pytest.mark.roofline +@pytest.mark.roofline_2 def test_roofline_ceiling_data_validation(binary_handler_profile_rocprof_compute): """ Test ceiling data validation in generate_plot() @@ -1187,6 +1295,223 @@ def test_roofline_ceiling_data_validation(binary_handler_profile_rocprof_compute test_utils.clean_output_dir(config["cleanup"], workload_dir) +@pytest.mark.roofline_2 +def test_roofline_plot_points_data_generation(): + """ + Test that plot points data structure is correctly generated with: + - Symbol assignments + - AI values (FLOPs/Byte) + - Performance values (GFLOPs/s) + - Memory/Compute bound status + - Cache level information + """ + if soc in ("MI100"): + pytest.skip("Skipping roofline test for MI100") + return + + sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + + try: + from roofline import Roofline + from utils.specs import generate_machine_specs + + class MockArgs: + def __init__(self): + self.roof_only = True + self.mem_level = "ALL" + self.sort = "ALL" + self.roofline_data_type = ["FP32"] + + args = MockArgs() + mspec = generate_machine_specs(None, None) + + mock_ai_data = { + "ai_l1": [[0.5, 1.2], [100.0, 150.0]], + "ai_l2": [[0.3, 0.8], [80.0, 120.0]], + "ai_hbm": [[0.1, 0.4], [50.0, 90.0]], + "kernelNames": ["kernel_A", "kernel_B"], + } + + mock_ceiling_data = { + "l1": [[0.01, 10], [10, 1000], 100], + "l2": [[0.01, 10], [10, 800], 80], + "hbm": [[0.01, 10], [10, 500], 50], + "valu": [[1, 100], [200, 200], 200], + "mfma": [[1, 100], [500, 500], 500], + } + + plot_points_data = [] + cache_colors = { + "ai_l1": "blue", + "ai_l2": "green", + "ai_hbm": "red", + } + + roofline_instance = Roofline(args, mspec) + + for cache_level in ["ai_l1", "ai_l2", "ai_hbm"]: + if cache_level in mock_ai_data: + x_vals = mock_ai_data[cache_level][0] + y_vals = mock_ai_data[cache_level][1] + num_kernels = len(mock_ai_data["kernelNames"]) + + for i in range(min(len(x_vals), num_kernels)): + if x_vals[i] > 0 and y_vals[i] > 0: + status = roofline_instance._determine_kernel_bound_status( + ai_value=x_vals[i], + performance=y_vals[i], + cache_level=cache_level, + ceiling_data=mock_ceiling_data, + ) + + plot_points_data.append({ + "symbol": None, + "color": cache_colors.get(cache_level, "gray"), + "cache_level": cache_level.replace("ai_", "", 1).upper(), + "ai": f"{x_vals[i]:.2f}", + "performance": f"{y_vals[i]:.2f}", + "status": status, + "kernel_idx": i, + }) + + assert 
len(plot_points_data) > 0, "Plot points data should not be empty" + + for point in plot_points_data: + assert "cache_level" in point + assert "ai" in point + assert "performance" in point + assert "status" in point + assert "kernel_idx" in point + assert "color" in point + + assert point["cache_level"] in ["L1", "L2", "HBM"] + + assert point["status"] in ["Memory Bound", "Compute Bound", "Unknown"] + + assert isinstance(point["ai"], str) + assert isinstance(point["performance"], str) + + except ImportError: + pytest.skip("Could not import roofline module for direct testing") + + +@pytest.mark.roofline_2 +def test_roofline_bound_status_calculation(): + """ + Test _determine_kernel_bound_status() correctly classifies kernels as + Memory Bound or Compute Bound based on their AI and performance vs ceilings. + """ + if soc in ("MI100"): + pytest.skip("Skipping roofline test for MI100") + return + + sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + + try: + from roofline import Roofline + from utils.specs import generate_machine_specs + + class MockArgs: + def __init__(self): + self.roof_only = True + self.mem_level = "ALL" + self.sort = "ALL" + self.roofline_data_type = ["FP32"] + + args = MockArgs() + mspec = generate_machine_specs(None, None) + roofline_instance = Roofline(args, mspec) + + ceiling_data = { + "hbm": [[0.01, 10], [10, 1000], 100], + "valu": [[1, 100], [200, 200], 200], + "mfma": [[1, 100], [500, 500], 500], + } + + status1 = roofline_instance._determine_kernel_bound_status( + ai_value=1.0, + performance=100.0, + cache_level="ai_hbm", + ceiling_data=ceiling_data, + ) + assert status1 == "Memory Bound", f"Expected Memory Bound, got {status1}" + + status2 = roofline_instance._determine_kernel_bound_status( + ai_value=5.0, + performance=150.0, + cache_level="ai_hbm", + ceiling_data=ceiling_data, + ) + assert status2 == "Compute Bound", f"Expected Compute Bound, got {status2}" + + status3 = roofline_instance._determine_kernel_bound_status( + ai_value=1.0, + performance=100.0, + cache_level="ai_l1", + ceiling_data=ceiling_data, + ) + assert status3 == "Unknown", f"Expected Unknown, got {status3}" + + bad_ceiling_data = { + "hbm": [100], + } + status4 = roofline_instance._determine_kernel_bound_status( + ai_value=1.0, + performance=100.0, + cache_level="ai_hbm", + ceiling_data=bad_ceiling_data, + ) + assert status4 == "Unknown", f"Expected Unknown for bad data, got {status4}" + + except ImportError: + pytest.skip("Could not import roofline module for direct testing") + + +@pytest.mark.roofline_2 +def test_roofline_many_kernels_dynamic_height(binary_handler_profile_rocprof_compute): + """ + Test roofline PDF generation with many kernels (10+) to verify: + - Dynamic height calculation works + - PDF is generated successfully + - File size is reasonable + + Note: This test uses a regular workload but validates the PDF structure + can handle the multi-subplot layout properly. 
+ """ + if soc in ("MI100"): + pytest.skip("Skipping roofline test for MI100") + return + + options = ["--device", "0", "--roof-only"] + workload_dir = test_utils.get_output_dir() + + returncode = binary_handler_profile_rocprof_compute( + config, workload_dir, options, check_success=False, roof=True + ) + + assert returncode == 0, "Roofline profiling should succeed" + + pdf_files = list(Path(workload_dir).glob("empirRoof_*.pdf")) + assert len(pdf_files) > 0, "At least one roofline PDF should be generated" + + for pdf_file in pdf_files: + assert pdf_file.exists(), f"PDF file {pdf_file} should exist" + file_size = pdf_file.stat().st_size + + # PDF should be larger than 10KB (has content) but less than 50MB (reasonable) + assert file_size > 10000, ( + f"PDF {pdf_file} too small ({file_size} bytes), may be malformed" + ) + assert file_size < 50000000, ( + f"PDF {pdf_file} too large ({file_size} bytes), may have issues" + ) + + file_dict = test_utils.check_csv_files(workload_dir, 1, num_kernels) + assert sorted(list(file_dict.keys())) == ROOF_ONLY_FILES + + test_utils.clean_output_dir(config["cleanup"], workload_dir) + + @pytest.mark.misc def test_device_filter(binary_handler_profile_rocprof_compute): options = ["--device", "0"] From b19f2dc5909a2e17679d0edfed995537222a8c67 Mon Sep 17 00:00:00 2001 From: xuchen-amd Date: Wed, 19 Nov 2025 10:46:02 -0500 Subject: [PATCH 05/10] [rocprof-compute] update yamls for docs (#1887) --- .../docs/data/metrics_description.yaml | 75 +++++---- .../gfx908/0200_system_speed_of_light.yaml | 18 +-- .../gfx908/0300_memory_chart.yaml | 8 +- .../gfx908/0400_roofline.yaml | 26 ++-- .../0500_command_processor_cpc_cpf.yaml | 2 +- .../gfx908/0600_workgroup_manager_spi.yaml | 2 +- .../gfx908/0700_wavefront.yaml | 12 +- .../gfx908/1200_local_data_share_lds.yaml | 2 +- .../gfx908/1300_instruction_cache.yaml | 2 +- .../gfx908/1400_scalar_l1_data_cache.yaml | 6 +- .../gfx908/1600_vector_l1_data_cache.yaml | 2 +- .../gfx90a/0200_system_speed_of_light.yaml | 18 +-- .../gfx90a/0300_memory_chart.yaml | 8 +- .../gfx90a/0400_roofline.yaml | 26 ++-- .../0500_command_processor_cpc_cpf.yaml | 2 +- .../gfx90a/0600_workgroup_manager_spi.yaml | 2 +- .../gfx90a/0700_wavefront.yaml | 12 +- .../1000_compute_units_instruction_mix.yaml | 2 +- .../1100_compute_units_compute_pipeline.yaml | 14 +- .../gfx90a/1200_local_data_share_lds.yaml | 2 +- .../gfx90a/1300_instruction_cache.yaml | 2 +- .../gfx90a/1400_scalar_l1_data_cache.yaml | 6 +- .../gfx90a/1600_vector_l1_data_cache.yaml | 2 +- .../gfx940/0200_system_speed_of_light.yaml | 18 +-- .../gfx940/0300_memory_chart.yaml | 8 +- .../gfx940/0400_roofline.yaml | 26 ++-- .../0500_command_processor_cpc_cpf.yaml | 2 +- .../gfx940/0600_workgroup_manager_spi.yaml | 2 +- .../gfx940/0700_wavefront.yaml | 12 +- .../1000_compute_units_instruction_mix.yaml | 2 +- .../1100_compute_units_compute_pipeline.yaml | 14 +- .../gfx940/1200_local_data_share_lds.yaml | 2 +- .../gfx940/1300_instruction_cache.yaml | 2 +- .../gfx940/1400_scalar_l1_data_cache.yaml | 6 +- .../gfx940/1600_vector_l1_data_cache.yaml | 2 +- .../gfx941/0200_system_speed_of_light.yaml | 18 +-- .../gfx941/0300_memory_chart.yaml | 8 +- .../gfx941/0400_roofline.yaml | 26 ++-- .../0500_command_processor_cpc_cpf.yaml | 2 +- .../gfx941/0600_workgroup_manager_spi.yaml | 2 +- .../gfx941/0700_wavefront.yaml | 12 +- .../1000_compute_units_instruction_mix.yaml | 2 +- .../1100_compute_units_compute_pipeline.yaml | 14 +- .../gfx941/1200_local_data_share_lds.yaml | 2 +- .../gfx941/1300_instruction_cache.yaml 
| 2 +- .../gfx941/1400_scalar_l1_data_cache.yaml | 6 +- .../gfx941/1600_vector_l1_data_cache.yaml | 2 +- .../gfx942/0200_system_speed_of_light.yaml | 18 +-- .../gfx942/0300_memory_chart.yaml | 8 +- .../gfx942/0400_roofline.yaml | 26 ++-- .../0500_command_processor_cpc_cpf.yaml | 2 +- .../gfx942/0600_workgroup_manager_spi.yaml | 2 +- .../gfx942/0700_wavefront.yaml | 12 +- .../1000_compute_units_instruction_mix.yaml | 2 +- .../1100_compute_units_compute_pipeline.yaml | 14 +- .../gfx942/1200_local_data_share_lds.yaml | 2 +- .../gfx942/1300_instruction_cache.yaml | 2 +- .../gfx942/1400_scalar_l1_data_cache.yaml | 6 +- .../gfx942/1600_vector_l1_data_cache.yaml | 2 +- .../gfx950/0200_system_speed_of_light.yaml | 18 +-- .../gfx950/0300_memory_chart.yaml | 8 +- .../gfx950/0400_roofline.yaml | 28 ++-- .../0500_command_processor_cpc_cpf.yaml | 2 +- .../gfx950/0600_workgroup_manager_spi.yaml | 2 +- .../gfx950/0700_wavefront.yaml | 12 +- .../1000_compute_units_instruction_mix.yaml | 2 +- .../1100_compute_units_compute_pipeline.yaml | 14 +- .../gfx950/1200_local_data_share_lds.yaml | 2 +- .../gfx950/1300_instruction_cache.yaml | 2 +- .../gfx950/1400_scalar_l1_data_cache.yaml | 6 +- .../gfx950/1600_vector_l1_data_cache.yaml | 2 +- .../gfx950/1700_l2_cache.yaml | 2 +- .../tools/autogen_hash.yaml | 118 +-------------- .../config_management/.config_hashes.json | 142 +++++++++--------- .../tools/config_management/README.md | 4 +- .../metric_description_manager.py | 27 +--- .../tools/config_management/utils.py | 2 +- .../gfx908_metrics_description.yaml | 82 +++++----- .../gfx90a_metrics_description.yaml | 102 ++++++------- .../gfx940_metrics_description.yaml | 106 ++++++------- .../gfx941_metrics_description.yaml | 106 ++++++------- .../gfx942_metrics_description.yaml | 106 ++++++------- .../gfx950_metrics_description.yaml | 110 +++++++------- 83 files changed, 708 insertions(+), 836 deletions(-) diff --git a/projects/rocprofiler-compute/docs/data/metrics_description.yaml b/projects/rocprofiler-compute/docs/data/metrics_description.yaml index 25635f32b8a..4c60cf24a18 100644 --- a/projects/rocprofiler-compute/docs/data/metrics_description.yaml +++ b/projects/rocprofiler-compute/docs/data/metrics_description.yaml @@ -1,7 +1,6 @@ -# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py Wavefront launch stats: AGPRs: - rst: |- + rst: >- The number of accumulation vector general-purpose registers allocated for the kernel, see :ref:`AGPRs `. Note: this may not exactly match the number of AGPRs requested by the compiler due to allocation granularity. @@ -12,7 +11,7 @@ Wavefront launch stats: total workgroup (or, block) size. unit: Work-Items LDS Allocation: - rst: |- + rst: >- The number of bytes of :doc:`LDS ` memory (or, shared memory) allocated for this kernel. Note: This may also be larger than what was requested at compile time due to both allocation granularity and dynamic per-dispatch @@ -23,7 +22,7 @@ Wavefront launch stats: `_. unit: Wavefronts SGPRs: - rst: |- + rst: >- The number of scalar general-purpose registers allocated for the kernel, see :ref:`SALU `. Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. @@ -38,14 +37,14 @@ Wavefront launch stats: as well as for register spills and restores. unit: Bytes per work-item Total Wavefronts: - rst: |- + rst: >- The total number of wavefronts launched as part of the kernel dispatch. 
On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront size is always 64 work-items. Thus, the total number of wavefronts should be equivalent to the ceiling of grid size divided by 64. unit: Wavefronts VGPRs: - rst: |- + rst: >- The number of architected vector general-purpose registers allocated for the kernel, see :ref:`VALU `. Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. @@ -98,14 +97,14 @@ Wavefront runtime stats: rst: The total duration of the executed kernel in cycles. unit: Cycles Wave Cycles: - rst: |- + rst: >- The number of cycles a wavefront in the kernel dispatch spent resident on a compute unit per :ref:`normalization unit `. This is averaged over all wavefronts in a kernel dispatch. Note: this should not be directly compared to the kernel cycles above. unit: Cycles per normalization unit Wavefront Occupancy: - rst: |- + rst: >- The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). @@ -148,7 +147,7 @@ Overall instruction mix: unit: Instructions VALU arithmetic instruction mix: Conversion: - rst: |- + rst: >- The total number of type conversion instructions (such as converting data to or from F32\u2194F64) issued to the VALU per :ref:`normalization unit `. @@ -240,7 +239,7 @@ MFMA instruction mix: unit: Instructions per normalization unit Compute Speed-of-Light: MFMA FLOPs (BF16): - rst: |- + rst: >- The total number of 16-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. This is also @@ -248,7 +247,7 @@ Compute Speed-of-Light: on the specific accelerator. unit: GFLOPs MFMA FLOPs (F16): - rst: |- + rst: >- The total number of 16-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -256,7 +255,7 @@ Compute Speed-of-Light: specific accelerator. unit: GFLOPs MFMA FLOPs (F32): - rst: |- + rst: >- The total number of 32-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 32-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -264,7 +263,7 @@ Compute Speed-of-Light: specific accelerator. unit: GFLOPs MFMA FLOPs (F64): - rst: |- + rst: >- The total number of 64-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 64-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -276,21 +275,21 @@ Compute Speed-of-Light: achievable on the specific accelerator. unit: GFLOPs MFMA IOPs (INT8): - rst: |- + rst: >- The total number of 8-bit integer :ref:`MFMA ` operations executed per second. Note: this does not include any 8-bit integer operations from :ref:`VALU ` instructions. This is also presented as a percent of the peak theoretical INT8 MFMA operations achievable on the specific accelerator. unit: GFLOPs VALU FLOPs: - rst: |- + rst: >- The total floating-point operations executed per second on the :ref:`VALU `. This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this does not include any floating-point operations from :ref:`MFMA ` instructions. 
unit: GFLOPs VALU IOPs: - rst: |- + rst: >- The total integer operations executed per second on the :ref:`VALU `. This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations @@ -361,7 +360,7 @@ Pipeline statistics: unit: Percent Arithmetic operations: BF16 OPs: - rst: |- + rst: >- The total number of 16-bit brain floating-point operations executed on either the :ref:`VALU ` or :ref:`MFMA ` units, per :ref:`normalization unit `. Note: on current CDNA accelerators, the VALU @@ -388,7 +387,7 @@ Arithmetic operations: `. unit: FLOP per normalization unit INT8 OPs: - rst: |- + rst: >- The total number of 8-bit integer operations executed on either the :ref:`VALU ` or :ref:`MFMA ` units, per :ref:`normalization unit `. Note: on current CDNA accelerators, the VALU has @@ -460,7 +459,7 @@ LDS Statistics: acknowledgment) required for an LDS instruction to complete. unit: Cycles Mem Violations: - rst: |- + rst: >- The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization unit `. This is unused and expected to be zero in most configurations for modern CDNA\u2122 accelerators. @@ -586,7 +585,7 @@ L1 Unified Translation Cache (UTCL1): per normalization unit. unit: Requests per normalization unit Permission Misses: - rst: |- + rst: >- The total number of translation requests that missed in the UTCL1 due to a permission error, per :ref:`normalization unit `. This is unused and expected to be zero in most configurations for modern @@ -925,7 +924,7 @@ L2-Fabric interface metrics: before data was returned to the L2. unit: Cycles Read Stall: - rst: |- + rst: >- The ratio of the total number of cycles the L2-Fabric interface was stalled on a read request to any destination (local HBM, remote PCIe\xAE connected accelerator or CPU, or remote Infinity Fabric connected accelerator [#inf]_ @@ -1198,7 +1197,7 @@ Scalar L1D Cache - L2 Interface: per :ref:`normalization unit `. unit: Requests per normalization unit Stall Cycles: - rst: |- + rst: >- The total number of cycles the sL1D\u2194 :doc:`L2 ` interface was stalled, per :ref:`normalization unit `. unit: Cycles per normalization unit @@ -1208,7 +1207,7 @@ Scalar L1D Cache - L2 Interface: CDNA accelerators. unit: Requests per normalization unit sL1D-L2 BW: - rst: |- + rst: >- The total number of bytes read from, written to, or atomically updated across the sL1D\u2194:doc:`L2 ` interface, divided by total duration. Note that sL1D writes and atomics are typically @@ -1227,7 +1226,7 @@ L1I Speed-of-Light: over the number of all L1I requests. unit: Percent L1I-L2 Bandwidth Utilization: - rst: |- + rst: >- The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth achieved. Calculated as the ratio of the total number of requests from the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles `. @@ -1286,7 +1285,7 @@ Workgroup manager utilizations: not fully saturated by the kernel, or a potential load-imbalance issue. unit: Percent Scheduler-Pipe Utilization: - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where the scheduler-pipes were actively doing any work. Note: this value is expected to range between 0% and 25%. See :ref:`desc-spi`. @@ -1332,7 +1331,7 @@ Workgroup Manager - Resource Allocation: lack of available :ref:`waveslots `. 
unit: Percent Not-scheduled Rate (Scheduler-Pipe): - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where a workgroup could not be scheduled to a :doc:`CU ` due to a bottleneck within the scheduler-pipes rather than a lack of a CU @@ -1341,7 +1340,7 @@ Workgroup Manager - Resource Allocation: description. unit: Percent Not-scheduled Rate (Workgroup Manager): - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where a workgroup could not be scheduled to a :doc:`CU ` due to a bottleneck within the workgroup manager rather than a lack of a @@ -1362,7 +1361,7 @@ Workgroup Manager - Resource Allocation: or newer accelerators (and small for previous accelerators). unit: Percent Scheduler-Pipe Stall Rate: - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where a workgroup could not be scheduled to a :doc:`CU ` due to occupancy limitations (like a lack of a CU or :ref:`SIMD ` @@ -1464,7 +1463,7 @@ System Speed-of-Light: over the total number of incoming cache line requests to the L2 cache. unit: Percent L2-Fabric Read BW: - rst: |- + rst: >- The number of bytes read by the L2 over the :ref:`Infinity Fabric\u2122 interface ` per unit time. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. @@ -1490,7 +1489,7 @@ System Speed-of-Light: Conflict Rate). unit: Conflicts/Access MFMA FLOPs (BF16): - rst: |- + rst: >- The total number of 16-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. This @@ -1498,7 +1497,7 @@ System Speed-of-Light: achievable on the specific accelerator. unit: GFLOPs MFMA FLOPs (F16): - rst: |- + rst: >- The total number of 16-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -1506,7 +1505,7 @@ System Speed-of-Light: specific accelerator. unit: GFLOPs MFMA FLOPs (F32): - rst: |- + rst: >- The total number of 32-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 32-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -1514,7 +1513,7 @@ System Speed-of-Light: specific accelerator. unit: GFLOPs MFMA FLOPs (F64): - rst: |- + rst: >- The total number of 64-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 64-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -1522,7 +1521,7 @@ System Speed-of-Light: specific accelerator. unit: GFLOPs MFMA FLOPs (F8): - rst: |- + rst: >- The total number of 8-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. This @@ -1531,7 +1530,7 @@ System Speed-of-Light: series and later only. unit: GFLOPs MFMA IOPs (Int8): - rst: |- + rst: >- The total number of 8-bit integer :ref:`MFMA ` operations executed per second. Note: this does not include any 8-bit integer operations from :ref:`VALU ` instructions. This is also presented as a percent @@ -1562,14 +1561,14 @@ System Speed-of-Light: time-averaged over all VALU instructions run on all wavefronts in the kernel. 
unit: Work-items VALU FLOPs: - rst: |- + rst: >- The total floating-point operations executed per second on the :ref:`VALU `. This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this does not include any floating-point operations from :ref:`MFMA ` instructions. unit: GFLOPs VALU IOPs: - rst: |- + rst: >- The total integer operations executed per second on the :ref:`VALU `. This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations @@ -1590,7 +1589,7 @@ System Speed-of-Light: issuing VMEM instructions over the :ref:`total CU cycles `. unit: Percent Wavefront Occupancy: - rst: |- + rst: >- The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). This is also presented as a percent of the peak theoretical diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml index ae059bc0cbd..6fca0c579c0 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml @@ -200,37 +200,37 @@ Panel Config: pop: None coll_level: SQ_IFETCH_LEVEL metrics_description: - VALU FLOPs: |- + VALU FLOPs: >- The total floating-point operations executed per second on the VALU. This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this does not include any floating-point operations from MFMA instructions. - VALU IOPs: |- + VALU IOPs: >- The total integer operations executed per second on the VALU. This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations from MFMA instructions. - MFMA FLOPs (BF16): |- + MFMA FLOPs (BF16): >- The total number of 16-bit brain floating point MFMA operations executed per second. Note: this does not include any 16-bit brain floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical BF16 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F16): |- + MFMA FLOPs (F16): >- The total number of 16-bit floating point MFMA operations executed per second. Note: this does not include any 16-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F16 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F32): |- + MFMA FLOPs (F32): >- The total number of 32-bit floating point MFMA operations executed per second. Note: this does not include any 32-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F32 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F64): |- + MFMA FLOPs (F64): >- The total number of 64-bit floating point MFMA operations executed per second. Note: this does not include any 64-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F64 MFMA operations achievable on the specific accelerator. 
- MFMA IOPs (Int8): |- + MFMA IOPs (Int8): >- The total number of 8-bit integer MFMA operations executed per second. Note: this does not include any 8-bit integer operations from VALU instructions. This is also presented as a percent of the peak theoretical INT8 MFMA operations @@ -263,7 +263,7 @@ Panel Config: IPC: The ratio of the total number of instructions executed on the CU over the total active CU cycles. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. - Wavefront Occupancy: |- + Wavefront Occupancy: >- The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). This is also presented as a percent of the peak theoretical @@ -296,7 +296,7 @@ Panel Config: if only a single value is requested in a cache line, the data movement will still be counted as a full cache line. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. - L2-Fabric Read BW: |- + L2-Fabric Read BW: >- The number of bytes read by the L2 over the Infinity Fabric\u2122 interface per unit time. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml index d817e3e02f2..db190f03ef6 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml @@ -170,15 +170,15 @@ Panel Config: Active CUs: Total number of active compute units (CUs) on the accelerator during the kernel execution. Num CUs: Total number of compute units (CUs) on the accelerator. - VGPR: |- + VGPR: >- The number of architected vector general-purpose registers allocated for the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. - SGPR: |- + SGPR: >- The number of scalar general-purpose registers allocated for the kernel, see SALU. Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. - LDS Allocation: |- + LDS Allocation: >- The number of bytes of LDS memory (or, shared memory) allocated for this kernel. Note: This may also be larger than what was requested at compile time due to both allocation granularity and dynamic per-dispatch LDS allocations. @@ -266,7 +266,7 @@ Panel Config: or data (atomic with return value) was returned to the L2. HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data from the accelerator's local HBM, per normalization unit. - HBM Wr: |- + HBM Wr: >- The total number of L2 requests to Infinity Fabric to write or atomically update 32B or 64B of data in the accelerator's local HBM, per normalization unit. 
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml index 6cf7344c4f5..971414d5e7b 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml @@ -134,48 +134,48 @@ Panel Config: / 1e9) ) / 1e9 unit: GFLOP/s metrics_description: - VALU FLOPs (F16): |- + VALU FLOPs (F16): >- The total 16-bit floating-point operations executed per second on the VALU. This is presented with the value of the peak empirical F16 FLOPs achievable on the specific accelerator. Note: this does not include any F16 operations from MFMA instructions. - VALU FLOPs (F32): |- + VALU FLOPs (F32): >- The total 32-bit floating-point operations executed per second on the VALU. This is presented with the value of the peak empirical F32 FLOPs achievable on the specific accelerator. Note: this does not include any F32 operations from MFMA instructions. - VALU FLOPs (F64): |- + VALU FLOPs (F64): >- The total 64-bit floating-point operations executed per second on the VALU. This is presented with the value of the peak empirical F64 FLOPs achievable on the specific accelerator. Note: this does not include any F64 operations from MFMA instructions. - MFMA FLOPs (BF16): |- + MFMA FLOPs (BF16): >- The total number of 16-bit brain floating point MFMA operations executed per second. Note: this does not include any 16-bit brain floating point operations from VALU instructions. The peak empirically measured BF16 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - MFMA FLOPs (F16): |- + MFMA FLOPs (F16): >- The total number of 16-bit floating point MFMA operations executed per second. Note: this does not include any 16-bit floating point operations from VALU instructions. The peak empirically measured F16 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - MFMA FLOPs (F32): |- + MFMA FLOPs (F32): >- The total number of 32-bit floating point MFMA operations executed per second. Note: this does not include any 32-bit floating point operations from VALU instructions. The peak empirically measured F32 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - MFMA FLOPs (F64): |- + MFMA FLOPs (F64): >- The total number of 64-bit floating point MFMA operations executed per second. Note: this does not include any 64-bit floating point operations from VALU instructions. The peak empirically measured F64 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - MFMA IOPs (Int8): |- + MFMA IOPs (Int8): >- The total number of 8-bit integer MFMA operations executed per second. Note: this does not include any 8-bit integer operations from VALU instructions. The peak empirically measured INT8 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - HBM Bandwidth: |- + HBM Bandwidth: >- The total number of bytes read from and written to High-Bandwidth Memory (HBM) per second. The peak empirically measured bandwidth achievable on the specific accelerator is displayed alongside for comparison. @@ -196,22 +196,22 @@ Panel Config: from, stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth example for more detail). 
The peak empirically measured LDS bandwidth achievable on the specific accelerator is displayed alongside for comparison. - AI L1: |- + AI L1: >- The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between the L1 cache and the processing units. This value is used as the x-coordinate for the L1 roofline. - AI L2: |- + AI L2: >- The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between the L2 cache and the L1 cache. This value is used as the x-coordinate for the L2 roofline. - AI HBM: |- + AI HBM: >- The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM). It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between HBM and the L2 cache. This value is used as the x-coordinate for the HBM roofline. - Performance (GFLOPs): |- + Performance (GFLOPs): >- The overall achieved performance, measured in GigaFLOPs per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point operations divided by the total execution time. This value is used as the y-coordinate diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0500_command_processor_cpc_cpf.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0500_command_processor_cpc_cpf.yaml index 118ce18331c..411c4c803bc 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0500_command_processor_cpc_cpf.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0500_command_processor_cpc_cpf.yaml @@ -141,6 +141,6 @@ Panel Config: the CPC-L2 interface was active doing any work. CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address translation - CPC-UTCL2 Utilization: |- + CPC-UTCL2 Utilization: >- Percent of total cycles counted by the CPC's L2 address translation interface where the CPC was busy doing address translation work. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0600_workgroup_manager_spi.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0600_workgroup_manager_spi.yaml index eb9845aa823..ca033cbdd7d 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0600_workgroup_manager_spi.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0600_workgroup_manager_spi.yaml @@ -168,7 +168,7 @@ Panel Config: in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck within the workgroup manager rather than a lack of a CU or SIMD with sufficient resources. - Not-scheduled Rate (Scheduler-Pipe): |- + Not-scheduled Rate (Scheduler-Pipe): >- The percent of total scheduler-pipe cycles in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck within the scheduler-pipes rather than a lack of a CU or SIMD with sufficient resources. 
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0700_wavefront.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0700_wavefront.yaml index e9e9407cfc2..dfc968df651 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0700_wavefront.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0700_wavefront.yaml @@ -121,26 +121,26 @@ Panel Config: Workgroup Size: The total number of work-items (or, threads) in each workgroup (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent to the total block size. - Total Wavefronts: |- + Total Wavefronts: >- The total number of wavefronts launched as part of the kernel dispatch. On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront size is always 64 work-items. Thus, the total number of wavefronts should be equivalent to the ceiling of grid size divided by 64. Saved Wavefronts: The total number of wavefronts saved at a context-save. Restored Wavefronts: The total number of wavefronts restored from a context-save. - VGPRs: |- + VGPRs: >- The number of architected vector general-purpose registers allocated for the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. - AGPRs: |- + AGPRs: >- The number of accumulation vector general-purpose registers allocated for the kernel, see AGPRs. Note: this may not exactly match the number of AGPRs requested by the compiler due to allocation granularity. - SGPRs: |- + SGPRs: >- The number of scalar general-purpose registers allocated for the kernel, see SALU. Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. - LDS Allocation: |- + LDS Allocation: >- The number of bytes of LDS memory (or, shared memory) allocated for this kernel. Note: This may also be larger than what was requested at compile time due to both allocation granularity and dynamic per-dispatch LDS allocations. @@ -173,7 +173,7 @@ Panel Config: rather than identification of a precise limiter. The sum of this metric, Issue Wait Cycles and Active Wait Cycles should be equal to the total Wave Cycles metric. - Wavefront Occupancy: |- + Wavefront Occupancy: >- The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1200_local_data_share_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1200_local_data_share_lds.yaml index b7767fea168..8e602e9f835 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1200_local_data_share_lds.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1200_local_data_share_lds.yaml @@ -140,7 +140,7 @@ Panel Config: unit. Unaligned Stall: The total number of cycles spent in the LDS scheduler due to stalls from non-dword aligned addresses per normalization unit. - Mem Violations: |- + Mem Violations: >- The total number of out-of-bounds accesses made to the LDS, per normalization unit. This is unused and expected to be zero in most configurations for modern CDNA\u2122 accelerators. 
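The Total Wavefronts description above fixes the wavefront size at 64 work-items for CDNA and GCN hardware, so the launched wavefront count reduces to a ceiling division. A small sketch of that arithmetic (function and variable names are hypothetical, not tool API):

    WAVEFRONT_SIZE = 64  # fixed for CDNA/GCN, per the description above

    def expected_total_wavefronts(grid_size: int) -> int:
        # Integer ceiling division: ceil(grid_size / 64) without floats.
        return (grid_size + WAVEFRONT_SIZE - 1) // WAVEFRONT_SIZE

    assert expected_total_wavefronts(256) == 4
    assert expected_total_wavefronts(257) == 5  # a partial wave still launches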
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1300_instruction_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1300_instruction_cache.yaml index 35808d9d960..ffc6e890b9b 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1300_instruction_cache.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1300_instruction_cache.yaml @@ -92,7 +92,7 @@ Panel Config: Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded line the cache. Calculated as the ratio of the number of L1I requests that hit over the number of all L1I requests. - L1I-L2 Bandwidth Utilization: |- + L1I-L2 Bandwidth Utilization: >- The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth achieved. Calculated as the ratio of the total number of requests from the L1I to the L2 cache over the total L1I-L2 interface cycles. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1400_scalar_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1400_scalar_l1_data_cache.yaml index 6b731648480..be2ce0db795 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1400_scalar_l1_data_cache.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1400_scalar_l1_data_cache.yaml @@ -154,7 +154,7 @@ Panel Config: sL1D-L2 BW Utilization: The percentage of the peak theoretical sL1D - L2 interface bandwidth acheived. Calculated as total number of bytes read from, written to, or atomically updated across the sL1D - L2 interface. - sL1D-L2 BW: |- + sL1D-L2 BW: >- The total number of bytes read from, written to, or atomically updated across the sL1D\u2194L2 interface, divided by total duration. Note that sL1D writes and atomics are typically unused on current CDNA accelerators, so @@ -164,7 +164,7 @@ Panel Config: unit. Hits: The total number of sL1D requests that hit on a previously loaded cache line, per normalization unit. - Misses - Non Duplicated: |- + Misses - Non Duplicated: >- The total number of sL1D requests that missed on a cache line that was not already pending due to another request, per normalization unit. Misses- Duplicated: The total number of sL1D requests that missed on a cache line @@ -187,6 +187,6 @@ Panel Config: unit. Write Req: The total number of write requests from sL1D to the L2, per normalization unit. Typically unused on current CDNA accelerators. - Stall Cycles: |- + Stall Cycles: >- The total number of cycles the sL1D\u2194L2 interface was stalled, per normalization unit. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1600_vector_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1600_vector_l1_data_cache.yaml index 2be99f875f8..55ba9d1447b 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1600_vector_l1_data_cache.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1600_vector_l1_data_cache.yaml @@ -436,7 +436,7 @@ Panel Config: per normalization unit. Translation Misses: The total number of translation requests that missed in the UTCL1 due to translation not being present in the cache, per normalization unit. 
- Permission Misses: |- + Permission Misses: >- The total number of translation requests that missed in the UTCL1 due to a permission error, per normalization unit. This is unused and expected to be zero in most configurations for modern CDNA\u2122 accelerators. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml index b8bdb7e6647..34eb6972cac 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml @@ -218,37 +218,37 @@ Panel Config: pop: None coll_level: SQ_IFETCH_LEVEL metrics_description: - VALU FLOPs: |- + VALU FLOPs: >- The total floating-point operations executed per second on the VALU. This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this does not include any floating-point operations from MFMA instructions. - VALU IOPs: |- + VALU IOPs: >- The total integer operations executed per second on the VALU. This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations from MFMA instructions. - MFMA FLOPs (BF16): |- + MFMA FLOPs (BF16): >- The total number of 16-bit brain floating point MFMA operations executed per second. Note: this does not include any 16-bit brain floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical BF16 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F16): |- + MFMA FLOPs (F16): >- The total number of 16-bit floating point MFMA operations executed per second. Note: this does not include any 16-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F16 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F32): |- + MFMA FLOPs (F32): >- The total number of 32-bit floating point MFMA operations executed per second. Note: this does not include any 32-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F32 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F64): |- + MFMA FLOPs (F64): >- The total number of 64-bit floating point MFMA operations executed per second. Note: this does not include any 64-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F64 MFMA operations achievable on the specific accelerator. - MFMA IOPs (Int8): |- + MFMA IOPs (Int8): >- The total number of 8-bit integer MFMA operations executed per second. Note: this does not include any 8-bit integer operations from VALU instructions. This is also presented as a percent of the peak theoretical INT8 MFMA operations @@ -281,7 +281,7 @@ Panel Config: IPC: The ratio of the total number of instructions executed on the CU over the total active CU cycles. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. - Wavefront Occupancy: |- + Wavefront Occupancy: >- The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). 
This is also presented as a percent of the peak theoretical @@ -314,7 +314,7 @@ Panel Config: if only a single value is requested in a cache line, the data movement will still be counted as a full cache line. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. - L2-Fabric Read BW: |- + L2-Fabric Read BW: >- The number of bytes read by the L2 over the Infinity Fabric\u2122 interface per unit time. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml index 1fd388cba15..8349f4f8fd8 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml @@ -170,15 +170,15 @@ Panel Config: Active CUs: Total number of active compute units (CUs) on the accelerator during the kernel execution. Num CUs: Total number of compute units (CUs) on the accelerator. - VGPR: |- + VGPR: >- The number of architected vector general-purpose registers allocated for the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. - SGPR: |- + SGPR: >- The number of scalar general-purpose registers allocated for the kernel, see SALU. Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. - LDS Allocation: |- + LDS Allocation: >- The number of bytes of LDS memory (or, shared memory) allocated for this kernel. Note: This may also be larger than what was requested at compile time due to both allocation granularity and dynamic per-dispatch LDS allocations. @@ -266,7 +266,7 @@ Panel Config: or data (atomic with return value) was returned to the L2. HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data from the accelerator's local HBM, per normalization unit. - HBM Wr: |- + HBM Wr: >- The total number of L2 requests to Infinity Fabric to write or atomically update 32B or 64B of data in the accelerator's local HBM, per normalization unit. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml index 4a8c962f3b4..28089ad5d35 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml @@ -132,48 +132,48 @@ Panel Config: / 1e9) ) / 1e9 unit: GFLOP/s metrics_description: - VALU FLOPs (F16): |- + VALU FLOPs (F16): >- The total 16-bit floating-point operations executed per second on the VALU. This is presented with the value of the peak empirical F16 FLOPs achievable on the specific accelerator. Note: this does not include any F16 operations from MFMA instructions. - VALU FLOPs (F32): |- + VALU FLOPs (F32): >- The total 32-bit floating-point operations executed per second on the VALU. This is presented with the value of the peak empirical F32 FLOPs achievable on the specific accelerator. Note: this does not include any F32 operations from MFMA instructions. 
- VALU FLOPs (F64): |- + VALU FLOPs (F64): >- The total 64-bit floating-point operations executed per second on the VALU. This is presented with the value of the peak empirical F64 FLOPs achievable on the specific accelerator. Note: this does not include any F64 operations from MFMA instructions. - MFMA FLOPs (BF16): |- + MFMA FLOPs (BF16): >- The total number of 16-bit brain floating point MFMA operations executed per second. Note: this does not include any 16-bit brain floating point operations from VALU instructions. The peak empirically measured BF16 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - MFMA FLOPs (F16): |- + MFMA FLOPs (F16): >- The total number of 16-bit floating point MFMA operations executed per second. Note: this does not include any 16-bit floating point operations from VALU instructions. The peak empirically measured F16 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - MFMA FLOPs (F32): |- + MFMA FLOPs (F32): >- The total number of 32-bit floating point MFMA operations executed per second. Note: this does not include any 32-bit floating point operations from VALU instructions. The peak empirically measured F32 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - MFMA FLOPs (F64): |- + MFMA FLOPs (F64): >- The total number of 64-bit floating point MFMA operations executed per second. Note: this does not include any 64-bit floating point operations from VALU instructions. The peak empirically measured F64 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - MFMA IOPs (Int8): |- + MFMA IOPs (Int8): >- The total number of 8-bit integer MFMA operations executed per second. Note: this does not include any 8-bit integer operations from VALU instructions. The peak empirically measured INT8 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - HBM Bandwidth: |- + HBM Bandwidth: >- The total number of bytes read from and written to High-Bandwidth Memory (HBM) per second. The peak empirically measured bandwidth achievable on the specific accelerator is displayed alongside for comparison. @@ -194,22 +194,22 @@ Panel Config: from, stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth example for more detail). The peak empirically measured LDS bandwidth achievable on the specific accelerator is displayed alongside for comparison. - AI L1: |- + AI L1: >- The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between the L1 cache and the processing units. This value is used as the x-coordinate for the L1 roofline. - AI L2: |- + AI L2: >- The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between the L2 cache and the L1 cache. This value is used as the x-coordinate for the L2 roofline. - AI HBM: |- + AI HBM: >- The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM). It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between HBM and the L2 cache. This value is used as the x-coordinate for the HBM roofline. - Performance (GFLOPs): |- + Performance (GFLOPs): >- The overall achieved performance, measured in GigaFLOPs per second (GFLOP/s). 
This is calculated as the sum of all VALU and MFMA floating-point operations divided by the total execution time. This value is used as the y-coordinate diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command_processor_cpc_cpf.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command_processor_cpc_cpf.yaml index 118ce18331c..411c4c803bc 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command_processor_cpc_cpf.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command_processor_cpc_cpf.yaml @@ -141,6 +141,6 @@ Panel Config: the CPC-L2 interface was active doing any work. CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address translation - CPC-UTCL2 Utilization: |- + CPC-UTCL2 Utilization: >- Percent of total cycles counted by the CPC's L2 address translation interface where the CPC was busy doing address translation work. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0600_workgroup_manager_spi.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0600_workgroup_manager_spi.yaml index eb9845aa823..ca033cbdd7d 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0600_workgroup_manager_spi.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0600_workgroup_manager_spi.yaml @@ -168,7 +168,7 @@ Panel Config: in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck within the workgroup manager rather than a lack of a CU or SIMD with sufficient resources. - Not-scheduled Rate (Scheduler-Pipe): |- + Not-scheduled Rate (Scheduler-Pipe): >- The percent of total scheduler-pipe cycles in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck within the scheduler-pipes rather than a lack of a CU or SIMD with sufficient resources. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0700_wavefront.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0700_wavefront.yaml index e9e9407cfc2..dfc968df651 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0700_wavefront.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0700_wavefront.yaml @@ -121,26 +121,26 @@ Panel Config: Workgroup Size: The total number of work-items (or, threads) in each workgroup (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent to the total block size. - Total Wavefronts: |- + Total Wavefronts: >- The total number of wavefronts launched as part of the kernel dispatch. On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront size is always 64 work-items. Thus, the total number of wavefronts should be equivalent to the ceiling of grid size divided by 64. Saved Wavefronts: The total number of wavefronts saved at a context-save. Restored Wavefronts: The total number of wavefronts restored from a context-save. - VGPRs: |- + VGPRs: >- The number of architected vector general-purpose registers allocated for the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. - AGPRs: |- + AGPRs: >- The number of accumulation vector general-purpose registers allocated for the kernel, see AGPRs. 
Note: this may not exactly match the number of AGPRs requested by the compiler due to allocation granularity. - SGPRs: |- + SGPRs: >- The number of scalar general-purpose registers allocated for the kernel, see SALU. Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. - LDS Allocation: |- + LDS Allocation: >- The number of bytes of LDS memory (or, shared memory) allocated for this kernel. Note: This may also be larger than what was requested at compile time due to both allocation granularity and dynamic per-dispatch LDS allocations. @@ -173,7 +173,7 @@ Panel Config: rather than identification of a precise limiter. The sum of this metric, Issue Wait Cycles and Active Wait Cycles should be equal to the total Wave Cycles metric. - Wavefront Occupancy: |- + Wavefront Occupancy: >- The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1000_compute_units_instruction_mix.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1000_compute_units_instruction_mix.yaml index ecf74736889..e35145781b7 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1000_compute_units_instruction_mix.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1000_compute_units_instruction_mix.yaml @@ -268,7 +268,7 @@ Panel Config: floating-point operands issued to the VALU per normalization unit. F64-Trans: The total number of transcendental instructions (such as sqrt) operating on 64-bit floating-point operands issued to the VALU per normalization unit. - Conversion: |- + Conversion: >- The total number of type conversion instructions (such as converting data to or from F32\u2194F64) issued to the VALU per normalization unit. Global/Generic Instr: The total number of global & generic memory instructions diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1100_compute_units_compute_pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1100_compute_units_compute_pipeline.yaml index af4ff8ef773..f7536c00065 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1100_compute_units_compute_pipeline.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1100_compute_units_compute_pipeline.yaml @@ -237,37 +237,37 @@ Panel Config: max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) unit: (OPs + $normUnit) metrics_description: - VALU FLOPs: |- + VALU FLOPs: >- The total floating-point operations executed per second on the VALU. This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this does not include any floating-point operations from MFMA instructions. - VALU IOPs: |- + VALU IOPs: >- The total integer operations executed per second on the VALU. This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations from MFMA instructions. - MFMA FLOPs (BF16): |- + MFMA FLOPs (BF16): >- The total number of 16-bit brain floating point MFMA operations executed per second. Note: this does not include any 16-bit brain floating point operations from VALU instructions. 
This is also presented as a percent of the peak theoretical BF16 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F16): |- + MFMA FLOPs (F16): >- The total number of 16-bit floating point MFMA operations executed per second. Note: this does not include any 16-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F16 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F32): |- + MFMA FLOPs (F32): >- The total number of 32-bit floating point MFMA operations executed per second. Note: this does not include any 32-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F32 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F64): |- + MFMA FLOPs (F64): >- The total number of 64-bit floating point MFMA operations executed per second. Note: this does not include any 64-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F64 MFMA operations achievable on the specific accelerator. - MFMA IOPs (INT8): |- + MFMA IOPs (INT8): >- The total number of 8-bit integer MFMA operations executed per second. Note: this does not include any 8-bit integer operations from VALU instructions. This is also presented as a percent of the peak theoretical INT8 MFMA operations diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1200_local_data_share_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1200_local_data_share_lds.yaml index b7767fea168..8e602e9f835 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1200_local_data_share_lds.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1200_local_data_share_lds.yaml @@ -140,7 +140,7 @@ Panel Config: unit. Unaligned Stall: The total number of cycles spent in the LDS scheduler due to stalls from non-dword aligned addresses per normalization unit. - Mem Violations: |- + Mem Violations: >- The total number of out-of-bounds accesses made to the LDS, per normalization unit. This is unused and expected to be zero in most configurations for modern CDNA\u2122 accelerators. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1300_instruction_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1300_instruction_cache.yaml index 35808d9d960..ffc6e890b9b 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1300_instruction_cache.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1300_instruction_cache.yaml @@ -92,7 +92,7 @@ Panel Config: Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded line the cache. Calculated as the ratio of the number of L1I requests that hit over the number of all L1I requests. - L1I-L2 Bandwidth Utilization: |- + L1I-L2 Bandwidth Utilization: >- The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth achieved. Calculated as the ratio of the total number of requests from the L1I to the L2 cache over the total L1I-L2 interface cycles. 
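The roofline descriptions above define both plot coordinates as simple ratios: Arithmetic Intensity is total FLOPs over total bytes moved at a given memory level (the x-coordinate), and achieved performance is total FLOPs over kernel duration (the y-coordinate). A sketch of that computation, with hypothetical names and example numbers:

    def arithmetic_intensity(total_flops: float, bytes_moved: float) -> float:
        # FLOPs per byte moved at a given memory level (x-coordinate).
        return total_flops / bytes_moved

    def performance_gflops(total_flops: float, duration_s: float) -> float:
        # Achieved GFLOP/s over the kernel's execution time (y-coordinate).
        return total_flops / duration_s / 1e9

    ai_hbm = arithmetic_intensity(2e12, 5e10)   # 40.0 FLOPs/byte vs. HBM traffic
    gflops = performance_gflops(2e12, 0.8)      # 2500.0 GFLOP/s achieved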
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1400_scalar_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1400_scalar_l1_data_cache.yaml index 6b731648480..be2ce0db795 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1400_scalar_l1_data_cache.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1400_scalar_l1_data_cache.yaml @@ -154,7 +154,7 @@ Panel Config: sL1D-L2 BW Utilization: The percentage of the peak theoretical sL1D - L2 interface bandwidth acheived. Calculated as total number of bytes read from, written to, or atomically updated across the sL1D - L2 interface. - sL1D-L2 BW: |- + sL1D-L2 BW: >- The total number of bytes read from, written to, or atomically updated across the sL1D\u2194L2 interface, divided by total duration. Note that sL1D writes and atomics are typically unused on current CDNA accelerators, so @@ -164,7 +164,7 @@ Panel Config: unit. Hits: The total number of sL1D requests that hit on a previously loaded cache line, per normalization unit. - Misses - Non Duplicated: |- + Misses - Non Duplicated: >- The total number of sL1D requests that missed on a cache line that was not already pending due to another request, per normalization unit. Misses- Duplicated: The total number of sL1D requests that missed on a cache line @@ -187,6 +187,6 @@ Panel Config: unit. Write Req: The total number of write requests from sL1D to the L2, per normalization unit. Typically unused on current CDNA accelerators. - Stall Cycles: |- + Stall Cycles: >- The total number of cycles the sL1D\u2194L2 interface was stalled, per normalization unit. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1600_vector_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1600_vector_l1_data_cache.yaml index 2be99f875f8..55ba9d1447b 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1600_vector_l1_data_cache.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1600_vector_l1_data_cache.yaml @@ -436,7 +436,7 @@ Panel Config: per normalization unit. Translation Misses: The total number of translation requests that missed in the UTCL1 due to translation not being present in the cache, per normalization unit. - Permission Misses: |- + Permission Misses: >- The total number of translation requests that missed in the UTCL1 due to a permission error, per normalization unit. This is unused and expected to be zero in most configurations for modern CDNA\u2122 accelerators. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml index 8aa72cb25df..5f76eb89372 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml @@ -227,12 +227,12 @@ Panel Config: pop: None coll_level: SQ_IFETCH_LEVEL metrics_description: - VALU FLOPs: |- + VALU FLOPs: >- The total floating-point operations executed per second on the VALU. This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. 
Note: this does not include any floating-point operations from MFMA instructions. - VALU IOPs: |- + VALU IOPs: >- The total integer operations executed per second on the VALU. This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations from @@ -242,27 +242,27 @@ Panel Config: from VALU instructions. This is also presented as a percent of the peak theoretical F8 MFMA operations achievable on the specific accelerator. It is supported on AMD Instinct MI300 series and later only. - MFMA FLOPs (BF16): |- + MFMA FLOPs (BF16): >- The total number of 16-bit brain floating point MFMA operations executed per second. Note: this does not include any 16-bit brain floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical BF16 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F16): |- + MFMA FLOPs (F16): >- The total number of 16-bit floating point MFMA operations executed per second. Note: this does not include any 16-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F16 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F32): |- + MFMA FLOPs (F32): >- The total number of 32-bit floating point MFMA operations executed per second. Note: this does not include any 32-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F32 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F64): |- + MFMA FLOPs (F64): >- The total number of 64-bit floating point MFMA operations executed per second. Note: this does not include any 64-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F64 MFMA operations achievable on the specific accelerator. - MFMA IOPs (Int8): |- + MFMA IOPs (Int8): >- The total number of 8-bit integer MFMA operations executed per second. Note: this does not include any 8-bit integer operations from VALU instructions. This is also presented as a percent of the peak theoretical INT8 MFMA operations @@ -295,7 +295,7 @@ Panel Config: IPC: The ratio of the total number of instructions executed on the CU over the total active CU cycles. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. - Wavefront Occupancy: |- + Wavefront Occupancy: >- The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). This is also presented as a percent of the peak theoretical @@ -328,7 +328,7 @@ Panel Config: if only a single value is requested in a cache line, the data movement will still be counted as a full cache line. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. - L2-Fabric Read BW: |- + L2-Fabric Read BW: >- The number of bytes read by the L2 over the Infinity Fabric\u2122 interface per unit time. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. 
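Many of the speed-of-light entries above note that a measured rate "is also presented as a percent of the peak theoretical" value for the accelerator. That presentation is a single normalization; a sketch with illustrative numbers only (real peaks are measured or looked up per accelerator, not the values shown here):

    def percent_of_peak(achieved: float, peak: float) -> float:
        # Both arguments in the same unit (e.g. GFLOP/s or GB/s).
        return 100.0 * achieved / peak

    valu_gflops = 1800.0   # example measured VALU rate
    peak_gflops = 24000.0  # example per-accelerator theoretical peak
    print(f"{valu_gflops:.0f} GFLOP/s "
          f"({percent_of_peak(valu_gflops, peak_gflops):.1f}% of peak)")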
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml index b13053c1f72..81ce3c2e684 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml @@ -162,15 +162,15 @@ Panel Config: Active CUs: Total number of active compute units (CUs) on the accelerator during the kernel execution. Num CUs: Total number of compute units (CUs) on the accelerator. - VGPR: |- + VGPR: >- The number of architected vector general-purpose registers allocated for the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. - SGPR: |- + SGPR: >- The number of scalar general-purpose registers allocated for the kernel, see SALU. Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. - LDS Allocation: |- + LDS Allocation: >- The number of bytes of LDS memory (or, shared memory) allocated for this kernel. Note: This may also be larger than what was requested at compile time due to both allocation granularity and dynamic per-dispatch LDS allocations. @@ -252,7 +252,7 @@ Panel Config: or data (atomic with return value) was returned to the L2. HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data from the accelerator's local HBM, per normalization unit. - HBM Wr: |- + HBM Wr: >- The total number of L2 requests to Infinity Fabric to write or atomically update 32B or 64B of data in the accelerator's local HBM, per normalization unit. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml index 6731ebfceb3..f32f4fa7d8d 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml @@ -140,17 +140,17 @@ Panel Config: * 512) ) / (SUM(End_Timestamp - Start_Timestamp) / 1e9) ) / 1e9 unit: GFLOP/s metrics_description: - VALU FLOPs (F16): |- + VALU FLOPs (F16): >- The total 16-bit floating-point operations executed per second on the VALU. This is presented with the value of the peak empirical F16 FLOPs achievable on the specific accelerator. Note: this does not include any F16 operations from MFMA instructions. - VALU FLOPs (F32): |- + VALU FLOPs (F32): >- The total 32-bit floating-point operations executed per second on the VALU. This is presented with the value of the peak empirical F32 FLOPs achievable on the specific accelerator. Note: this does not include any F32 operations from MFMA instructions. - VALU FLOPs (F64): |- + VALU FLOPs (F64): >- The total 64-bit floating-point operations executed per second on the VALU. This is presented with the value of the peak empirical F64 FLOPs achievable on the specific accelerator. Note: this does not include any F64 operations @@ -160,33 +160,33 @@ Panel Config: from VALU instructions. The peak empirically measured F8 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. It is supported on AMD Instinct MI300 series and later only. 
- MFMA FLOPs (BF16): |- + MFMA FLOPs (BF16): >- The total number of 16-bit brain floating point MFMA operations executed per second. Note: this does not include any 16-bit brain floating point operations from VALU instructions. The peak empirically measured BF16 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - MFMA FLOPs (F16): |- + MFMA FLOPs (F16): >- The total number of 16-bit floating point MFMA operations executed per second. Note: this does not include any 16-bit floating point operations from VALU instructions. The peak empirically measured F16 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - MFMA FLOPs (F32): |- + MFMA FLOPs (F32): >- The total number of 32-bit floating point MFMA operations executed per second. Note: this does not include any 32-bit floating point operations from VALU instructions. The peak empirically measured F32 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - MFMA FLOPs (F64): |- + MFMA FLOPs (F64): >- The total number of 64-bit floating point MFMA operations executed per second. Note: this does not include any 64-bit floating point operations from VALU instructions. The peak empirically measured F64 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - MFMA IOPs (Int8): |- + MFMA IOPs (Int8): >- The total number of 8-bit integer MFMA operations executed per second. Note: this does not include any 8-bit integer operations from VALU instructions. The peak empirically measured INT8 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - HBM Bandwidth: |- + HBM Bandwidth: >- The total number of bytes read from and written to High-Bandwidth Memory (HBM) per second. The peak empirically measured bandwidth achievable on the specific accelerator is displayed alongside for comparison. @@ -207,22 +207,22 @@ Panel Config: from, stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth example for more detail). The peak empirically measured LDS bandwidth achievable on the specific accelerator is displayed alongside for comparison. - AI L1: |- + AI L1: >- The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between the L1 cache and the processing units. This value is used as the x-coordinate for the L1 roofline. - AI L2: |- + AI L2: >- The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between the L2 cache and the L1 cache. This value is used as the x-coordinate for the L2 roofline. - AI HBM: |- + AI HBM: >- The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM). It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between HBM and the L2 cache. This value is used as the x-coordinate for the HBM roofline. - Performance (GFLOPs): |- + Performance (GFLOPs): >- The overall achieved performance, measured in GigaFLOPs per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point operations divided by the total execution time. 
This value is used as the y-coordinate diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0500_command_processor_cpc_cpf.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0500_command_processor_cpc_cpf.yaml index 118ce18331c..411c4c803bc 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0500_command_processor_cpc_cpf.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0500_command_processor_cpc_cpf.yaml @@ -141,6 +141,6 @@ Panel Config: the CPC-L2 interface was active doing any work. CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address translation - CPC-UTCL2 Utilization: |- + CPC-UTCL2 Utilization: >- Percent of total cycles counted by the CPC's L2 address translation interface where the CPC was busy doing address translation work. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0600_workgroup_manager_spi.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0600_workgroup_manager_spi.yaml index eb9845aa823..ca033cbdd7d 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0600_workgroup_manager_spi.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0600_workgroup_manager_spi.yaml @@ -168,7 +168,7 @@ Panel Config: in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck within the workgroup manager rather than a lack of a CU or SIMD with sufficient resources. - Not-scheduled Rate (Scheduler-Pipe): |- + Not-scheduled Rate (Scheduler-Pipe): >- The percent of total scheduler-pipe cycles in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck within the scheduler-pipes rather than a lack of a CU or SIMD with sufficient resources. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront.yaml index e9e9407cfc2..dfc968df651 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront.yaml @@ -121,26 +121,26 @@ Panel Config: Workgroup Size: The total number of work-items (or, threads) in each workgroup (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent to the total block size. - Total Wavefronts: |- + Total Wavefronts: >- The total number of wavefronts launched as part of the kernel dispatch. On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront size is always 64 work-items. Thus, the total number of wavefronts should be equivalent to the ceiling of grid size divided by 64. Saved Wavefronts: The total number of wavefronts saved at a context-save. Restored Wavefronts: The total number of wavefronts restored from a context-save. - VGPRs: |- + VGPRs: >- The number of architected vector general-purpose registers allocated for the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. - AGPRs: |- + AGPRs: >- The number of accumulation vector general-purpose registers allocated for the kernel, see AGPRs. Note: this may not exactly match the number of AGPRs requested by the compiler due to allocation granularity. 
- SGPRs: |- + SGPRs: >- The number of scalar general-purpose registers allocated for the kernel, see SALU. Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. - LDS Allocation: |- + LDS Allocation: >- The number of bytes of LDS memory (or, shared memory) allocated for this kernel. Note: This may also be larger than what was requested at compile time due to both allocation granularity and dynamic per-dispatch LDS allocations. @@ -173,7 +173,7 @@ Panel Config: rather than identification of a precise limiter. The sum of this metric, Issue Wait Cycles and Active Wait Cycles should be equal to the total Wave Cycles metric. - Wavefront Occupancy: |- + Wavefront Occupancy: >- The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1000_compute_units_instruction_mix.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1000_compute_units_instruction_mix.yaml index 768fe6548b8..823f543182f 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1000_compute_units_instruction_mix.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1000_compute_units_instruction_mix.yaml @@ -273,7 +273,7 @@ Panel Config: floating-point operands issued to the VALU per normalization unit. F64-Trans: The total number of transcendental instructions (such as sqrt) operating on 64-bit floating-point operands issued to the VALU per normalization unit. - Conversion: |- + Conversion: >- The total number of type conversion instructions (such as converting data to or from F32\u2194F64) issued to the VALU per normalization unit. Global/Generic Instr: The total number of global & generic memory instructions diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute_units_compute_pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute_units_compute_pipeline.yaml index 5e6ceb654f7..d9a4abced9d 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute_units_compute_pipeline.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute_units_compute_pipeline.yaml @@ -251,37 +251,37 @@ Panel Config: max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) unit: (OPs + $normUnit) metrics_description: - VALU FLOPs: |- + VALU FLOPs: >- The total floating-point operations executed per second on the VALU. This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this does not include any floating-point operations from MFMA instructions. - VALU IOPs: |- + VALU IOPs: >- The total integer operations executed per second on the VALU. This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations from MFMA instructions. - MFMA FLOPs (BF16): |- + MFMA FLOPs (BF16): >- The total number of 16-bit brain floating point MFMA operations executed per second. Note: this does not include any 16-bit brain floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical BF16 MFMA operations achievable on the specific accelerator. 
- MFMA FLOPs (F16): |- + MFMA FLOPs (F16): >- The total number of 16-bit floating point MFMA operations executed per second. Note: this does not include any 16-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F16 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F32): |- + MFMA FLOPs (F32): >- The total number of 32-bit floating point MFMA operations executed per second. Note: this does not include any 32-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F32 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F64): |- + MFMA FLOPs (F64): >- The total number of 64-bit floating point MFMA operations executed per second. Note: this does not include any 64-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F64 MFMA operations achievable on the specific accelerator. - MFMA IOPs (INT8): |- + MFMA IOPs (INT8): >- The total number of 8-bit integer MFMA operations executed per second. Note: this does not include any 8-bit integer operations from VALU instructions. This is also presented as a percent of the peak theoretical INT8 MFMA operations diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1200_local_data_share_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1200_local_data_share_lds.yaml index b7767fea168..8e602e9f835 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1200_local_data_share_lds.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1200_local_data_share_lds.yaml @@ -140,7 +140,7 @@ Panel Config: unit. Unaligned Stall: The total number of cycles spent in the LDS scheduler due to stalls from non-dword aligned addresses per normalization unit. - Mem Violations: |- + Mem Violations: >- The total number of out-of-bounds accesses made to the LDS, per normalization unit. This is unused and expected to be zero in most configurations for modern CDNA\u2122 accelerators. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1300_instruction_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1300_instruction_cache.yaml index 35808d9d960..ffc6e890b9b 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1300_instruction_cache.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1300_instruction_cache.yaml @@ -92,7 +92,7 @@ Panel Config: Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded line the cache. Calculated as the ratio of the number of L1I requests that hit over the number of all L1I requests. - L1I-L2 Bandwidth Utilization: |- + L1I-L2 Bandwidth Utilization: >- The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth achieved. Calculated as the ratio of the total number of requests from the L1I to the L2 cache over the total L1I-L2 interface cycles. 
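The Cache Hit Rate and bandwidth-utilization entries above are likewise plain ratios of counted events, hits over total requests in the hit-rate case. Guarding against idle counters is the only subtlety; hypothetical names again:

    def hit_rate_percent(hits: int, total_requests: int) -> float:
        # Guard against idle counters before forming the ratio.
        if total_requests == 0:
            return 0.0
        return 100.0 * hits / total_requests

    assert hit_rate_percent(930, 1000) == 93.0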
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1400_scalar_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1400_scalar_l1_data_cache.yaml index 6b731648480..be2ce0db795 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1400_scalar_l1_data_cache.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1400_scalar_l1_data_cache.yaml @@ -154,7 +154,7 @@ Panel Config: sL1D-L2 BW Utilization: The percentage of the peak theoretical sL1D - L2 interface bandwidth acheived. Calculated as total number of bytes read from, written to, or atomically updated across the sL1D - L2 interface. - sL1D-L2 BW: |- + sL1D-L2 BW: >- The total number of bytes read from, written to, or atomically updated across the sL1D\u2194L2 interface, divided by total duration. Note that sL1D writes and atomics are typically unused on current CDNA accelerators, so @@ -164,7 +164,7 @@ Panel Config: unit. Hits: The total number of sL1D requests that hit on a previously loaded cache line, per normalization unit. - Misses - Non Duplicated: |- + Misses - Non Duplicated: >- The total number of sL1D requests that missed on a cache line that was not already pending due to another request, per normalization unit. Misses- Duplicated: The total number of sL1D requests that missed on a cache line @@ -187,6 +187,6 @@ Panel Config: unit. Write Req: The total number of write requests from sL1D to the L2, per normalization unit. Typically unused on current CDNA accelerators. - Stall Cycles: |- + Stall Cycles: >- The total number of cycles the sL1D\u2194L2 interface was stalled, per normalization unit. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1600_vector_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1600_vector_l1_data_cache.yaml index 3125397a30b..9111e859219 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1600_vector_l1_data_cache.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1600_vector_l1_data_cache.yaml @@ -398,7 +398,7 @@ Panel Config: per normalization unit. Translation Misses: The total number of translation requests that missed in the UTCL1 due to translation not being present in the cache, per normalization unit. - Permission Misses: |- + Permission Misses: >- The total number of translation requests that missed in the UTCL1 due to a permission error, per normalization unit. This is unused and expected to be zero in most configurations for modern CDNA\u2122 accelerators. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml index 7943f891b1b..d0efd3e600b 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml @@ -227,12 +227,12 @@ Panel Config: pop: None coll_level: SQ_IFETCH_LEVEL metrics_description: - VALU FLOPs: |- + VALU FLOPs: >- The total floating-point operations executed per second on the VALU. This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. 
Note: this does not include any floating-point operations from MFMA instructions. - VALU IOPs: |- + VALU IOPs: >- The total integer operations executed per second on the VALU. This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations from @@ -242,27 +242,27 @@ Panel Config: from VALU instructions. This is also presented as a percent of the peak theoretical F8 MFMA operations achievable on the specific accelerator. It is supported on AMD Instinct MI300 series and later only. - MFMA FLOPs (BF16): |- + MFMA FLOPs (BF16): >- The total number of 16-bit brain floating point MFMA operations executed per second. Note: this does not include any 16-bit brain floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical BF16 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F16): |- + MFMA FLOPs (F16): >- The total number of 16-bit floating point MFMA operations executed per second. Note: this does not include any 16-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F16 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F32): |- + MFMA FLOPs (F32): >- The total number of 32-bit floating point MFMA operations executed per second. Note: this does not include any 32-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F32 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F64): |- + MFMA FLOPs (F64): >- The total number of 64-bit floating point MFMA operations executed per second. Note: this does not include any 64-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F64 MFMA operations achievable on the specific accelerator. - MFMA IOPs (Int8): |- + MFMA IOPs (Int8): >- The total number of 8-bit integer MFMA operations executed per second. Note: this does not include any 8-bit integer operations from VALU instructions. This is also presented as a percent of the peak theoretical INT8 MFMA operations @@ -295,7 +295,7 @@ Panel Config: IPC: The ratio of the total number of instructions executed on the CU over the total active CU cycles. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. - Wavefront Occupancy: |- + Wavefront Occupancy: >- The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). This is also presented as a percent of the peak theoretical @@ -328,7 +328,7 @@ Panel Config: if only a single value is requested in a cache line, the data movement will still be counted as a full cache line. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. - L2-Fabric Read BW: |- + L2-Fabric Read BW: >- The number of bytes read by the L2 over the Infinity Fabric\u2122 interface per unit time. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. 
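Because the same metrics_description entries recur across the per-architecture config files (gfx908, gfx90a, gfx940, gfx941), one way to confirm the `>-` conversion landed everywhere is to parse the YAML and check that no description still contains an embedded newline. This is a hypothetical helper, not part of the repository's test suite, and may need adjusting for the configs' exact layout:

    import glob
    import yaml

    def iter_descriptions(node):
        # Walk the parsed YAML and yield every metrics_description value,
        # wherever it is nested; the exact nesting is deliberately not assumed.
        if isinstance(node, dict):
            for key, value in node.items():
                if key == "metrics_description" and isinstance(value, dict):
                    yield from value.values()
                else:
                    yield from iter_descriptions(value)
        elif isinstance(node, list):
            for item in node:
                yield from iter_descriptions(item)

    def all_descriptions_single_line(pattern: str) -> bool:
        for path in glob.glob(pattern, recursive=True):
            with open(path) as f:
                tree = yaml.safe_load(f)
            if any("\n" in d for d in iter_descriptions(tree)
                   if isinstance(d, str)):
                return False
        return True

    # e.g. all_descriptions_single_line("**/analysis_configs/**/*.yaml")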
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml index b13053c1f72..81ce3c2e684 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml @@ -162,15 +162,15 @@ Panel Config: Active CUs: Total number of active compute units (CUs) on the accelerator during the kernel execution. Num CUs: Total number of compute units (CUs) on the accelerator. - VGPR: |- + VGPR: >- The number of architected vector general-purpose registers allocated for the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. - SGPR: |- + SGPR: >- The number of scalar general-purpose registers allocated for the kernel, see SALU. Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. - LDS Allocation: |- + LDS Allocation: >- The number of bytes of LDS memory (or, shared memory) allocated for this kernel. Note: This may also be larger than what was requested at compile time due to both allocation granularity and dynamic per-dispatch LDS allocations. @@ -252,7 +252,7 @@ Panel Config: or data (atomic with return value) was returned to the L2. HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data from the accelerator's local HBM, per normalization unit. - HBM Wr: |- + HBM Wr: >- The total number of L2 requests to Infinity Fabric to write or atomically update 32B or 64B of data in the accelerator's local HBM, per normalization unit. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml index 536938f7004..d24eaedeaed 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml @@ -140,17 +140,17 @@ Panel Config: * 512) ) / (SUM(End_Timestamp - Start_Timestamp) / 1e9) ) / 1e9 unit: GFLOP/s metrics_description: - VALU FLOPs (F16): |- + VALU FLOPs (F16): >- The total 16-bit floating-point operations executed per second on the VALU. This is presented with the value of the peak empirical F16 FLOPs achievable on the specific accelerator. Note: this does not include any F16 operations from MFMA instructions. - VALU FLOPs (F32): |- + VALU FLOPs (F32): >- The total 32-bit floating-point operations executed per second on the VALU. This is presented with the value of the peak empirical F32 FLOPs achievable on the specific accelerator. Note: this does not include any F32 operations from MFMA instructions. - VALU FLOPs (F64): |- + VALU FLOPs (F64): >- The total 64-bit floating-point operations executed per second on the VALU. This is presented with the value of the peak empirical F64 FLOPs achievable on the specific accelerator. Note: this does not include any F64 operations @@ -160,33 +160,33 @@ Panel Config: from VALU instructions. The peak empirically measured F8 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. It is supported on AMD Instinct MI300 series and later only. 
- MFMA FLOPs (BF16): |- + MFMA FLOPs (BF16): >- The total number of 16-bit brain floating point MFMA operations executed per second. Note: this does not include any 16-bit brain floating point operations from VALU instructions. The peak empirically measured BF16 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - MFMA FLOPs (F16): |- + MFMA FLOPs (F16): >- The total number of 16-bit floating point MFMA operations executed per second. Note: this does not include any 16-bit floating point operations from VALU instructions. The peak empirically measured F16 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - MFMA FLOPs (F32): |- + MFMA FLOPs (F32): >- The total number of 32-bit floating point MFMA operations executed per second. Note: this does not include any 32-bit floating point operations from VALU instructions. The peak empirically measured F32 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - MFMA FLOPs (F64): |- + MFMA FLOPs (F64): >- The total number of 64-bit floating point MFMA operations executed per second. Note: this does not include any 64-bit floating point operations from VALU instructions. The peak empirically measured F64 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - MFMA IOPs (Int8): |- + MFMA IOPs (Int8): >- The total number of 8-bit integer MFMA operations executed per second. Note: this does not include any 8-bit integer operations from VALU instructions. The peak empirically measured INT8 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - HBM Bandwidth: |- + HBM Bandwidth: >- The total number of bytes read from and written to High-Bandwidth Memory (HBM) per second. The peak empirically measured bandwidth achievable on the specific accelerator is displayed alongside for comparison. @@ -207,22 +207,22 @@ Panel Config: from, stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth example for more detail). The peak empirically measured LDS bandwidth achievable on the specific accelerator is displayed alongside for comparison. - AI L1: |- + AI L1: >- The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between the L1 cache and the processing units. This value is used as the x-coordinate for the L1 roofline. - AI L2: |- + AI L2: >- The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between the L2 cache and the L1 cache. This value is used as the x-coordinate for the L2 roofline. - AI HBM: |- + AI HBM: >- The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM). It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between HBM and the L2 cache. This value is used as the x-coordinate for the HBM roofline. - Performance (GFLOPs): |- + Performance (GFLOPs): >- The overall achieved performance, measured in GigaFLOPs per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point operations divided by the total execution time. 
This value is used as the y-coordinate diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0500_command_processor_cpc_cpf.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0500_command_processor_cpc_cpf.yaml index 118ce18331c..411c4c803bc 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0500_command_processor_cpc_cpf.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0500_command_processor_cpc_cpf.yaml @@ -141,6 +141,6 @@ Panel Config: the CPC-L2 interface was active doing any work. CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address translation - CPC-UTCL2 Utilization: |- + CPC-UTCL2 Utilization: >- Percent of total cycles counted by the CPC's L2 address translation interface where the CPC was busy doing address translation work. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0600_workgroup_manager_spi.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0600_workgroup_manager_spi.yaml index eb9845aa823..ca033cbdd7d 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0600_workgroup_manager_spi.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0600_workgroup_manager_spi.yaml @@ -168,7 +168,7 @@ Panel Config: in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck within the workgroup manager rather than a lack of a CU or SIMD with sufficient resources. - Not-scheduled Rate (Scheduler-Pipe): |- + Not-scheduled Rate (Scheduler-Pipe): >- The percent of total scheduler-pipe cycles in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck within the scheduler-pipes rather than a lack of a CU or SIMD with sufficient resources. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0700_wavefront.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0700_wavefront.yaml index e9e9407cfc2..dfc968df651 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0700_wavefront.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0700_wavefront.yaml @@ -121,26 +121,26 @@ Panel Config: Workgroup Size: The total number of work-items (or, threads) in each workgroup (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent to the total block size. - Total Wavefronts: |- + Total Wavefronts: >- The total number of wavefronts launched as part of the kernel dispatch. On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront size is always 64 work-items. Thus, the total number of wavefronts should be equivalent to the ceiling of grid size divided by 64. Saved Wavefronts: The total number of wavefronts saved at a context-save. Restored Wavefronts: The total number of wavefronts restored from a context-save. - VGPRs: |- + VGPRs: >- The number of architected vector general-purpose registers allocated for the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. - AGPRs: |- + AGPRs: >- The number of accumulation vector general-purpose registers allocated for the kernel, see AGPRs. Note: this may not exactly match the number of AGPRs requested by the compiler due to allocation granularity. 
- SGPRs: |- + SGPRs: >- The number of scalar general-purpose registers allocated for the kernel, see SALU. Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. - LDS Allocation: |- + LDS Allocation: >- The number of bytes of LDS memory (or, shared memory) allocated for this kernel. Note: This may also be larger than what was requested at compile time due to both allocation granularity and dynamic per-dispatch LDS allocations. @@ -173,7 +173,7 @@ Panel Config: rather than identification of a precise limiter. The sum of this metric, Issue Wait Cycles and Active Wait Cycles should be equal to the total Wave Cycles metric. - Wavefront Occupancy: |- + Wavefront Occupancy: >- The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1000_compute_units_instruction_mix.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1000_compute_units_instruction_mix.yaml index 768fe6548b8..823f543182f 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1000_compute_units_instruction_mix.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1000_compute_units_instruction_mix.yaml @@ -273,7 +273,7 @@ Panel Config: floating-point operands issued to the VALU per normalization unit. F64-Trans: The total number of transcendental instructions (such as sqrt) operating on 64-bit floating-point operands issued to the VALU per normalization unit. - Conversion: |- + Conversion: >- The total number of type conversion instructions (such as converting data to or from F32\u2194F64) issued to the VALU per normalization unit. Global/Generic Instr: The total number of global & generic memory instructions diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute_units_compute_pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute_units_compute_pipeline.yaml index 5e6ceb654f7..d9a4abced9d 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute_units_compute_pipeline.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute_units_compute_pipeline.yaml @@ -251,37 +251,37 @@ Panel Config: max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) unit: (OPs + $normUnit) metrics_description: - VALU FLOPs: |- + VALU FLOPs: >- The total floating-point operations executed per second on the VALU. This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this does not include any floating-point operations from MFMA instructions. - VALU IOPs: |- + VALU IOPs: >- The total integer operations executed per second on the VALU. This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations from MFMA instructions. - MFMA FLOPs (BF16): |- + MFMA FLOPs (BF16): >- The total number of 16-bit brain floating point MFMA operations executed per second. Note: this does not include any 16-bit brain floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical BF16 MFMA operations achievable on the specific accelerator. 
- MFMA FLOPs (F16): |- + MFMA FLOPs (F16): >- The total number of 16-bit floating point MFMA operations executed per second. Note: this does not include any 16-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F16 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F32): |- + MFMA FLOPs (F32): >- The total number of 32-bit floating point MFMA operations executed per second. Note: this does not include any 32-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F32 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F64): |- + MFMA FLOPs (F64): >- The total number of 64-bit floating point MFMA operations executed per second. Note: this does not include any 64-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F64 MFMA operations achievable on the specific accelerator. - MFMA IOPs (INT8): |- + MFMA IOPs (INT8): >- The total number of 8-bit integer MFMA operations executed per second. Note: this does not include any 8-bit integer operations from VALU instructions. This is also presented as a percent of the peak theoretical INT8 MFMA operations
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1200_local_data_share_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1200_local_data_share_lds.yaml
index b7767fea168..8e602e9f835 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1200_local_data_share_lds.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1200_local_data_share_lds.yaml
@@ -140,7 +140,7 @@ Panel Config: unit. Unaligned Stall: The total number of cycles spent in the LDS scheduler due to stalls from non-dword aligned addresses per normalization unit. - Mem Violations: |- + Mem Violations: >- The total number of out-of-bounds accesses made to the LDS, per normalization unit. This is unused and expected to be zero in most configurations for modern CDNA\u2122 accelerators.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1300_instruction_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1300_instruction_cache.yaml
index 35808d9d960..ffc6e890b9b 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1300_instruction_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1300_instruction_cache.yaml
@@ -92,7 +92,7 @@ Panel Config: Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded line in the cache. Calculated as the ratio of the number of L1I requests that hit over the number of all L1I requests. - L1I-L2 Bandwidth Utilization: |- + L1I-L2 Bandwidth Utilization: >- The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth achieved. Calculated as the ratio of the total number of requests from the L1I to the L2 cache over the total L1I-L2 interface cycles.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1400_scalar_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1400_scalar_l1_data_cache.yaml
index 6b731648480..be2ce0db795 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1400_scalar_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1400_scalar_l1_data_cache.yaml
@@ -154,7 +154,7 @@ Panel Config: sL1D-L2 BW Utilization: The percentage of the peak theoretical sL1D - L2 interface bandwidth achieved. Calculated as total number of bytes read from, written to, or atomically updated across the sL1D - L2 interface. - sL1D-L2 BW: |- + sL1D-L2 BW: >- The total number of bytes read from, written to, or atomically updated across the sL1D\u2194L2 interface, divided by total duration. Note that sL1D writes and atomics are typically unused on current CDNA accelerators, so
@@ -164,7 +164,7 @@ Panel Config: unit. Hits: The total number of sL1D requests that hit on a previously loaded cache line, per normalization unit. - Misses - Non Duplicated: |- + Misses - Non Duplicated: >- The total number of sL1D requests that missed on a cache line that was not already pending due to another request, per normalization unit. Misses- Duplicated: The total number of sL1D requests that missed on a cache line
@@ -187,6 +187,6 @@ Panel Config: unit. Write Req: The total number of write requests from sL1D to the L2, per normalization unit. Typically unused on current CDNA accelerators. - Stall Cycles: |- + Stall Cycles: >- The total number of cycles the sL1D\u2194L2 interface was stalled, per normalization unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1600_vector_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1600_vector_l1_data_cache.yaml
index 3125397a30b..9111e859219 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1600_vector_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1600_vector_l1_data_cache.yaml
@@ -398,7 +398,7 @@ Panel Config: per normalization unit. Translation Misses: The total number of translation requests that missed in the UTCL1 due to translation not being present in the cache, per normalization unit. - Permission Misses: |- + Permission Misses: >- The total number of translation requests that missed in the UTCL1 due to a permission error, per normalization unit. This is unused and expected to be zero in most configurations for modern CDNA\u2122 accelerators.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml
index 8aa72cb25df..5f76eb89372 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml
@@ -227,12 +227,12 @@ Panel Config: pop: None coll_level: SQ_IFETCH_LEVEL metrics_description: - VALU FLOPs: |- + VALU FLOPs: >- The total floating-point operations executed per second on the VALU. This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator.
Note: this does not include any floating-point operations from MFMA instructions. - VALU IOPs: |- + VALU IOPs: >- The total integer operations executed per second on the VALU. This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations from @@ -242,27 +242,27 @@ Panel Config: from VALU instructions. This is also presented as a percent of the peak theoretical F8 MFMA operations achievable on the specific accelerator. It is supported on AMD Instinct MI300 series and later only. - MFMA FLOPs (BF16): |- + MFMA FLOPs (BF16): >- The total number of 16-bit brain floating point MFMA operations executed per second. Note: this does not include any 16-bit brain floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical BF16 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F16): |- + MFMA FLOPs (F16): >- The total number of 16-bit floating point MFMA operations executed per second. Note: this does not include any 16-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F16 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F32): |- + MFMA FLOPs (F32): >- The total number of 32-bit floating point MFMA operations executed per second. Note: this does not include any 32-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F32 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F64): |- + MFMA FLOPs (F64): >- The total number of 64-bit floating point MFMA operations executed per second. Note: this does not include any 64-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F64 MFMA operations achievable on the specific accelerator. - MFMA IOPs (Int8): |- + MFMA IOPs (Int8): >- The total number of 8-bit integer MFMA operations executed per second. Note: this does not include any 8-bit integer operations from VALU instructions. This is also presented as a percent of the peak theoretical INT8 MFMA operations @@ -295,7 +295,7 @@ Panel Config: IPC: The ratio of the total number of instructions executed on the CU over the total active CU cycles. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. - Wavefront Occupancy: |- + Wavefront Occupancy: >- The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). This is also presented as a percent of the peak theoretical @@ -328,7 +328,7 @@ Panel Config: if only a single value is requested in a cache line, the data movement will still be counted as a full cache line. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. - L2-Fabric Read BW: |- + L2-Fabric Read BW: >- The number of bytes read by the L2 over the Infinity Fabric\u2122 interface per unit time. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. 
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml index b13053c1f72..81ce3c2e684 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml @@ -162,15 +162,15 @@ Panel Config: Active CUs: Total number of active compute units (CUs) on the accelerator during the kernel execution. Num CUs: Total number of compute units (CUs) on the accelerator. - VGPR: |- + VGPR: >- The number of architected vector general-purpose registers allocated for the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. - SGPR: |- + SGPR: >- The number of scalar general-purpose registers allocated for the kernel, see SALU. Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. - LDS Allocation: |- + LDS Allocation: >- The number of bytes of LDS memory (or, shared memory) allocated for this kernel. Note: This may also be larger than what was requested at compile time due to both allocation granularity and dynamic per-dispatch LDS allocations. @@ -252,7 +252,7 @@ Panel Config: or data (atomic with return value) was returned to the L2. HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data from the accelerator's local HBM, per normalization unit. - HBM Wr: |- + HBM Wr: >- The total number of L2 requests to Infinity Fabric to write or atomically update 32B or 64B of data in the accelerator's local HBM, per normalization unit. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml index fe6389ef3b9..6d3980f2230 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml @@ -140,17 +140,17 @@ Panel Config: * 512) ) / (SUM(End_Timestamp - Start_Timestamp) / 1e9) ) / 1e9 unit: GFLOP/s metrics_description: - VALU FLOPs (F16): |- + VALU FLOPs (F16): >- The total 16-bit floating-point operations executed per second on the VALU. This is presented with the value of the peak empirical F16 FLOPs achievable on the specific accelerator. Note: this does not include any F16 operations from MFMA instructions. - VALU FLOPs (F32): |- + VALU FLOPs (F32): >- The total 32-bit floating-point operations executed per second on the VALU. This is presented with the value of the peak empirical F32 FLOPs achievable on the specific accelerator. Note: this does not include any F32 operations from MFMA instructions. - VALU FLOPs (F64): |- + VALU FLOPs (F64): >- The total 64-bit floating-point operations executed per second on the VALU. This is presented with the value of the peak empirical F64 FLOPs achievable on the specific accelerator. Note: this does not include any F64 operations @@ -160,33 +160,33 @@ Panel Config: from VALU instructions. The peak empirically measured F8 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. It is supported on AMD Instinct MI300 series and later only. 
- MFMA FLOPs (BF16): |- + MFMA FLOPs (BF16): >- The total number of 16-bit brain floating point MFMA operations executed per second. Note: this does not include any 16-bit brain floating point operations from VALU instructions. The peak empirically measured BF16 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - MFMA FLOPs (F16): |- + MFMA FLOPs (F16): >- The total number of 16-bit floating point MFMA operations executed per second. Note: this does not include any 16-bit floating point operations from VALU instructions. The peak empirically measured F16 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - MFMA FLOPs (F32): |- + MFMA FLOPs (F32): >- The total number of 32-bit floating point MFMA operations executed per second. Note: this does not include any 32-bit floating point operations from VALU instructions. The peak empirically measured F32 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - MFMA FLOPs (F64): |- + MFMA FLOPs (F64): >- The total number of 64-bit floating point MFMA operations executed per second. Note: this does not include any 64-bit floating point operations from VALU instructions. The peak empirically measured F64 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - MFMA IOPs (Int8): |- + MFMA IOPs (Int8): >- The total number of 8-bit integer MFMA operations executed per second. Note: this does not include any 8-bit integer operations from VALU instructions. The peak empirically measured INT8 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - HBM Bandwidth: |- + HBM Bandwidth: >- The total number of bytes read from and written to High-Bandwidth Memory (HBM) per second. The peak empirically measured bandwidth achievable on the specific accelerator is displayed alongside for comparison. @@ -207,22 +207,22 @@ Panel Config: from, stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth example for more detail). The peak empirically measured LDS bandwidth achievable on the specific accelerator is displayed alongside for comparison. - AI L1: |- + AI L1: >- The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between the L1 cache and the processing units. This value is used as the x-coordinate for the L1 roofline. - AI L2: |- + AI L2: >- The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between the L2 cache and the L1 cache. This value is used as the x-coordinate for the L2 roofline. - AI HBM: |- + AI HBM: >- The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM). It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between HBM and the L2 cache. This value is used as the x-coordinate for the HBM roofline. - Performance (GFLOPs): |- + Performance (GFLOPs): >- The overall achieved performance, measured in GigaFLOPs per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point operations divided by the total execution time. 
This value is used as the y-coordinate diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0500_command_processor_cpc_cpf.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0500_command_processor_cpc_cpf.yaml index 118ce18331c..411c4c803bc 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0500_command_processor_cpc_cpf.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0500_command_processor_cpc_cpf.yaml @@ -141,6 +141,6 @@ Panel Config: the CPC-L2 interface was active doing any work. CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address translation - CPC-UTCL2 Utilization: |- + CPC-UTCL2 Utilization: >- Percent of total cycles counted by the CPC's L2 address translation interface where the CPC was busy doing address translation work. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0600_workgroup_manager_spi.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0600_workgroup_manager_spi.yaml index eb9845aa823..ca033cbdd7d 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0600_workgroup_manager_spi.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0600_workgroup_manager_spi.yaml @@ -168,7 +168,7 @@ Panel Config: in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck within the workgroup manager rather than a lack of a CU or SIMD with sufficient resources. - Not-scheduled Rate (Scheduler-Pipe): |- + Not-scheduled Rate (Scheduler-Pipe): >- The percent of total scheduler-pipe cycles in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck within the scheduler-pipes rather than a lack of a CU or SIMD with sufficient resources. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0700_wavefront.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0700_wavefront.yaml index e9e9407cfc2..dfc968df651 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0700_wavefront.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0700_wavefront.yaml @@ -121,26 +121,26 @@ Panel Config: Workgroup Size: The total number of work-items (or, threads) in each workgroup (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent to the total block size. - Total Wavefronts: |- + Total Wavefronts: >- The total number of wavefronts launched as part of the kernel dispatch. On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront size is always 64 work-items. Thus, the total number of wavefronts should be equivalent to the ceiling of grid size divided by 64. Saved Wavefronts: The total number of wavefronts saved at a context-save. Restored Wavefronts: The total number of wavefronts restored from a context-save. - VGPRs: |- + VGPRs: >- The number of architected vector general-purpose registers allocated for the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. - AGPRs: |- + AGPRs: >- The number of accumulation vector general-purpose registers allocated for the kernel, see AGPRs. Note: this may not exactly match the number of AGPRs requested by the compiler due to allocation granularity. 
- SGPRs: |- + SGPRs: >- The number of scalar general-purpose registers allocated for the kernel, see SALU. Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. - LDS Allocation: |- + LDS Allocation: >- The number of bytes of LDS memory (or, shared memory) allocated for this kernel. Note: This may also be larger than what was requested at compile time due to both allocation granularity and dynamic per-dispatch LDS allocations. @@ -173,7 +173,7 @@ Panel Config: rather than identification of a precise limiter. The sum of this metric, Issue Wait Cycles and Active Wait Cycles should be equal to the total Wave Cycles metric. - Wavefront Occupancy: |- + Wavefront Occupancy: >- The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1000_compute_units_instruction_mix.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1000_compute_units_instruction_mix.yaml index 768fe6548b8..823f543182f 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1000_compute_units_instruction_mix.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1000_compute_units_instruction_mix.yaml @@ -273,7 +273,7 @@ Panel Config: floating-point operands issued to the VALU per normalization unit. F64-Trans: The total number of transcendental instructions (such as sqrt) operating on 64-bit floating-point operands issued to the VALU per normalization unit. - Conversion: |- + Conversion: >- The total number of type conversion instructions (such as converting data to or from F32\u2194F64) issued to the VALU per normalization unit. Global/Generic Instr: The total number of global & generic memory instructions diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute_units_compute_pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute_units_compute_pipeline.yaml index 5e6ceb654f7..d9a4abced9d 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute_units_compute_pipeline.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute_units_compute_pipeline.yaml @@ -251,37 +251,37 @@ Panel Config: max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) unit: (OPs + $normUnit) metrics_description: - VALU FLOPs: |- + VALU FLOPs: >- The total floating-point operations executed per second on the VALU. This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this does not include any floating-point operations from MFMA instructions. - VALU IOPs: |- + VALU IOPs: >- The total integer operations executed per second on the VALU. This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations from MFMA instructions. - MFMA FLOPs (BF16): |- + MFMA FLOPs (BF16): >- The total number of 16-bit brain floating point MFMA operations executed per second. Note: this does not include any 16-bit brain floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical BF16 MFMA operations achievable on the specific accelerator. 
- MFMA FLOPs (F16): |- + MFMA FLOPs (F16): >- The total number of 16-bit floating point MFMA operations executed per second. Note: this does not include any 16-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F16 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F32): |- + MFMA FLOPs (F32): >- The total number of 32-bit floating point MFMA operations executed per second. Note: this does not include any 32-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F32 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F64): |- + MFMA FLOPs (F64): >- The total number of 64-bit floating point MFMA operations executed per second. Note: this does not include any 64-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F64 MFMA operations achievable on the specific accelerator. - MFMA IOPs (INT8): |- + MFMA IOPs (INT8): >- The total number of 8-bit integer MFMA operations executed per second. Note: this does not include any 8-bit integer operations from VALU instructions. This is also presented as a percent of the peak theoretical INT8 MFMA operations
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1200_local_data_share_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1200_local_data_share_lds.yaml
index b7767fea168..8e602e9f835 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1200_local_data_share_lds.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1200_local_data_share_lds.yaml
@@ -140,7 +140,7 @@ Panel Config: unit. Unaligned Stall: The total number of cycles spent in the LDS scheduler due to stalls from non-dword aligned addresses per normalization unit. - Mem Violations: |- + Mem Violations: >- The total number of out-of-bounds accesses made to the LDS, per normalization unit. This is unused and expected to be zero in most configurations for modern CDNA\u2122 accelerators.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1300_instruction_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1300_instruction_cache.yaml
index 35808d9d960..ffc6e890b9b 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1300_instruction_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1300_instruction_cache.yaml
@@ -92,7 +92,7 @@ Panel Config: Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded line in the cache. Calculated as the ratio of the number of L1I requests that hit over the number of all L1I requests. - L1I-L2 Bandwidth Utilization: |- + L1I-L2 Bandwidth Utilization: >- The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth achieved. Calculated as the ratio of the total number of requests from the L1I to the L2 cache over the total L1I-L2 interface cycles.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1400_scalar_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1400_scalar_l1_data_cache.yaml
index 6b731648480..be2ce0db795 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1400_scalar_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1400_scalar_l1_data_cache.yaml
@@ -154,7 +154,7 @@ Panel Config: sL1D-L2 BW Utilization: The percentage of the peak theoretical sL1D - L2 interface bandwidth achieved. Calculated as total number of bytes read from, written to, or atomically updated across the sL1D - L2 interface. - sL1D-L2 BW: |- + sL1D-L2 BW: >- The total number of bytes read from, written to, or atomically updated across the sL1D\u2194L2 interface, divided by total duration. Note that sL1D writes and atomics are typically unused on current CDNA accelerators, so
@@ -164,7 +164,7 @@ Panel Config: unit. Hits: The total number of sL1D requests that hit on a previously loaded cache line, per normalization unit. - Misses - Non Duplicated: |- + Misses - Non Duplicated: >- The total number of sL1D requests that missed on a cache line that was not already pending due to another request, per normalization unit. Misses- Duplicated: The total number of sL1D requests that missed on a cache line
@@ -187,6 +187,6 @@ Panel Config: unit. Write Req: The total number of write requests from sL1D to the L2, per normalization unit. Typically unused on current CDNA accelerators. - Stall Cycles: |- + Stall Cycles: >- The total number of cycles the sL1D\u2194L2 interface was stalled, per normalization unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1600_vector_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1600_vector_l1_data_cache.yaml
index 3125397a30b..9111e859219 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1600_vector_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1600_vector_l1_data_cache.yaml
@@ -398,7 +398,7 @@ Panel Config: per normalization unit. Translation Misses: The total number of translation requests that missed in the UTCL1 due to translation not being present in the cache, per normalization unit. - Permission Misses: |- + Permission Misses: >- The total number of translation requests that missed in the UTCL1 due to a permission error, per normalization unit. This is unused and expected to be zero in most configurations for modern CDNA\u2122 accelerators.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml
index bdbd62f755e..225c5cf1494 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml
@@ -233,12 +233,12 @@ Panel Config: pop: None coll_level: SQ_IFETCH_LEVEL metrics_description: - VALU FLOPs: |- + VALU FLOPs: >- The total floating-point operations executed per second on the VALU. This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator.
Note: this does not include any floating-point operations from MFMA instructions. - VALU IOPs: |- + VALU IOPs: >- The total integer operations executed per second on the VALU. This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations from @@ -248,27 +248,27 @@ Panel Config: from VALU instructions. This is also presented as a percent of the peak theoretical F8 MFMA operations achievable on the specific accelerator. It is supported on AMD Instinct MI300 series and later only. - MFMA FLOPs (BF16): |- + MFMA FLOPs (BF16): >- The total number of 16-bit brain floating point MFMA operations executed per second. Note: this does not include any 16-bit brain floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical BF16 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F16): |- + MFMA FLOPs (F16): >- The total number of 16-bit floating point MFMA operations executed per second. Note: this does not include any 16-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F16 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F32): |- + MFMA FLOPs (F32): >- The total number of 32-bit floating point MFMA operations executed per second. Note: this does not include any 32-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F32 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F64): |- + MFMA FLOPs (F64): >- The total number of 64-bit floating point MFMA operations executed per second. Note: this does not include any 64-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F64 MFMA operations achievable on the specific accelerator. - MFMA IOPs (Int8): |- + MFMA IOPs (Int8): >- The total number of 8-bit integer MFMA operations executed per second. Note: this does not include any 8-bit integer operations from VALU instructions. This is also presented as a percent of the peak theoretical INT8 MFMA operations @@ -301,7 +301,7 @@ Panel Config: IPC: The ratio of the total number of instructions executed on the CU over the total active CU cycles. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. - Wavefront Occupancy: |- + Wavefront Occupancy: >- The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). This is also presented as a percent of the peak theoretical @@ -334,7 +334,7 @@ Panel Config: if only a single value is requested in a cache line, the data movement will still be counted as a full cache line. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. - L2-Fabric Read BW: |- + L2-Fabric Read BW: >- The number of bytes read by the L2 over the Infinity Fabric\u2122 interface per unit time. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. 
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml index 081d5654dfa..9d3e26ebb20 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml @@ -172,15 +172,15 @@ Panel Config: Active CUs: Total number of active compute units (CUs) on the accelerator during the kernel execution. Num CUs: Total number of compute units (CUs) on the accelerator. - VGPR: |- + VGPR: >- The number of architected vector general-purpose registers allocated for the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. - SGPR: |- + SGPR: >- The number of scalar general-purpose registers allocated for the kernel, see SALU. Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. - LDS Allocation: |- + LDS Allocation: >- The number of bytes of LDS memory (or, shared memory) allocated for this kernel. Note: This may also be larger than what was requested at compile time due to both allocation granularity and dynamic per-dispatch LDS allocations. @@ -268,7 +268,7 @@ Panel Config: or data (atomic with return value) was returned to the L2. HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data from the accelerator's local HBM, per normalization unit. - HBM Wr: |- + HBM Wr: >- The total number of L2 requests to Infinity Fabric to write or atomically update 32B or 64B of data in the accelerator's local HBM, per normalization unit. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml index 0a4b3be6939..83240870e8e 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml @@ -148,17 +148,17 @@ Panel Config: Start_Timestamp) / 1e9) ) / 1e9 unit: GFLOP/s metrics_description: - VALU FLOPs (F16): |- + VALU FLOPs (F16): >- The total 16-bit floating-point operations executed per second on the VALU. This is presented with the value of the peak empirical F16 FLOPs achievable on the specific accelerator. Note: this does not include any F16 operations from MFMA instructions. - VALU FLOPs (F32): |- + VALU FLOPs (F32): >- The total 32-bit floating-point operations executed per second on the VALU. This is presented with the value of the peak empirical F32 FLOPs achievable on the specific accelerator. Note: this does not include any F32 operations from MFMA instructions. - VALU FLOPs (F64): |- + VALU FLOPs (F64): >- The total 64-bit floating-point operations executed per second on the VALU. This is presented with the value of the peak empirical F64 FLOPs achievable on the specific accelerator. Note: this does not include any F64 operations @@ -168,39 +168,39 @@ Panel Config: from VALU instructions. The peak empirically measured F8 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. It is supported on AMD Instinct MI300 series and later only. 
- MFMA FLOPs (BF16): |- + MFMA FLOPs (BF16): >- The total number of 16-bit brain floating point MFMA operations executed per second. Note: this does not include any 16-bit brain floating point operations from VALU instructions. The peak empirically measured BF16 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - MFMA FLOPs (F16): |- + MFMA FLOPs (F16): >- The total number of 16-bit floating point MFMA operations executed per second. Note: this does not include any 16-bit floating point operations from VALU instructions. The peak empirically measured F16 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - MFMA FLOPs (F32): |- + MFMA FLOPs (F32): >- The total number of 32-bit floating point MFMA operations executed per second. Note: this does not include any 32-bit floating point operations from VALU instructions. The peak empirically measured F32 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - MFMA FLOPs (F64): |- + MFMA FLOPs (F64): >- The total number of 64-bit floating point MFMA operations executed per second. Note: this does not include any 64-bit floating point operations from VALU instructions. The peak empirically measured F64 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - MFMA FLOPs (F6F4): |- + MFMA FLOPs (F6F4): >- The total number of 4-bit and 6-bit floating point MFMA operations executed per second. Note: this does not include any floating point operations from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. It is supported on AMD Instinct MI350 series (gfx950) and later only. - MFMA IOPs (Int8): |- + MFMA IOPs (Int8): >- The total number of 8-bit integer MFMA operations executed per second. Note: this does not include any 8-bit integer operations from VALU instructions. The peak empirically measured INT8 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. - HBM Bandwidth: |- + HBM Bandwidth: >- The total number of bytes read from and written to High-Bandwidth Memory (HBM) per second. The peak empirically measured bandwidth achievable on the specific accelerator is displayed alongside for comparison. @@ -221,22 +221,22 @@ Panel Config: from, stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth example for more detail). The peak empirically measured LDS bandwidth achievable on the specific accelerator is displayed alongside for comparison. - AI L1: |- + AI L1: >- The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between the L1 cache and the processing units. This value is used as the x-coordinate for the L1 roofline. - AI L2: |- + AI L2: >- The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between the L2 cache and the L1 cache. This value is used as the x-coordinate for the L2 roofline. - AI HBM: |- + AI HBM: >- The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM). It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between HBM and the L2 cache. This value is used as the x-coordinate for the HBM roofline. 
- Performance (GFLOPs): |- + Performance (GFLOPs): >- The overall achieved performance, measured in GigaFLOPs per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point operations divided by the total execution time. This value is used as the y-coordinate diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0500_command_processor_cpc_cpf.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0500_command_processor_cpc_cpf.yaml index 58699ebb188..c4b9f8ad901 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0500_command_processor_cpc_cpf.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0500_command_processor_cpc_cpf.yaml @@ -162,6 +162,6 @@ Panel Config: the CPC-L2 interface was active doing any work. CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address translation - CPC-UTCL2 Utilization: |- + CPC-UTCL2 Utilization: >- Percent of total cycles counted by the CPC's L2 address translation interface where the CPC was busy doing address translation work. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0600_workgroup_manager_spi.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0600_workgroup_manager_spi.yaml index 02ed4b3d9ca..7b2b0176749 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0600_workgroup_manager_spi.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0600_workgroup_manager_spi.yaml @@ -204,7 +204,7 @@ Panel Config: in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck within the workgroup manager rather than a lack of a CU or SIMD with sufficient resources. - Not-scheduled Rate (Scheduler-Pipe): |- + Not-scheduled Rate (Scheduler-Pipe): >- The percent of total scheduler-pipe cycles in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck within the scheduler-pipes rather than a lack of a CU or SIMD with sufficient resources. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront.yaml index bd6ca386421..3ac9d30c793 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront.yaml @@ -121,26 +121,26 @@ Panel Config: Workgroup Size: The total number of work-items (or, threads) in each workgroup (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent to the total block size. - Total Wavefronts: |- + Total Wavefronts: >- The total number of wavefronts launched as part of the kernel dispatch. On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront size is always 64 work-items. Thus, the total number of wavefronts should be equivalent to the ceiling of grid size divided by 64. Saved Wavefronts: The total number of wavefronts saved at a context-save. Restored Wavefronts: The total number of wavefronts restored from a context-save. - VGPRs: |- + VGPRs: >- The number of architected vector general-purpose registers allocated for the kernel, see VALU. 
Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. - AGPRs: |- + AGPRs: >- The number of accumulation vector general-purpose registers allocated for the kernel, see AGPRs. Note: this may not exactly match the number of AGPRs requested by the compiler due to allocation granularity. - SGPRs: |- + SGPRs: >- The number of scalar general-purpose registers allocated for the kernel, see SALU. Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. - LDS Allocation: |- + LDS Allocation: >- The number of bytes of LDS memory (or, shared memory) allocated for this kernel. Note: This may also be larger than what was requested at compile time due to both allocation granularity and dynamic per-dispatch LDS allocations. @@ -173,7 +173,7 @@ Panel Config: rather than identification of a precise limiter. The sum of this metric, Issue Wait Cycles and Active Wait Cycles should be equal to the total Wave Cycles metric. - Wavefront Occupancy: |- + Wavefront Occupancy: >- The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute_units_instruction_mix.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute_units_instruction_mix.yaml index 551dad2bc2d..fc1efd22245 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute_units_instruction_mix.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute_units_instruction_mix.yaml @@ -283,7 +283,7 @@ Panel Config: floating-point operands issued to the VALU per normalization unit. F64-Trans: The total number of transcendental instructions (such as sqrt) operating on 64-bit floating-point operands issued to the VALU per normalization unit. - Conversion: |- + Conversion: >- The total number of type conversion instructions (such as converting data to or from F32\u2194F64) issued to the VALU per normalization unit. Global/Generic Instr: The total number of global & generic memory instructions diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1100_compute_units_compute_pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1100_compute_units_compute_pipeline.yaml index dc361646674..3fd0dfd5681 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1100_compute_units_compute_pipeline.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1100_compute_units_compute_pipeline.yaml @@ -267,37 +267,37 @@ Panel Config: max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) unit: (OPs + $normUnit) metrics_description: - VALU FLOPs: |- + VALU FLOPs: >- The total floating-point operations executed per second on the VALU. This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this does not include any floating-point operations from MFMA instructions. - VALU IOPs: |- + VALU IOPs: >- The total integer operations executed per second on the VALU. This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations from MFMA instructions. 
- MFMA FLOPs (BF16): |- + MFMA FLOPs (BF16): >- The total number of 16-bit brain floating point MFMA operations executed per second. Note: this does not include any 16-bit brain floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical BF16 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F16): |- + MFMA FLOPs (F16): >- The total number of 16-bit floating point MFMA operations executed per second. Note: this does not include any 16-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F16 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F32): |- + MFMA FLOPs (F32): >- The total number of 32-bit floating point MFMA operations executed per second. Note: this does not include any 32-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F32 MFMA operations achievable on the specific accelerator. - MFMA FLOPs (F64): |- + MFMA FLOPs (F64): >- The total number of 64-bit floating point MFMA operations executed per second. Note: this does not include any 64-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F64 MFMA operations achievable on the specific accelerator. - MFMA IOPs (INT8): |- + MFMA IOPs (INT8): >- The total number of 8-bit integer MFMA operations executed per second. Note: this does not include any 8-bit integer operations from VALU instructions. This is also presented as a percent of the peak theoretical INT8 MFMA operations diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1200_local_data_share_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1200_local_data_share_lds.yaml index 7b839fc1f72..f186a85d0c2 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1200_local_data_share_lds.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1200_local_data_share_lds.yaml @@ -180,7 +180,7 @@ Panel Config: unit. Unaligned Stall: The total number of cycles spent in the LDS scheduler due to stalls from non-dword aligned addresses per normalization unit. - Mem Violations: |- + Mem Violations: >- The total number of out-of-bounds accesses made to the LDS, per normalization unit. This is unused and expected to be zero in most configurations for modern CDNA\u2122 accelerators. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1300_instruction_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1300_instruction_cache.yaml index 35808d9d960..ffc6e890b9b 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1300_instruction_cache.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1300_instruction_cache.yaml @@ -92,7 +92,7 @@ Panel Config: Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded line in the cache. Calculated as the ratio of the number of L1I requests that hit over the number of all L1I requests. - L1I-L2 Bandwidth Utilization: |- + L1I-L2 Bandwidth Utilization: >- The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth achieved. Calculated as the ratio of the total number of requests from the L1I to the L2 cache over the total L1I-L2 interface cycles. 
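[Editor's note: the hunks above and below all make the same mechanical change, switching YAML metric descriptions from literal block scalars (`|-`) to folded block scalars (`>-`). The sketch below is a minimal, non-authoritative illustration of the difference, assuming PyYAML (which this tooling already uses); the `desc` key is a hypothetical stand-in for any metric-description field. The `str_representer` shown mirrors the `tools/config_management/utils.py` hunk later in this series.]

```python
import yaml  # PyYAML assumed, as in the config-management tooling

# Literal (|-) keeps embedded newlines; folded (>-) joins wrapped lines
# with spaces, so a long description loads as a single paragraph.
literal = yaml.safe_load("desc: |-\n  line one\n  line two\n")
folded = yaml.safe_load("desc: >-\n  line one\n  line two\n")
print(repr(literal["desc"]))  # 'line one\nline two'
print(repr(folded["desc"]))   # 'line one line two'

# Matching dumper change: emit multi-line strings in folded (>) style
# rather than literal (|), per the utils.py hunk in this series.
def str_representer(dumper, data):
    if "\n" in data:
        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=">")
    return dumper.represent_scalar("tag:yaml.org,2002:str", data)

yaml.add_representer(str, str_representer)
print(yaml.dump({"desc": "first half of a description\nsecond half"}))
```

Folded style keeps the loaded descriptions free of hard line breaks, which is presumably why wrapped prose in these analysis configs is meant to render as one flowing paragraph rather than preserving source-file wrapping.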
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1400_scalar_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1400_scalar_l1_data_cache.yaml index 6b731648480..be2ce0db795 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1400_scalar_l1_data_cache.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1400_scalar_l1_data_cache.yaml @@ -154,7 +154,7 @@ Panel Config: sL1D-L2 BW Utilization: The percentage of the peak theoretical sL1D - L2 interface bandwidth achieved. Calculated as the total number of bytes read from, written to, or atomically updated across the sL1D - L2 interface. - sL1D-L2 BW: |- + sL1D-L2 BW: >- The total number of bytes read from, written to, or atomically updated across the sL1D\u2194L2 interface, divided by total duration. Note that sL1D writes and atomics are typically unused on current CDNA accelerators, so @@ -164,7 +164,7 @@ Panel Config: unit. Hits: The total number of sL1D requests that hit on a previously loaded cache line, per normalization unit. - Misses - Non Duplicated: |- + Misses - Non Duplicated: >- The total number of sL1D requests that missed on a cache line that was not already pending due to another request, per normalization unit. Misses- Duplicated: The total number of sL1D requests that missed on a cache line @@ -187,6 +187,6 @@ Panel Config: unit. Write Req: The total number of write requests from sL1D to the L2, per normalization unit. Typically unused on current CDNA accelerators. - Stall Cycles: |- + Stall Cycles: >- The total number of cycles the sL1D\u2194L2 interface was stalled, per normalization unit. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1600_vector_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1600_vector_l1_data_cache.yaml index 48408d16d7b..0589588fbdd 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1600_vector_l1_data_cache.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1600_vector_l1_data_cache.yaml @@ -501,7 +501,7 @@ Panel Config: per normalization unit. Translation Misses: The total number of translation requests that missed in the UTCL1 due to translation not being present in the cache, per normalization unit. - Permission Misses: |- + Permission Misses: >- The total number of translation requests that missed in the UTCL1 due to a permission error, per normalization unit. This is unused and expected to be zero in most configurations for modern CDNA\u2122 accelerators. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml index 40cbd3856fd..d4fde58501a 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml @@ -706,7 +706,7 @@ Panel Config: requests are only considered atomic by Infinity Fabric if they are targeted at non-write-cacheable memory, such as fine-grained memory allocations or uncached memory allocations on the MI2XX. 
- Read Stall: |- + Read Stall: >- The ratio of the total number of cycles the L2-Fabric interface was stalled on a read request to any destination (local HBM, remote PCIe\xAE connected accelerator or CPU, or remote Infinity Fabric connected accelerator diff --git a/projects/rocprofiler-compute/tools/autogen_hash.yaml b/projects/rocprofiler-compute/tools/autogen_hash.yaml index 0b3955ff1ac..e25b0bb4f98 100644 --- a/projects/rocprofiler-compute/tools/autogen_hash.yaml +++ b/projects/rocprofiler-compute/tools/autogen_hash.yaml @@ -1,116 +1,2 @@ -# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py -src/rocprof_compute_soc/analysis_configs/gfx908/0000_top_stats.yaml: ad7818c680acb0d4e3cb624e0f14f79d44fa7efe14531c5643f47ac96266c91d -src/rocprof_compute_soc/analysis_configs/gfx90a/0000_top_stats.yaml: ad7818c680acb0d4e3cb624e0f14f79d44fa7efe14531c5643f47ac96266c91d -src/rocprof_compute_soc/analysis_configs/gfx940/0000_top_stats.yaml: ad7818c680acb0d4e3cb624e0f14f79d44fa7efe14531c5643f47ac96266c91d -src/rocprof_compute_soc/analysis_configs/gfx941/0000_top_stats.yaml: ad7818c680acb0d4e3cb624e0f14f79d44fa7efe14531c5643f47ac96266c91d -src/rocprof_compute_soc/analysis_configs/gfx942/0000_top_stats.yaml: ad7818c680acb0d4e3cb624e0f14f79d44fa7efe14531c5643f47ac96266c91d -src/rocprof_compute_soc/analysis_configs/gfx950/0000_top_stats.yaml: ad7818c680acb0d4e3cb624e0f14f79d44fa7efe14531c5643f47ac96266c91d -src/rocprof_compute_soc/analysis_configs/gfx908/0100_system_info.yaml: d95eea137c439cc2aa4ac5273f06ac6a05037a74550bc23a095162ee366d39cb -src/rocprof_compute_soc/analysis_configs/gfx90a/0100_system_info.yaml: d95eea137c439cc2aa4ac5273f06ac6a05037a74550bc23a095162ee366d39cb -src/rocprof_compute_soc/analysis_configs/gfx940/0100_system_info.yaml: d95eea137c439cc2aa4ac5273f06ac6a05037a74550bc23a095162ee366d39cb -src/rocprof_compute_soc/analysis_configs/gfx941/0100_system_info.yaml: d95eea137c439cc2aa4ac5273f06ac6a05037a74550bc23a095162ee366d39cb -src/rocprof_compute_soc/analysis_configs/gfx942/0100_system_info.yaml: d95eea137c439cc2aa4ac5273f06ac6a05037a74550bc23a095162ee366d39cb -src/rocprof_compute_soc/analysis_configs/gfx950/0100_system_info.yaml: d95eea137c439cc2aa4ac5273f06ac6a05037a74550bc23a095162ee366d39cb -src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml: aa60b7a75e46196195675a1c8d6aa65211483ace8dfe346ed0228056586bc8a5 -src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml: 54d0ef58f8222463516984d3b9153806f5185de9e719d1903537af4c8344a4f4 -src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml: a6a5d78d76eb39471249c4c55ccea2e8084a5136c01d29aaeb87d308cce05d2e -src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml: 352d4702fbebd8550883b777b875893a8404a7909d83c74cdd50c1b713452c81 -src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml: a6a5d78d76eb39471249c4c55ccea2e8084a5136c01d29aaeb87d308cce05d2e -src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml: 1a164dfbb551e4b0a8a55a843d776738d90406cdbe2930e0f474b77a075a7353 -src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml: ff5fd164694f454a95ccd52c8c0bfa20aebfa476908cab2ac03215fb33e48598 -src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml: 332c1965f462e75a479ddf3270294e1cf723701eb08b60c6cea550eb3bc192e7 
-src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml: 92dc15222a707fff79ce2084172ae2068465bfe064b89538ca7e83359422dfc8 -src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml: 92dc15222a707fff79ce2084172ae2068465bfe064b89538ca7e83359422dfc8 -src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml: 92dc15222a707fff79ce2084172ae2068465bfe064b89538ca7e83359422dfc8 -src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml: d3a2e085061068ff8cff0b80f6944dc866ec3e748cf1e4c0cfcd76e1e14d21f8 -src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml: e91988af6d99a03e2a19593155447f79abe64dc128a83a170a5037ab466b238c -src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml: 0807c87d20faed19f2ef9470e9277715f2287e687aa831a328dcab4915a38812 -src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml: f5f35d1ae9a35fe83bcdf572aa788401c14cc6718761c4cf8e4dddcf249c3548 -src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml: 760ecef9947fa31d3a0fb5c45d653060d06213d8d9f216c19cbb1b1ce29942b6 -src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml: e037ce1a2cf8ba08e2317e322b56954caace6ec2427a966acbabf2135cd89855 -src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml: f53b2a92b3ac051290eff9b1f63343c30e6cd223b9cbf9d30a93ef4a5ff158b3 -src/rocprof_compute_soc/analysis_configs/gfx908/0500_command_processor_cpc_cpf.yaml: 649bec27b9ccee34c96520c1f6bc0977779a8c4f8a58ee21ff59d61207962966 -src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command_processor_cpc_cpf.yaml: 649bec27b9ccee34c96520c1f6bc0977779a8c4f8a58ee21ff59d61207962966 -src/rocprof_compute_soc/analysis_configs/gfx940/0500_command_processor_cpc_cpf.yaml: 649bec27b9ccee34c96520c1f6bc0977779a8c4f8a58ee21ff59d61207962966 -src/rocprof_compute_soc/analysis_configs/gfx941/0500_command_processor_cpc_cpf.yaml: 649bec27b9ccee34c96520c1f6bc0977779a8c4f8a58ee21ff59d61207962966 -src/rocprof_compute_soc/analysis_configs/gfx942/0500_command_processor_cpc_cpf.yaml: 649bec27b9ccee34c96520c1f6bc0977779a8c4f8a58ee21ff59d61207962966 -src/rocprof_compute_soc/analysis_configs/gfx950/0500_command_processor_cpc_cpf.yaml: 1e4c1bc1158398df8966d24e56b7d434458ce10ade9e13f168887d9a0d9abaef -src/rocprof_compute_soc/analysis_configs/gfx908/0600_workgroup_manager_spi.yaml: 2bcb7045609e8ff023c9bfa384e63f6a2cc926ff3261f3eab6737f89a899809e -src/rocprof_compute_soc/analysis_configs/gfx90a/0600_workgroup_manager_spi.yaml: 2bcb7045609e8ff023c9bfa384e63f6a2cc926ff3261f3eab6737f89a899809e -src/rocprof_compute_soc/analysis_configs/gfx940/0600_workgroup_manager_spi.yaml: 2bcb7045609e8ff023c9bfa384e63f6a2cc926ff3261f3eab6737f89a899809e -src/rocprof_compute_soc/analysis_configs/gfx941/0600_workgroup_manager_spi.yaml: 2bcb7045609e8ff023c9bfa384e63f6a2cc926ff3261f3eab6737f89a899809e -src/rocprof_compute_soc/analysis_configs/gfx942/0600_workgroup_manager_spi.yaml: 2bcb7045609e8ff023c9bfa384e63f6a2cc926ff3261f3eab6737f89a899809e -src/rocprof_compute_soc/analysis_configs/gfx950/0600_workgroup_manager_spi.yaml: 6d97f3ebf3bef1d164255d4c4979e43d7f313f1eda067324aad9be06be98f090 -src/rocprof_compute_soc/analysis_configs/gfx908/0700_wavefront.yaml: da9fb740f9dfafa43c8d0401d22082915d1c04021e07fb8003ac1f31005e282b -src/rocprof_compute_soc/analysis_configs/gfx90a/0700_wavefront.yaml: da9fb740f9dfafa43c8d0401d22082915d1c04021e07fb8003ac1f31005e282b -src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront.yaml: da9fb740f9dfafa43c8d0401d22082915d1c04021e07fb8003ac1f31005e282b 
-src/rocprof_compute_soc/analysis_configs/gfx941/0700_wavefront.yaml: da9fb740f9dfafa43c8d0401d22082915d1c04021e07fb8003ac1f31005e282b -src/rocprof_compute_soc/analysis_configs/gfx942/0700_wavefront.yaml: da9fb740f9dfafa43c8d0401d22082915d1c04021e07fb8003ac1f31005e282b -src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront.yaml: a6012921ec2e5984861d34ebfca416703b00f3b2cd4cb07541378a285a58b778 -src/rocprof_compute_soc/analysis_configs/gfx908/1000_compute_units_instruction_mix.yaml: 82ef2f27395f2887d1385a33b1d4bcb7cb646ece11146fe1238af2a2fc49108f -src/rocprof_compute_soc/analysis_configs/gfx90a/1000_compute_units_instruction_mix.yaml: e58c1dff540e06ec3021ae4e852cec5a116e978f00f3e0902b74b5d86f1b88ac -src/rocprof_compute_soc/analysis_configs/gfx940/1000_compute_units_instruction_mix.yaml: c74ada0b2cd9eda1e1115679267343e7afad9c9638b3a54b3f98193ae9637e09 -src/rocprof_compute_soc/analysis_configs/gfx941/1000_compute_units_instruction_mix.yaml: c74ada0b2cd9eda1e1115679267343e7afad9c9638b3a54b3f98193ae9637e09 -src/rocprof_compute_soc/analysis_configs/gfx942/1000_compute_units_instruction_mix.yaml: c74ada0b2cd9eda1e1115679267343e7afad9c9638b3a54b3f98193ae9637e09 -src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute_units_instruction_mix.yaml: a0fe88305b0972c0702e542558c0d491eac26438577660e58817e988b7b1f0d4 -src/rocprof_compute_soc/analysis_configs/gfx908/1100_compute_units_compute_pipeline.yaml: e815205890d9c815f7f53cdaa64eeef6219bce83054b92fa2be25e240093bdb0 -src/rocprof_compute_soc/analysis_configs/gfx90a/1100_compute_units_compute_pipeline.yaml: b44f500ee07856ec8c59afa1ebb0a204d8b5f3247a43725ba16782484fef6ad1 -src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute_units_compute_pipeline.yaml: e493741974eae65d88afd4fa98b6b3089fb483900b17af2630be18160964d80c -src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute_units_compute_pipeline.yaml: e493741974eae65d88afd4fa98b6b3089fb483900b17af2630be18160964d80c -src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute_units_compute_pipeline.yaml: e493741974eae65d88afd4fa98b6b3089fb483900b17af2630be18160964d80c -src/rocprof_compute_soc/analysis_configs/gfx950/1100_compute_units_compute_pipeline.yaml: 4797cd3052fdb37278aa9a28572287c1a9a7228f05a77ce22c0eb4786cbbd404 -src/rocprof_compute_soc/analysis_configs/gfx908/1200_local_data_share_lds.yaml: 307733f9fee02c620558e2ee4ca3978954f62c3fab26cc98766511b93e96d54c -src/rocprof_compute_soc/analysis_configs/gfx90a/1200_local_data_share_lds.yaml: 307733f9fee02c620558e2ee4ca3978954f62c3fab26cc98766511b93e96d54c -src/rocprof_compute_soc/analysis_configs/gfx940/1200_local_data_share_lds.yaml: 307733f9fee02c620558e2ee4ca3978954f62c3fab26cc98766511b93e96d54c -src/rocprof_compute_soc/analysis_configs/gfx941/1200_local_data_share_lds.yaml: 307733f9fee02c620558e2ee4ca3978954f62c3fab26cc98766511b93e96d54c -src/rocprof_compute_soc/analysis_configs/gfx942/1200_local_data_share_lds.yaml: 307733f9fee02c620558e2ee4ca3978954f62c3fab26cc98766511b93e96d54c -src/rocprof_compute_soc/analysis_configs/gfx950/1200_local_data_share_lds.yaml: 35c98741e9b5afd2f7638d2675b22138f5854168e15bc4633112857ed94edbc1 -src/rocprof_compute_soc/analysis_configs/gfx908/1300_instruction_cache.yaml: d2b0a455e9f28d66e6cef701d598072285c58eeebea2d08e1864a8602cdd797a -src/rocprof_compute_soc/analysis_configs/gfx90a/1300_instruction_cache.yaml: d2b0a455e9f28d66e6cef701d598072285c58eeebea2d08e1864a8602cdd797a -src/rocprof_compute_soc/analysis_configs/gfx940/1300_instruction_cache.yaml: 
d2b0a455e9f28d66e6cef701d598072285c58eeebea2d08e1864a8602cdd797a -src/rocprof_compute_soc/analysis_configs/gfx941/1300_instruction_cache.yaml: d2b0a455e9f28d66e6cef701d598072285c58eeebea2d08e1864a8602cdd797a -src/rocprof_compute_soc/analysis_configs/gfx942/1300_instruction_cache.yaml: d2b0a455e9f28d66e6cef701d598072285c58eeebea2d08e1864a8602cdd797a -src/rocprof_compute_soc/analysis_configs/gfx950/1300_instruction_cache.yaml: d2b0a455e9f28d66e6cef701d598072285c58eeebea2d08e1864a8602cdd797a -src/rocprof_compute_soc/analysis_configs/gfx908/1400_scalar_l1_data_cache.yaml: ccbe9a1309177db8760727f256cd14a7612708833828068dd2bede73ad319d75 -src/rocprof_compute_soc/analysis_configs/gfx90a/1400_scalar_l1_data_cache.yaml: ccbe9a1309177db8760727f256cd14a7612708833828068dd2bede73ad319d75 -src/rocprof_compute_soc/analysis_configs/gfx940/1400_scalar_l1_data_cache.yaml: ccbe9a1309177db8760727f256cd14a7612708833828068dd2bede73ad319d75 -src/rocprof_compute_soc/analysis_configs/gfx941/1400_scalar_l1_data_cache.yaml: ccbe9a1309177db8760727f256cd14a7612708833828068dd2bede73ad319d75 -src/rocprof_compute_soc/analysis_configs/gfx942/1400_scalar_l1_data_cache.yaml: ccbe9a1309177db8760727f256cd14a7612708833828068dd2bede73ad319d75 -src/rocprof_compute_soc/analysis_configs/gfx950/1400_scalar_l1_data_cache.yaml: ccbe9a1309177db8760727f256cd14a7612708833828068dd2bede73ad319d75 -src/rocprof_compute_soc/analysis_configs/gfx908/1500_address_processing_unit_and_data_return_path_ta_td.yaml: b98a800c31da0275704e076e561468dccdaf0b8bff1cc8d74a4e6bf9c7be2973 -src/rocprof_compute_soc/analysis_configs/gfx90a/1500_address_processing_unit_and_data_return_path_ta_td.yaml: 58834a04fc4fb6f9eb648a6b8944f737ce4a8c9d4a6c5f75104d9fd528f520a6 -src/rocprof_compute_soc/analysis_configs/gfx940/1500_address_processing_unit_and_data_return_path_ta_td.yaml: 7f37bcd01557a45aa5ed9009962a9f2499ad924a6a07d7d25a3af97138f360f8 -src/rocprof_compute_soc/analysis_configs/gfx941/1500_address_processing_unit_and_data_return_path_ta_td.yaml: 7f37bcd01557a45aa5ed9009962a9f2499ad924a6a07d7d25a3af97138f360f8 -src/rocprof_compute_soc/analysis_configs/gfx942/1500_address_processing_unit_and_data_return_path_ta_td.yaml: 7f37bcd01557a45aa5ed9009962a9f2499ad924a6a07d7d25a3af97138f360f8 -src/rocprof_compute_soc/analysis_configs/gfx950/1500_address_processing_unit_and_data_return_path_ta_td.yaml: 5c6555a93b01c057f01e0b0cef3169eeb324ca8c256c42f5f9fc0d1ea131486b -src/rocprof_compute_soc/analysis_configs/gfx908/1600_vector_l1_data_cache.yaml: 4fcb618450366a29c09e428368e1a9afd29a0b80ec3f03a5b3d55a2111bd5704 -src/rocprof_compute_soc/analysis_configs/gfx90a/1600_vector_l1_data_cache.yaml: 4fcb618450366a29c09e428368e1a9afd29a0b80ec3f03a5b3d55a2111bd5704 -src/rocprof_compute_soc/analysis_configs/gfx940/1600_vector_l1_data_cache.yaml: 1c25e20d701aff1ab9276a29cfd5f219b24c621b534aa5b86d1b78d2ae2f300a -src/rocprof_compute_soc/analysis_configs/gfx941/1600_vector_l1_data_cache.yaml: 1c25e20d701aff1ab9276a29cfd5f219b24c621b534aa5b86d1b78d2ae2f300a -src/rocprof_compute_soc/analysis_configs/gfx942/1600_vector_l1_data_cache.yaml: 1c25e20d701aff1ab9276a29cfd5f219b24c621b534aa5b86d1b78d2ae2f300a -src/rocprof_compute_soc/analysis_configs/gfx950/1600_vector_l1_data_cache.yaml: 3cec51c5a848c4f513c4c0a74aa35a5657289148a67179f8db4ea3e55bdb6ac3 -src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml: e37693ef03caf3d77ae7b91c3c166d033fa0732880cc50a21b8c06a4e79b1f38 -src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml: 
3314a1e473b1cfc95b742b1a8cfbc47d4602061ca89d7a4ac89ea7cc15908962 -src/rocprof_compute_soc/analysis_configs/gfx940/1700_l2_cache.yaml: cb8922a41dd2088e8e2b0c1e82c7b95fa55304cf90435b217da128234805d77a -src/rocprof_compute_soc/analysis_configs/gfx941/1700_l2_cache.yaml: 2187f141480a2c57b271ded46255735510de5197441de830cf1efa9345e5566a -src/rocprof_compute_soc/analysis_configs/gfx942/1700_l2_cache.yaml: 7ce34989a66b8f8750cf1bf76f5cdaf59bf662a7205355f6fe12cace796d4ceb -src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml: a3a8db0f555cd1069a61dfc3b89df83e9423d4a0200f1401c7612942ff75152e -src/rocprof_compute_soc/analysis_configs/gfx908/1800_l2_cache_per_channel.yaml: fd32454bf9f0d3027c77a85ea6be308e92f6815d0ea732c6bafacc8e0f32a25f -src/rocprof_compute_soc/analysis_configs/gfx90a/1800_l2_cache_per_channel.yaml: 23e9a258ab541d24d29cde2237f9445db695e7a4d17d5974cb4fd5ff9a9869c0 -src/rocprof_compute_soc/analysis_configs/gfx940/1800_l2_cache_per_channel.yaml: eb0823823506bfe0d40931fd69c435baab4979d2dfee158dc33c3651721f9f33 -src/rocprof_compute_soc/analysis_configs/gfx941/1800_l2_cache_per_channel.yaml: eb0823823506bfe0d40931fd69c435baab4979d2dfee158dc33c3651721f9f33 -src/rocprof_compute_soc/analysis_configs/gfx942/1800_l2_cache_per_channel.yaml: eb0823823506bfe0d40931fd69c435baab4979d2dfee158dc33c3651721f9f33 -src/rocprof_compute_soc/analysis_configs/gfx950/1800_l2_cache_per_channel.yaml: b6336ab78a97fb9750e2f925893a5acc4e66e43ac60472c20225e56c440983d7 -src/rocprof_compute_soc/analysis_configs/gfx908/2100_pc_sampling.yaml: efa16d3aadc3363bcc067895a978b82c1e06fa90882dfe33e03315e2c0425d36 -src/rocprof_compute_soc/analysis_configs/gfx90a/2100_pc_sampling.yaml: efa16d3aadc3363bcc067895a978b82c1e06fa90882dfe33e03315e2c0425d36 -src/rocprof_compute_soc/analysis_configs/gfx940/2100_pc_sampling.yaml: efa16d3aadc3363bcc067895a978b82c1e06fa90882dfe33e03315e2c0425d36 -src/rocprof_compute_soc/analysis_configs/gfx941/2100_pc_sampling.yaml: efa16d3aadc3363bcc067895a978b82c1e06fa90882dfe33e03315e2c0425d36 -src/rocprof_compute_soc/analysis_configs/gfx942/2100_pc_sampling.yaml: efa16d3aadc3363bcc067895a978b82c1e06fa90882dfe33e03315e2c0425d36 -src/rocprof_compute_soc/analysis_configs/gfx950/2100_pc_sampling.yaml: efa16d3aadc3363bcc067895a978b82c1e06fa90882dfe33e03315e2c0425d36 -src/rocprof_compute_soc/profile_configs/sets/gfx908_sets.yaml: ee28989e70d0537db8b0f0a4bc5499444b44ff0e73d3e7f2926943be11d0aeda -src/rocprof_compute_soc/profile_configs/sets/gfx90a_sets.yaml: 9c9533174a3f7bd5c8e09ec998743c7bb2642c4ce3f818b546673be9cafc40a8 -src/rocprof_compute_soc/profile_configs/sets/gfx940_sets.yaml: 44cd2b32b050cafa73d0ead5703b82836edf25a057c21699046b6b8b8918b242 -src/rocprof_compute_soc/profile_configs/sets/gfx941_sets.yaml: 44cd2b32b050cafa73d0ead5703b82836edf25a057c21699046b6b8b8918b242 -src/rocprof_compute_soc/profile_configs/sets/gfx942_sets.yaml: 44cd2b32b050cafa73d0ead5703b82836edf25a057c21699046b6b8b8918b242 -src/rocprof_compute_soc/profile_configs/sets/gfx950_sets.yaml: 238d9dc8a98cfead3fc904885bfe413e5bcb4f1af31e9820cd640388bcd1e1c2 -docs/data/metrics_description.yaml: 12164b43dab4a1088f90763a80ffc8feb38aa82fd7b767edf8f65bd304f22162 +# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from tools/unified_config.yaml. 
Generated by tools/split_config.py +{} diff --git a/projects/rocprofiler-compute/tools/config_management/.config_hashes.json b/projects/rocprofiler-compute/tools/config_management/.config_hashes.json index 5989df1edff..64e23763482 100644 --- a/projects/rocprofiler-compute/tools/config_management/.config_hashes.json +++ b/projects/rocprofiler-compute/tools/config_management/.config_hashes.json @@ -5,19 +5,19 @@ "files": { "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f", "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8", - "0200_system_speed_of_light.yaml": "c54676a8a385c02be50fcf09a721bef6", - "0300_memory_chart.yaml": "f952fe7de6d86cb22f6f8ce34867905f", - "0400_roofline.yaml": "02ca6cf3583f2718ab371bbbfdd8cfef", - "0500_command_processor_cpc_cpf.yaml": "93174ba73bf014c143e179719c110db6", - "0600_workgroup_manager_spi.yaml": "7364c8431929891d587e6f9b96ddce10", - "0700_wavefront.yaml": "5cc88d7743cba8c638491d97725f6778", + "0200_system_speed_of_light.yaml": "c4878ac57b7b7b4b5711672cb2f6dffc", + "0300_memory_chart.yaml": "221c6d2bb50a4f4177585b9988f88c7b", + "0400_roofline.yaml": "bad8d851694ff9a140e29a148a35fa50", + "0500_command_processor_cpc_cpf.yaml": "d8f424ec3fcfa4b2fcee2ad5e6456531", + "0600_workgroup_manager_spi.yaml": "8b6a89de516bed5821a9849627ad634a", + "0700_wavefront.yaml": "84ecb62efa87d87288bb63f7e3871028", "1000_compute_units_instruction_mix.yaml": "e96eccdcb0e5d28b292107c0f68ec845", "1100_compute_units_compute_pipeline.yaml": "8f61973d0d08bf49895b5dfe32d05c09", - "1200_local_data_share_lds.yaml": "97be647681c51e762e774eb91e8283fc", - "1300_instruction_cache.yaml": "dcab979ce17e30f2d48fd2734bf08e74", - "1400_scalar_l1_data_cache.yaml": "e90a514e7bb597ec1d22e238650c81d9", + "1200_local_data_share_lds.yaml": "14da923095596b4f0eeb82fd24bfd67c", + "1300_instruction_cache.yaml": "4b7696d75c93e55f7877e07770beda2d", + "1400_scalar_l1_data_cache.yaml": "ea6d0cdb6c34f574248f09554e976599", "1500_address_processing_unit_and_data_return_path_ta_td.yaml": "645eb10a440eed62c6250a0f5a2407f3", - "1600_vector_l1_data_cache.yaml": "e3b8d1787003094ab7b8372da818ff1e", + "1600_vector_l1_data_cache.yaml": "1daa7d96605e8cdf4116bf3b10fb9969", "1700_l2_cache.yaml": "38e7db4c404007c471864251dff30570", "1800_l2_cache_per_channel.yaml": "7193043cd8eee47501cd8c0ae02b51e9", "2100_pc_sampling.yaml": "8049866f25214544f1e53a9e2f08399b" @@ -28,19 +28,19 @@ "files": { "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f", "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8", - "0200_system_speed_of_light.yaml": "747b14ab50dd4d7689af7c268569b32a", - "0300_memory_chart.yaml": "0d6d094ad24cebf6e583e643beaae06e", - "0400_roofline.yaml": "632b16e1d251e57de0cf7237d3a89766", - "0500_command_processor_cpc_cpf.yaml": "93174ba73bf014c143e179719c110db6", - "0600_workgroup_manager_spi.yaml": "7364c8431929891d587e6f9b96ddce10", - "0700_wavefront.yaml": "5cc88d7743cba8c638491d97725f6778", - "1000_compute_units_instruction_mix.yaml": "af6304cce1fe38c119b1d17fa635265c", - "1100_compute_units_compute_pipeline.yaml": "c38ece6032d757f394c83ad9f93e0dce", - "1200_local_data_share_lds.yaml": "97be647681c51e762e774eb91e8283fc", - "1300_instruction_cache.yaml": "dcab979ce17e30f2d48fd2734bf08e74", - "1400_scalar_l1_data_cache.yaml": "e90a514e7bb597ec1d22e238650c81d9", + "0200_system_speed_of_light.yaml": "dc6a6e1a8513e2d32aecc055a958c639", + "0300_memory_chart.yaml": "a61f219fe063c4c4b0b9cbaf96389a8b", + "0400_roofline.yaml": "da1d514ed19ca2466c167e983bdb4f13", + "0500_command_processor_cpc_cpf.yaml": 
"d8f424ec3fcfa4b2fcee2ad5e6456531", + "0600_workgroup_manager_spi.yaml": "8b6a89de516bed5821a9849627ad634a", + "0700_wavefront.yaml": "84ecb62efa87d87288bb63f7e3871028", + "1000_compute_units_instruction_mix.yaml": "84bd6a22a29335a4851bba675614e103", + "1100_compute_units_compute_pipeline.yaml": "39429cd6af68f91f1b20630c1bab8cc7", + "1200_local_data_share_lds.yaml": "14da923095596b4f0eeb82fd24bfd67c", + "1300_instruction_cache.yaml": "4b7696d75c93e55f7877e07770beda2d", + "1400_scalar_l1_data_cache.yaml": "ea6d0cdb6c34f574248f09554e976599", "1500_address_processing_unit_and_data_return_path_ta_td.yaml": "8005b28532601a759ace2f653d10da56", - "1600_vector_l1_data_cache.yaml": "e3b8d1787003094ab7b8372da818ff1e", + "1600_vector_l1_data_cache.yaml": "1daa7d96605e8cdf4116bf3b10fb9969", "1700_l2_cache.yaml": "1630ae8fc504ea056e91bb19909d5629", "1800_l2_cache_per_channel.yaml": "5ee4fd9c849670c301c4afee257acddd", "2100_pc_sampling.yaml": "8049866f25214544f1e53a9e2f08399b" @@ -51,19 +51,19 @@ "files": { "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f", "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8", - "0200_system_speed_of_light.yaml": "74482aebb54b6d7b429c9ca605cb9951", - "0300_memory_chart.yaml": "e28ebc1340d2db1948c68225a6e008ff", - "0400_roofline.yaml": "1f3888778245e7eb05e769bda605588a", - "0500_command_processor_cpc_cpf.yaml": "93174ba73bf014c143e179719c110db6", - "0600_workgroup_manager_spi.yaml": "7364c8431929891d587e6f9b96ddce10", - "0700_wavefront.yaml": "5cc88d7743cba8c638491d97725f6778", - "1000_compute_units_instruction_mix.yaml": "ac290954de96988004b2a4be345a3a25", - "1100_compute_units_compute_pipeline.yaml": "470e3093ce9d53211923d3400e7e7bd7", - "1200_local_data_share_lds.yaml": "97be647681c51e762e774eb91e8283fc", - "1300_instruction_cache.yaml": "dcab979ce17e30f2d48fd2734bf08e74", - "1400_scalar_l1_data_cache.yaml": "e90a514e7bb597ec1d22e238650c81d9", + "0200_system_speed_of_light.yaml": "8b413c47f06f2e94b3faa723daac8edd", + "0300_memory_chart.yaml": "3d6c88ab2704dc4bd72a63e99fd68cf7", + "0400_roofline.yaml": "d4650e008f2e3a7d28871e8518153575", + "0500_command_processor_cpc_cpf.yaml": "d8f424ec3fcfa4b2fcee2ad5e6456531", + "0600_workgroup_manager_spi.yaml": "8b6a89de516bed5821a9849627ad634a", + "0700_wavefront.yaml": "84ecb62efa87d87288bb63f7e3871028", + "1000_compute_units_instruction_mix.yaml": "89c03d53fd7563da965ff3e4a1698b02", + "1100_compute_units_compute_pipeline.yaml": "0c3c36f6c2fed1f14476966295338e74", + "1200_local_data_share_lds.yaml": "14da923095596b4f0eeb82fd24bfd67c", + "1300_instruction_cache.yaml": "4b7696d75c93e55f7877e07770beda2d", + "1400_scalar_l1_data_cache.yaml": "ea6d0cdb6c34f574248f09554e976599", "1500_address_processing_unit_and_data_return_path_ta_td.yaml": "12fe315acb3e06d4c16e4538f418f0ca", - "1600_vector_l1_data_cache.yaml": "006854a23925320b94727261f30680b7", + "1600_vector_l1_data_cache.yaml": "ebff7d80c601d03027476ae9fb16ecae", "1700_l2_cache.yaml": "0987e21ac2547134fea87499dee01847", "1800_l2_cache_per_channel.yaml": "ba5eeabcd749ecbb107c42de5ce69317", "2100_pc_sampling.yaml": "8049866f25214544f1e53a9e2f08399b" @@ -74,19 +74,19 @@ "files": { "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f", "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8", - "0200_system_speed_of_light.yaml": "7ed2ceba47e232b4e39431228a254f7f", - "0300_memory_chart.yaml": "e28ebc1340d2db1948c68225a6e008ff", - "0400_roofline.yaml": "a80de496435c2c76eb4cfdc38d62155f", - "0500_command_processor_cpc_cpf.yaml": "93174ba73bf014c143e179719c110db6", 
- "0600_workgroup_manager_spi.yaml": "7364c8431929891d587e6f9b96ddce10", - "0700_wavefront.yaml": "5cc88d7743cba8c638491d97725f6778", - "1000_compute_units_instruction_mix.yaml": "ac290954de96988004b2a4be345a3a25", - "1100_compute_units_compute_pipeline.yaml": "470e3093ce9d53211923d3400e7e7bd7", - "1200_local_data_share_lds.yaml": "97be647681c51e762e774eb91e8283fc", - "1300_instruction_cache.yaml": "dcab979ce17e30f2d48fd2734bf08e74", - "1400_scalar_l1_data_cache.yaml": "e90a514e7bb597ec1d22e238650c81d9", + "0200_system_speed_of_light.yaml": "0ddeaefd245291c7f88674431efd74f6", + "0300_memory_chart.yaml": "3d6c88ab2704dc4bd72a63e99fd68cf7", + "0400_roofline.yaml": "c066a19bc0e00e692c34998e44c62387", + "0500_command_processor_cpc_cpf.yaml": "d8f424ec3fcfa4b2fcee2ad5e6456531", + "0600_workgroup_manager_spi.yaml": "8b6a89de516bed5821a9849627ad634a", + "0700_wavefront.yaml": "84ecb62efa87d87288bb63f7e3871028", + "1000_compute_units_instruction_mix.yaml": "89c03d53fd7563da965ff3e4a1698b02", + "1100_compute_units_compute_pipeline.yaml": "0c3c36f6c2fed1f14476966295338e74", + "1200_local_data_share_lds.yaml": "14da923095596b4f0eeb82fd24bfd67c", + "1300_instruction_cache.yaml": "4b7696d75c93e55f7877e07770beda2d", + "1400_scalar_l1_data_cache.yaml": "ea6d0cdb6c34f574248f09554e976599", "1500_address_processing_unit_and_data_return_path_ta_td.yaml": "12fe315acb3e06d4c16e4538f418f0ca", - "1600_vector_l1_data_cache.yaml": "006854a23925320b94727261f30680b7", + "1600_vector_l1_data_cache.yaml": "ebff7d80c601d03027476ae9fb16ecae", "1700_l2_cache.yaml": "05a86637744ad66f6491620c4ad659d2", "1800_l2_cache_per_channel.yaml": "ba5eeabcd749ecbb107c42de5ce69317", "2100_pc_sampling.yaml": "8049866f25214544f1e53a9e2f08399b" @@ -97,19 +97,19 @@ "files": { "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f", "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8", - "0200_system_speed_of_light.yaml": "74482aebb54b6d7b429c9ca605cb9951", - "0300_memory_chart.yaml": "e28ebc1340d2db1948c68225a6e008ff", - "0400_roofline.yaml": "f94c87dad18f87e5582566276a5c0cfc", - "0500_command_processor_cpc_cpf.yaml": "93174ba73bf014c143e179719c110db6", - "0600_workgroup_manager_spi.yaml": "7364c8431929891d587e6f9b96ddce10", - "0700_wavefront.yaml": "5cc88d7743cba8c638491d97725f6778", - "1000_compute_units_instruction_mix.yaml": "ac290954de96988004b2a4be345a3a25", - "1100_compute_units_compute_pipeline.yaml": "470e3093ce9d53211923d3400e7e7bd7", - "1200_local_data_share_lds.yaml": "97be647681c51e762e774eb91e8283fc", - "1300_instruction_cache.yaml": "dcab979ce17e30f2d48fd2734bf08e74", - "1400_scalar_l1_data_cache.yaml": "e90a514e7bb597ec1d22e238650c81d9", + "0200_system_speed_of_light.yaml": "8b413c47f06f2e94b3faa723daac8edd", + "0300_memory_chart.yaml": "3d6c88ab2704dc4bd72a63e99fd68cf7", + "0400_roofline.yaml": "318c3e774d41a639628a7f72c2462375", + "0500_command_processor_cpc_cpf.yaml": "d8f424ec3fcfa4b2fcee2ad5e6456531", + "0600_workgroup_manager_spi.yaml": "8b6a89de516bed5821a9849627ad634a", + "0700_wavefront.yaml": "84ecb62efa87d87288bb63f7e3871028", + "1000_compute_units_instruction_mix.yaml": "89c03d53fd7563da965ff3e4a1698b02", + "1100_compute_units_compute_pipeline.yaml": "0c3c36f6c2fed1f14476966295338e74", + "1200_local_data_share_lds.yaml": "14da923095596b4f0eeb82fd24bfd67c", + "1300_instruction_cache.yaml": "4b7696d75c93e55f7877e07770beda2d", + "1400_scalar_l1_data_cache.yaml": "ea6d0cdb6c34f574248f09554e976599", "1500_address_processing_unit_and_data_return_path_ta_td.yaml": "12fe315acb3e06d4c16e4538f418f0ca", - 
"1600_vector_l1_data_cache.yaml": "006854a23925320b94727261f30680b7", + "1600_vector_l1_data_cache.yaml": "ebff7d80c601d03027476ae9fb16ecae", "1700_l2_cache.yaml": "96e49399b26d00d88ad534a35c95304b", "1800_l2_cache_per_channel.yaml": "ba5eeabcd749ecbb107c42de5ce69317", "2100_pc_sampling.yaml": "8049866f25214544f1e53a9e2f08399b" @@ -120,20 +120,20 @@ "files": { "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f", "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8", - "0200_system_speed_of_light.yaml": "4a215bccc9378583a6e7e7733b601537", - "0300_memory_chart.yaml": "f19548711a687779df0c0b87a1df7a27", - "0400_roofline.yaml": "156c1a1d7a6c1e55aea25552334a84d5", - "0500_command_processor_cpc_cpf.yaml": "5b67ff80efbc2e1dffb7e3922499ca88", - "0600_workgroup_manager_spi.yaml": "63a7b6f7a4487fb87d67549214e08aac", - "0700_wavefront.yaml": "1ecfc3a91ec0cce6ed9eb94afae17aa9", - "1000_compute_units_instruction_mix.yaml": "7088fafcaa66a8ec48a9d3939cd7339a", - "1100_compute_units_compute_pipeline.yaml": "fce707e3f419ee2708676c8f7c325df5", - "1200_local_data_share_lds.yaml": "06bee89ddab210dbd122eaaedef0b29a", - "1300_instruction_cache.yaml": "dcab979ce17e30f2d48fd2734bf08e74", - "1400_scalar_l1_data_cache.yaml": "e90a514e7bb597ec1d22e238650c81d9", + "0200_system_speed_of_light.yaml": "a5ee49ce96bfab87128c856c827db870", + "0300_memory_chart.yaml": "e2401641a8f280fda308f87e5ad243df", + "0400_roofline.yaml": "2bd3b630b72d6d165c0d30cf481136a9", + "0500_command_processor_cpc_cpf.yaml": "3f7dab1663ad7a6fae3801aec2b1e8d0", + "0600_workgroup_manager_spi.yaml": "e6546a92d283fed5a5dc6df203efb670", + "0700_wavefront.yaml": "330468fd711057b422de9b952c5cfe69", + "1000_compute_units_instruction_mix.yaml": "c8bbdde1f29c9548a8e0ed7fcdd9ae04", + "1100_compute_units_compute_pipeline.yaml": "30e64960bbac4cc5626615a60240bd5f", + "1200_local_data_share_lds.yaml": "0e57c559dbcd5526e2e8006a47a69f4b", + "1300_instruction_cache.yaml": "4b7696d75c93e55f7877e07770beda2d", + "1400_scalar_l1_data_cache.yaml": "ea6d0cdb6c34f574248f09554e976599", "1500_address_processing_unit_and_data_return_path_ta_td.yaml": "355a0c6b9b113fcfb686a300b78be21a", - "1600_vector_l1_data_cache.yaml": "68382e45c7a3c578df861d6285024803", - "1700_l2_cache.yaml": "f70f23b93e97b99327b5db3907eb133e", + "1600_vector_l1_data_cache.yaml": "689aba850739a9cbd64ce1e816e95dff", + "1700_l2_cache.yaml": "067f8c8a7264762fdc58a41728b4382b", "1800_l2_cache_per_channel.yaml": "7e2a1809a9b7f70a088068d6689c8aa4", "2100_pc_sampling.yaml": "8049866f25214544f1e53a9e2f08399b" } diff --git a/projects/rocprofiler-compute/tools/config_management/README.md b/projects/rocprofiler-compute/tools/config_management/README.md index 3677bdf2c66..d19ecf5e95a 100644 --- a/projects/rocprofiler-compute/tools/config_management/README.md +++ b/projects/rocprofiler-compute/tools/config_management/README.md @@ -97,7 +97,7 @@ Addition: metric_descriptions: New Metric: plain: Description text - rst: |- # Optional + rst: >- # Optional Description with :ref:`RST markup ` Deletion: @@ -231,7 +231,7 @@ Modification: metric_descriptions: Existing Metric: plain: Updated description - rst: |- + rst: >- Updated description with **RST**" ``` diff --git a/projects/rocprofiler-compute/tools/config_management/metric_description_manager.py b/projects/rocprofiler-compute/tools/config_management/metric_description_manager.py index 6c197c89d55..3fa75d5122a 100644 --- a/projects/rocprofiler-compute/tools/config_management/metric_description_manager.py +++ 
b/projects/rocprofiler-compute/tools/config_management/metric_description_manager.py @@ -114,11 +114,9 @@ def merge_docs_rst_as_default(descs: dict, docs_file: Path) -> dict: for section, metrics in descs.items(): docs_section = docs.get(section) or {} for metric_name, d in metrics.items(): - # If panel didn't explicitly provide rst, inherit from docs - if not d.get("rst"): - doc_entry = docs_section.get(metric_name) or {} - if doc_entry.get("rst"): - d["rst"] = doc_entry["rst"] + doc_entry = docs_section.get(metric_name) or {} + if doc_entry.get("rst"): + d["rst"] = doc_entry["rst"] return descs @@ -129,10 +127,6 @@ def merge_units_as_default(descs: dict, docs_file: Path, per_arch_file: Path) -> 2) else from docs file, 3) else leave as-is (missing). """ - per_arch: dict = {} - if per_arch_file.exists(): - with open(per_arch_file, "r", encoding="utf-8") as f: - per_arch = yaml.safe_load(f) or {} docs: dict = {} if docs_file.exists(): @@ -140,18 +134,11 @@ def merge_units_as_default(descs: dict, docs_file: Path, per_arch_file: Path) -> docs = yaml.safe_load(f) or {} for section, metrics in descs.items(): - psec = per_arch.get(section) or {} dsec = docs.get(section) or {} for metric, data in metrics.items(): - # Only fill if panel did NOT explicitly set unit - if "unit" not in data or data["unit"] is None: - unit = None - if metric in psec and isinstance(psec[metric], dict): - unit = psec[metric].get("unit") - if unit is None and metric in dsec and isinstance(dsec[metric], dict): - unit = dsec[metric].get("unit") - if unit is not None: - data["unit"] = unit + doc_entry = dsec.get(metric) + if doc_entry and "unit" in doc_entry: + data["unit"] = doc_entry["unit"] return descs @@ -403,7 +390,7 @@ def sync_arch( update_per_arch_metrics_file(arch_name, descriptions, per_arch_metrics_dir) # 5) Only when latest: update docs, but overwrite 'rst' only for overrides - if is_latest: + if is_latest and (panel_rst_overrides or panel_unit_overrides): if not update_docs_metrics_file( descriptions, docs_metrics_file, diff --git a/projects/rocprofiler-compute/tools/config_management/utils.py b/projects/rocprofiler-compute/tools/config_management/utils.py index 0af6e5acebe..d79d65a2574 100644 --- a/projects/rocprofiler-compute/tools/config_management/utils.py +++ b/projects/rocprofiler-compute/tools/config_management/utils.py @@ -31,7 +31,7 @@ def str_representer(dumper, data): if "\n" in data: - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=">") return dumper.represent_scalar("tag:yaml.org,2002:str", data) diff --git a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx908_metrics_description.yaml b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx908_metrics_description.yaml index 0fd8a4b262e..10e5304b9cb 100644 --- a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx908_metrics_description.yaml +++ b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx908_metrics_description.yaml @@ -1,20 +1,20 @@ System Speed-of-Light: VALU FLOPs: - rst: |- + rst: >- The total floating-point operations executed per second on the :ref:`VALU `. This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this does not include any floating-point operations from :ref:`MFMA ` instructions. unit: GFLOPs VALU IOPs: - rst: |- + rst: >- The total integer operations executed per second on the :ref:`VALU `. 
This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations from :ref:`MFMA ` instructions. unit: GOIPs MFMA FLOPs (BF16): - rst: |- + rst: >- The total number of 16-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. This @@ -22,7 +22,7 @@ System Speed-of-Light: achievable on the specific accelerator. unit: GFLOPs MFMA FLOPs (F16): - rst: |- + rst: >- The total number of 16-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -30,7 +30,7 @@ System Speed-of-Light: specific accelerator. unit: GFLOPs MFMA FLOPs (F32): - rst: |- + rst: >- The total number of 32-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 32-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -38,7 +38,7 @@ System Speed-of-Light: specific accelerator. unit: GFLOPs MFMA FLOPs (F64): - rst: |- + rst: >- The total number of 64-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 64-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -46,7 +46,7 @@ System Speed-of-Light: specific accelerator. unit: GFLOPs MFMA IOPs (Int8): - rst: |- + rst: >- The total number of 8-bit integer :ref:`MFMA ` operations executed per second. Note: this does not include any 8-bit integer operations from :ref:`VALU ` instructions. This is also presented as a percent @@ -99,7 +99,7 @@ System Speed-of-Light: over the :ref:`total active CU cycles `. unit: Instructions per-cycle Wavefront Occupancy: - rst: |- + rst: >- The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). This is also presented as a percent of the peak theoretical @@ -144,7 +144,7 @@ System Speed-of-Light: peak theoretical bandwidth achievable on the specific accelerator. unit: GB/s L2-Fabric Read BW: - rst: |- + rst: >- The number of bytes read by the L2 over the :ref:`Infinity Fabric\u2122 interface ` per unit time. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. @@ -229,19 +229,19 @@ Memory Chart: rst: Total number of compute units (CUs) on the accelerator. unit: CUs VGPR: - rst: |- + rst: >- The number of architected vector general-purpose registers allocated for the kernel, see :ref:`VALU `. Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. unit: VGPRs SGPR: - rst: |- + rst: >- The number of scalar general-purpose registers allocated for the kernel, see :ref:`SALU `. Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. unit: SGPRs LDS Allocation: - rst: |- + rst: >- The number of bytes of :doc:`LDS ` memory (or, shared memory) allocated for this kernel. 
Note: This may also be larger than what was requested at compile time due to both allocation granularity and dynamic per-dispatch @@ -420,28 +420,28 @@ Memory Chart: unit: Requests per normalization unit Roofline Performance Rates: VALU FLOPs (F16): - rst: |- + rst: >- The total 16-bit floating-point operations executed per second on the :ref:`VALU `. This is presented with the value of the peak empirical F16 FLOPs achievable on the specific accelerator. Note: this does not include any F16 operations from :ref:`MFMA ` instructions. unit: GFLOPs VALU FLOPs (F32): - rst: |- + rst: >- The total 32-bit floating-point operations executed per second on the :ref:`VALU `. This is presented with the value of the peak empirical F32 FLOPs achievable on the specific accelerator. Note: this does not include any F32 operations from :ref:`MFMA ` instructions. unit: GFLOPs VALU FLOPs (F64): - rst: |- + rst: >- The total 64-bit floating-point operations executed per second on the :ref:`VALU `. This is presented with the value of the peak empirical F64 FLOPs achievable on the specific accelerator. Note: this does not include any F64 operations from :ref:`MFMA ` instructions. unit: GFLOPs MFMA FLOPs (F64): - rst: |- + rst: >- The total number of 64-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 64-bit floating point operations from :ref:`VALU ` instructions. The peak empirically @@ -449,7 +449,7 @@ Roofline Performance Rates: displayed alongside for comparison. unit: GFLOPs MFMA FLOPs (F32): - rst: |- + rst: >- The total number of 32-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 32-bit floating point operations from :ref:`VALU ` instructions. The peak empirically @@ -457,7 +457,7 @@ Roofline Performance Rates: displayed alongside for comparison. unit: GFLOPs MFMA FLOPs (F16): - rst: |- + rst: >- The total number of 16-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit floating point operations from :ref:`VALU ` instructions. The peak empirically @@ -465,7 +465,7 @@ Roofline Performance Rates: displayed alongside for comparison. unit: GFLOPs MFMA FLOPs (BF16): - rst: |- + rst: >- The total number of 16-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. The @@ -473,7 +473,7 @@ Roofline Performance Rates: accelerator is displayed alongside for comparison. unit: GFLOPs MFMA IOPs (Int8): - rst: |- + rst: >- The total number of 8-bit integer :ref:`MFMA ` operations executed per second. Note: this does not include any 8-bit integer operations from :ref:`VALU ` instructions. The peak empirically measured INT8 MFMA @@ -481,7 +481,7 @@ Roofline Performance Rates: for comparison. unit: GIOPs HBM Bandwidth: - rst: |- + rst: >- The total number of bytes read from and written to High-Bandwidth Memory (HBM) per second. The peak empirically measured bandwidth achievable on the specific accelerator is displayed alongside for comparison. @@ -512,28 +512,28 @@ Roofline Performance Rates: unit: GB/s Roofline Plot Points: AI HBM: - rst: |- + rst: >- The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM). It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between HBM and the L2 cache. This value is used as the x-coordinate for the HBM roofline. 
unit: FLOPs/Byte AI L2: - rst: |- + rst: >- The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between the L2 cache and the L1 cache. This value is used as the x-coordinate for the L2 roofline. unit: FLOPs/Byte AI L1: - rst: |- + rst: >- The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between the L1 cache and the processing units. This value is used as the x-coordinate for the L1 roofline. unit: FLOPs/Byte Performance (GFLOPs): - rst: |- + rst: >- The overall achieved performance, measured in GigaFLOPs per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point operations divided by the total execution time. This value is used as the y-coordinate @@ -591,7 +591,7 @@ Workgroup manager utilizations: any work. unit: Percent Scheduler-Pipe Utilization: - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where the scheduler-pipes were actively doing any work. Note: this value is expected to range between 0% and 25%. See :ref:`desc-spi`. @@ -629,7 +629,7 @@ Workgroup manager utilizations: unit: Cycles/wave Workgroup Manager - Resource Allocation: Not-scheduled Rate (Workgroup Manager): - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where a workgroup could not be scheduled to a :doc:`CU ` due to a bottleneck within the workgroup manager rather than a lack of a @@ -638,7 +638,7 @@ Workgroup Manager - Resource Allocation: description. unit: Percent Not-scheduled Rate (Scheduler-Pipe): - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where a workgroup could not be scheduled to a :doc:`CU ` due to a bottleneck within the scheduler-pipes rather than a lack of a CU @@ -647,7 +647,7 @@ Workgroup Manager - Resource Allocation: description. unit: Percent Scheduler-Pipe Stall Rate: - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where a workgroup could not be scheduled to a :doc:`CU ` due to occupancy limitations (like a lack of a CU or :ref:`SIMD ` @@ -711,7 +711,7 @@ Wavefront Launch Stats: block size. unit: Work-Items Total Wavefronts: - rst: |- + rst: >- The total number of wavefronts launched as part of the kernel dispatch. On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront size is always 64 work-items. Thus, the total number of wavefronts should @@ -726,25 +726,25 @@ Wavefront Launch Stats: `_. unit: Wavefronts VGPRs: - rst: |- + rst: >- The number of architected vector general-purpose registers allocated for the kernel, see :ref:`VALU `. Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. unit: VGPRs AGPRs: - rst: |- + rst: >- The number of accumulation vector general-purpose registers allocated for the kernel, see :ref:`AGPRs `. Note: this may not exactly match the number of AGPRs requested by the compiler due to allocation granularity. unit: AGPRs SGPRs: - rst: |- + rst: >- The number of scalar general-purpose registers allocated for the kernel, see :ref:`SALU `. Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. unit: SGPRs LDS Allocation: - rst: |- + rst: >- The number of bytes of :doc:`LDS ` memory (or, shared memory) allocated for this kernel. 
Note: This may also be larger than what was requested at compile time due to both allocation granularity and dynamic per-dispatch @@ -767,7 +767,7 @@ Wavefront Runtime Stats: This is averaged over all wavefronts in a kernel dispatch. unit: Instructions per wavefront Wave Cycles: - rst: |- + rst: >- The number of cycles a wavefront in the kernel dispatch spent resident on a compute unit per :ref:`normalization unit `. This is averaged over all wavefronts in a kernel dispatch. Note: this should not @@ -805,7 +805,7 @@ Wavefront Runtime Stats: the total Wave Cycles metric. unit: Cycles per normalization unit Wavefront Occupancy: - rst: |- + rst: >- The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). @@ -976,7 +976,7 @@ LDS Statistics: to stalls from non-dword aligned addresses per :ref:`normalization unit `. unit: Cycles per normalization unit Mem Violations: - rst: |- + rst: >- The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization unit `. This is unused and expected to be zero in most configurations for modern CDNA\u2122 accelerators. @@ -993,7 +993,7 @@ L1I Speed-of-Light: over the number of all L1I requests. unit: Percent L1I-L2 Bandwidth Utilization: - rst: |- + rst: >- The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth achieved. Calculated as the ratio of the total number of requests from the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles `. @@ -1100,7 +1100,7 @@ Scalar L1D cache accesses: unit: Requests per normalization unit Scalar L1D Cache - L2 Interface: sL1D-L2 BW: - rst: |- + rst: >- The total number of bytes read from, written to, or atomically updated across the sL1D\u2194:doc:`L2 ` interface, divided by total duration. Note that sL1D writes and atomics are typically @@ -1122,7 +1122,7 @@ Scalar L1D Cache - L2 Interface: CDNA accelerators. unit: Requests per normalization unit Stall Cycles: - rst: |- + rst: >- The total number of cycles the sL1D\u2194 :doc:`L2 ` interface was stalled, per :ref:`normalization unit `. unit: Cycles per normalization unit @@ -1432,7 +1432,7 @@ L1 Unified Translation Cache (UTCL1): translation not being present in the cache, per :ref:`normalization unit `. unit: unit Permission Misses: - rst: |- + rst: >- The total number of translation requests that missed in the UTCL1 due to a permission error, per :ref:`normalization unit `. This is unused and expected to be zero in most configurations for modern diff --git a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx90a_metrics_description.yaml b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx90a_metrics_description.yaml index 39e2a526646..828289ee98d 100644 --- a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx90a_metrics_description.yaml +++ b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx90a_metrics_description.yaml @@ -1,20 +1,20 @@ System Speed-of-Light: VALU FLOPs: - rst: |- + rst: >- The total floating-point operations executed per second on the :ref:`VALU `. This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this does not include any floating-point operations from :ref:`MFMA ` instructions. unit: GFLOPs VALU IOPs: - rst: |- + rst: >- The total integer operations executed per second on the :ref:`VALU `. 
This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations from :ref:`MFMA ` instructions. unit: GOIPs MFMA FLOPs (BF16): - rst: |- + rst: >- The total number of 16-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. This @@ -22,7 +22,7 @@ System Speed-of-Light: achievable on the specific accelerator. unit: GFLOPs MFMA FLOPs (F16): - rst: |- + rst: >- The total number of 16-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -30,7 +30,7 @@ System Speed-of-Light: specific accelerator. unit: GFLOPs MFMA FLOPs (F32): - rst: |- + rst: >- The total number of 32-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 32-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -38,7 +38,7 @@ System Speed-of-Light: specific accelerator. unit: GFLOPs MFMA FLOPs (F64): - rst: |- + rst: >- The total number of 64-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 64-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -46,7 +46,7 @@ System Speed-of-Light: specific accelerator. unit: GFLOPs MFMA IOPs (Int8): - rst: |- + rst: >- The total number of 8-bit integer :ref:`MFMA ` operations executed per second. Note: this does not include any 8-bit integer operations from :ref:`VALU ` instructions. This is also presented as a percent @@ -99,7 +99,7 @@ System Speed-of-Light: over the :ref:`total active CU cycles `. unit: Instructions per-cycle Wavefront Occupancy: - rst: |- + rst: >- The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). This is also presented as a percent of the peak theoretical @@ -144,7 +144,7 @@ System Speed-of-Light: peak theoretical bandwidth achievable on the specific accelerator. unit: GB/s L2-Fabric Read BW: - rst: |- + rst: >- The number of bytes read by the L2 over the :ref:`Infinity Fabric\u2122 interface ` per unit time. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. @@ -229,19 +229,19 @@ Memory Chart: rst: Total number of compute units (CUs) on the accelerator. unit: CUs VGPR: - rst: |- + rst: >- The number of architected vector general-purpose registers allocated for the kernel, see :ref:`VALU `. Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. unit: VGPRs SGPR: - rst: |- + rst: >- The number of scalar general-purpose registers allocated for the kernel, see :ref:`SALU `. Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. unit: SGPRs LDS Allocation: - rst: |- + rst: >- The number of bytes of :doc:`LDS ` memory (or, shared memory) allocated for this kernel. 
Note: This may also be larger than what was requested at compile time due to both allocation granularity and dynamic per-dispatch @@ -420,28 +420,28 @@ Memory Chart: unit: Requests per normalization unit Roofline Performance Rates: VALU FLOPs (F16): - rst: |- + rst: >- The total 16-bit floating-point operations executed per second on the :ref:`VALU `. This is presented with the value of the peak empirical F16 FLOPs achievable on the specific accelerator. Note: this does not include any F16 operations from :ref:`MFMA ` instructions. unit: GFLOPs VALU FLOPs (F32): - rst: |- + rst: >- The total 32-bit floating-point operations executed per second on the :ref:`VALU `. This is presented with the value of the peak empirical F32 FLOPs achievable on the specific accelerator. Note: this does not include any F32 operations from :ref:`MFMA ` instructions. unit: GFLOPs VALU FLOPs (F64): - rst: |- + rst: >- The total 64-bit floating-point operations executed per second on the :ref:`VALU `. This is presented with the value of the peak empirical F64 FLOPs achievable on the specific accelerator. Note: this does not include any F64 operations from :ref:`MFMA ` instructions. unit: GFLOPs MFMA FLOPs (F64): - rst: |- + rst: >- The total number of 64-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 64-bit floating point operations from :ref:`VALU ` instructions. The peak empirically @@ -449,7 +449,7 @@ Roofline Performance Rates: displayed alongside for comparison. unit: GFLOPs MFMA FLOPs (F32): - rst: |- + rst: >- The total number of 32-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 32-bit floating point operations from :ref:`VALU ` instructions. The peak empirically @@ -457,7 +457,7 @@ Roofline Performance Rates: displayed alongside for comparison. unit: GFLOPs MFMA FLOPs (F16): - rst: |- + rst: >- The total number of 16-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit floating point operations from :ref:`VALU ` instructions. The peak empirically @@ -465,7 +465,7 @@ Roofline Performance Rates: displayed alongside for comparison. unit: GFLOPs MFMA FLOPs (BF16): - rst: |- + rst: >- The total number of 16-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. The @@ -473,7 +473,7 @@ Roofline Performance Rates: accelerator is displayed alongside for comparison. unit: GFLOPs MFMA IOPs (Int8): - rst: |- + rst: >- The total number of 8-bit integer :ref:`MFMA ` operations executed per second. Note: this does not include any 8-bit integer operations from :ref:`VALU ` instructions. The peak empirically measured INT8 MFMA @@ -481,7 +481,7 @@ Roofline Performance Rates: for comparison. unit: GIOPs HBM Bandwidth: - rst: |- + rst: >- The total number of bytes read from and written to High-Bandwidth Memory (HBM) per second. The peak empirically measured bandwidth achievable on the specific accelerator is displayed alongside for comparison. @@ -512,28 +512,28 @@ Roofline Performance Rates: unit: GB/s Roofline Plot Points: AI HBM: - rst: |- + rst: >- The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM). It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between HBM and the L2 cache. This value is used as the x-coordinate for the HBM roofline. 
unit: FLOPs/Byte AI L2: - rst: |- + rst: >- The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between the L2 cache and the L1 cache. This value is used as the x-coordinate for the L2 roofline. unit: FLOPs/Byte AI L1: - rst: |- + rst: >- The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between the L1 cache and the processing units. This value is used as the x-coordinate for the L1 roofline. unit: FLOPs/Byte Performance (GFLOPs): - rst: |- + rst: >- The overall achieved performance, measured in GigaFLOPs per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point operations divided by the total execution time. This value is used as the y-coordinate @@ -591,7 +591,7 @@ Workgroup manager utilizations: any work. unit: Percent Scheduler-Pipe Utilization: - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where the scheduler-pipes were actively doing any work. Note: this value is expected to range between 0% and 25%. See :ref:`desc-spi`. @@ -629,7 +629,7 @@ Workgroup manager utilizations: unit: Cycles/wave Workgroup Manager - Resource Allocation: Not-scheduled Rate (Workgroup Manager): - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where a workgroup could not be scheduled to a :doc:`CU ` due to a bottleneck within the workgroup manager rather than a lack of a @@ -638,7 +638,7 @@ Workgroup Manager - Resource Allocation: description. unit: Percent Not-scheduled Rate (Scheduler-Pipe): - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where a workgroup could not be scheduled to a :doc:`CU ` due to a bottleneck within the scheduler-pipes rather than a lack of a CU @@ -647,7 +647,7 @@ Workgroup Manager - Resource Allocation: description. unit: Percent Scheduler-Pipe Stall Rate: - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where a workgroup could not be scheduled to a :doc:`CU ` due to occupancy limitations (like a lack of a CU or :ref:`SIMD ` @@ -711,7 +711,7 @@ Wavefront Launch Stats: block size. unit: Work-Items Total Wavefronts: - rst: |- + rst: >- The total number of wavefronts launched as part of the kernel dispatch. On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront size is always 64 work-items. Thus, the total number of wavefronts should @@ -726,25 +726,25 @@ Wavefront Launch Stats: `_. unit: Wavefronts VGPRs: - rst: |- + rst: >- The number of architected vector general-purpose registers allocated for the kernel, see :ref:`VALU `. Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. unit: VGPRs AGPRs: - rst: |- + rst: >- The number of accumulation vector general-purpose registers allocated for the kernel, see :ref:`AGPRs `. Note: this may not exactly match the number of AGPRs requested by the compiler due to allocation granularity. unit: AGPRs SGPRs: - rst: |- + rst: >- The number of scalar general-purpose registers allocated for the kernel, see :ref:`SALU `. Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. unit: SGPRs LDS Allocation: - rst: |- + rst: >- The number of bytes of :doc:`LDS ` memory (or, shared memory) allocated for this kernel. 
Note: This may also be larger than what was requested at compile time due to both allocation granularity and dynamic per-dispatch @@ -767,7 +767,7 @@ Wavefront Runtime Stats: This is averaged over all wavefronts in a kernel dispatch. unit: Instructions per wavefront Wave Cycles: - rst: |- + rst: >- The number of cycles a wavefront in the kernel dispatch spent resident on a compute unit per :ref:`normalization unit `. This is averaged over all wavefronts in a kernel dispatch. Note: this should not @@ -805,7 +805,7 @@ Wavefront Runtime Stats: the total Wave Cycles metric. unit: Cycles per normalization unit Wavefront Occupancy: - rst: |- + rst: >- The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). @@ -907,7 +907,7 @@ VALU Arithmetic Instruction Mix: unit `. unit: Instructions per normalization unit Conversion: - rst: |- + rst: >- The total number of type conversion instructions (such as converting data to or from F32\u2194F64) issued to the VALU per :ref:`normalization unit `. @@ -976,21 +976,21 @@ MFMA Arithmetic Instruction Mix: unit: Instructions per normalization unit Compute Speed-of-Light: VALU FLOPs: - rst: |- + rst: >- The total floating-point operations executed per second on the :ref:`VALU `. This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this does not include any floating-point operations from :ref:`MFMA ` instructions. unit: GFLOPs VALU IOPs: - rst: |- + rst: >- The total integer operations executed per second on the :ref:`VALU `. This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations from :ref:`MFMA ` instructions. unit: GIOPs MFMA FLOPs (BF16): - rst: |- + rst: >- The total number of 16-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. This is also @@ -998,7 +998,7 @@ Compute Speed-of-Light: on the specific accelerator. unit: GFLOPs MFMA FLOPs (F16): - rst: |- + rst: >- The total number of 16-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -1006,7 +1006,7 @@ Compute Speed-of-Light: specific accelerator. unit: GFLOPs MFMA FLOPs (F32): - rst: |- + rst: >- The total number of 32-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 32-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -1014,7 +1014,7 @@ Compute Speed-of-Light: specific accelerator. unit: GFLOPs MFMA FLOPs (F64): - rst: |- + rst: >- The total number of 64-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 64-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -1026,7 +1026,7 @@ Compute Speed-of-Light: achievable on the specific accelerator. unit: GFLOPs MFMA IOPs (INT8): - rst: |- + rst: >- The total number of 8-bit integer :ref:`MFMA ` operations executed per second. Note: this does not include any 8-bit integer operations from :ref:`VALU ` instructions. This is also presented as a percent @@ -1112,7 +1112,7 @@ Arithmetic Operations: unit `. 
unit: FLOP per normalization unit BF16 OPs: - rst: |- + rst: >- The total number of 16-bit brain floating-point operations executed on either the :ref:`VALU ` or :ref:`MFMA ` units, per :ref:`normalization unit `. Note: on current CDNA accelerators, the VALU @@ -1129,7 +1129,7 @@ Arithmetic Operations: unit `. unit: FLOP per normalization unit INT8 OPs: - rst: |- + rst: >- The total number of 8-bit integer operations executed on either the :ref:`VALU ` or :ref:`MFMA ` units, per :ref:`normalization unit `. Note: on current CDNA accelerators, the VALU has @@ -1206,7 +1206,7 @@ LDS Statistics: to stalls from non-dword aligned addresses per :ref:`normalization unit `. unit: Cycles per normalization unit Mem Violations: - rst: |- + rst: >- The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization unit `. This is unused and expected to be zero in most configurations for modern CDNA\u2122 accelerators. @@ -1223,7 +1223,7 @@ L1I Speed-of-Light: over the number of all L1I requests. unit: Percent L1I-L2 Bandwidth Utilization: - rst: |- + rst: >- The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth achieved. Calculated as the ratio of the total number of requests from the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles `. @@ -1330,7 +1330,7 @@ Scalar L1D cache accesses: unit: Requests per normalization unit Scalar L1D Cache - L2 Interface: sL1D-L2 BW: - rst: |- + rst: >- The total number of bytes read from, written to, or atomically updated across the sL1D\u2194:doc:`L2 ` interface, divided by total duration. Note that sL1D writes and atomics are typically @@ -1352,7 +1352,7 @@ Scalar L1D Cache - L2 Interface: CDNA accelerators. unit: Requests per normalization unit Stall Cycles: - rst: |- + rst: >- The total number of cycles the sL1D\u2194 :doc:`L2 ` interface was stalled, per :ref:`normalization unit `. unit: Cycles per normalization unit @@ -1676,7 +1676,7 @@ L1 Unified Translation Cache (UTCL1): translation not being present in the cache, per :ref:`normalization unit `. unit: unit Permission Misses: - rst: |- + rst: >- The total number of translation requests that missed in the UTCL1 due to a permission error, per :ref:`normalization unit `. This is unused and expected to be zero in most configurations for modern diff --git a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx940_metrics_description.yaml b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx940_metrics_description.yaml index 5a978ccee55..f208484a66a 100644 --- a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx940_metrics_description.yaml +++ b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx940_metrics_description.yaml @@ -1,20 +1,20 @@ System Speed-of-Light: VALU FLOPs: - rst: |- + rst: >- The total floating-point operations executed per second on the :ref:`VALU `. This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this does not include any floating-point operations from :ref:`MFMA ` instructions. unit: GFLOPs VALU IOPs: - rst: |- + rst: >- The total integer operations executed per second on the :ref:`VALU `. This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations from :ref:`MFMA ` instructions. unit: GOIPs MFMA FLOPs (F8): - rst: |- + rst: >- The total number of 8-bit brain floating point :ref:`MFMA ` operations executed per second. 
Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. This @@ -23,7 +23,7 @@ System Speed-of-Light: series and later only. unit: GFLOPs MFMA FLOPs (BF16): - rst: |- + rst: >- The total number of 16-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. This @@ -31,7 +31,7 @@ System Speed-of-Light: achievable on the specific accelerator. unit: GFLOPs MFMA FLOPs (F16): - rst: |- + rst: >- The total number of 16-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -39,7 +39,7 @@ System Speed-of-Light: specific accelerator. unit: GFLOPs MFMA FLOPs (F32): - rst: |- + rst: >- The total number of 32-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 32-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -47,7 +47,7 @@ System Speed-of-Light: specific accelerator. unit: GFLOPs MFMA FLOPs (F64): - rst: |- + rst: >- The total number of 64-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 64-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -55,7 +55,7 @@ System Speed-of-Light: specific accelerator. unit: GFLOPs MFMA IOPs (Int8): - rst: |- + rst: >- The total number of 8-bit integer :ref:`MFMA ` operations executed per second. Note: this does not include any 8-bit integer operations from :ref:`VALU ` instructions. This is also presented as a percent @@ -108,7 +108,7 @@ System Speed-of-Light: over the :ref:`total active CU cycles `. unit: Instructions per-cycle Wavefront Occupancy: - rst: |- + rst: >- The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). This is also presented as a percent of the peak theoretical @@ -153,7 +153,7 @@ System Speed-of-Light: peak theoretical bandwidth achievable on the specific accelerator. unit: GB/s L2-Fabric Read BW: - rst: |- + rst: >- The number of bytes read by the L2 over the :ref:`Infinity Fabric\u2122 interface ` per unit time. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. @@ -238,19 +238,19 @@ Memory Chart: rst: Total number of compute units (CUs) on the accelerator. unit: CUs VGPR: - rst: |- + rst: >- The number of architected vector general-purpose registers allocated for the kernel, see :ref:`VALU `. Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. unit: VGPRs SGPR: - rst: |- + rst: >- The number of scalar general-purpose registers allocated for the kernel, see :ref:`SALU `. Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. unit: SGPRs LDS Allocation: - rst: |- + rst: >- The number of bytes of :doc:`LDS ` memory (or, shared memory) allocated for this kernel. 
Note: This may also be larger than what was requested at compile time due to both allocation granularity and dynamic per-dispatch @@ -419,28 +419,28 @@ Memory Chart: unit: Requests per normalization unit Roofline Performance Rates: VALU FLOPs (F16): - rst: |- + rst: >- The total 16-bit floating-point operations executed per second on the :ref:`VALU `. This is presented with the value of the peak empirical F16 FLOPs achievable on the specific accelerator. Note: this does not include any F16 operations from :ref:`MFMA ` instructions. unit: GFLOPs VALU FLOPs (F32): - rst: |- + rst: >- The total 32-bit floating-point operations executed per second on the :ref:`VALU `. This is presented with the value of the peak empirical F32 FLOPs achievable on the specific accelerator. Note: this does not include any F32 operations from :ref:`MFMA ` instructions. unit: GFLOPs VALU FLOPs (F64): - rst: |- + rst: >- The total 64-bit floating-point operations executed per second on the :ref:`VALU `. This is presented with the value of the peak empirical F64 FLOPs achievable on the specific accelerator. Note: this does not include any F64 operations from :ref:`MFMA ` instructions. unit: GFLOPs MFMA FLOPs (F64): - rst: |- + rst: >- The total number of 64-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 64-bit floating point operations from :ref:`VALU ` instructions. The peak empirically @@ -448,7 +448,7 @@ Roofline Performance Rates: displayed alongside for comparison. unit: GFLOPs MFMA FLOPs (F32): - rst: |- + rst: >- The total number of 32-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 32-bit floating point operations from :ref:`VALU ` instructions. The peak empirically @@ -456,7 +456,7 @@ Roofline Performance Rates: displayed alongside for comparison. unit: GFLOPs MFMA FLOPs (F16): - rst: |- + rst: >- The total number of 16-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit floating point operations from :ref:`VALU ` instructions. The peak empirically @@ -464,7 +464,7 @@ Roofline Performance Rates: displayed alongside for comparison. unit: GFLOPs MFMA FLOPs (BF16): - rst: |- + rst: >- The total number of 16-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. The @@ -472,7 +472,7 @@ Roofline Performance Rates: accelerator is displayed alongside for comparison. unit: GFLOPs MFMA FLOPs (F8): - rst: |- + rst: >- The total number of 8-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. The @@ -481,7 +481,7 @@ Roofline Performance Rates: Instinct MI300 series and later only. unit: GFLOPs MFMA IOPs (Int8): - rst: |- + rst: >- The total number of 8-bit integer :ref:`MFMA ` operations executed per second. Note: this does not include any 8-bit integer operations from :ref:`VALU ` instructions. The peak empirically measured INT8 MFMA @@ -489,7 +489,7 @@ Roofline Performance Rates: for comparison. unit: GIOPs HBM Bandwidth: - rst: |- + rst: >- The total number of bytes read from and written to High-Bandwidth Memory (HBM) per second. The peak empirically measured bandwidth achievable on the specific accelerator is displayed alongside for comparison. 
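The roofline quantities described in the entries above combine in a simple way: arithmetic intensity (AI) is total FLOPs divided by total bytes moved at a given memory level, achieved performance is total FLOPs divided by execution time, and the attainable ceiling at a given AI is the lesser of the compute peak and AI times the peak memory bandwidth. A minimal Python sketch of that relationship (all names are illustrative, not rocprofiler-compute code):

    def roofline_point(total_flops, total_bytes, duration_s):
        # x-coordinate: arithmetic intensity in FLOPs/Byte at one memory level
        ai = total_flops / total_bytes
        # y-coordinate: achieved performance in GFLOP/s
        gflops = total_flops / duration_s / 1e9
        return ai, gflops

    def attainable_gflops(ai, peak_gflops, peak_bw_gbs):
        # The roofline itself: a bandwidth-bound slope capped by the compute
        # peak (FLOPs/Byte * GB/s = GFLOP/s, so the units work out directly).
        return min(peak_gflops, ai * peak_bw_gbs)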
@@ -520,28 +520,28 @@ Roofline Performance Rates: unit: GB/s Roofline Plot Points: AI HBM: - rst: |- + rst: >- The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM). It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between HBM and the L2 cache. This value is used as the x-coordinate for the HBM roofline. unit: FLOPs/Byte AI L2: - rst: |- + rst: >- The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between the L2 cache and the L1 cache. This value is used as the x-coordinate for the L2 roofline. unit: FLOPs/Byte AI L1: - rst: |- + rst: >- The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between the L1 cache and the processing units. This value is used as the x-coordinate for the L1 roofline. unit: FLOPs/Byte Performance (GFLOPs): - rst: |- + rst: >- The overall achieved performance, measured in GigaFLOPs per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point operations divided by the total execution time. This value is used as the y-coordinate @@ -599,7 +599,7 @@ Workgroup manager utilizations: any work. unit: Percent Scheduler-Pipe Utilization: - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where the scheduler-pipes were actively doing any work. Note: this value is expected to range between 0% and 25%. See :ref:`desc-spi`. @@ -637,7 +637,7 @@ Workgroup manager utilizations: unit: Cycles/wave Workgroup Manager - Resource Allocation: Not-scheduled Rate (Workgroup Manager): - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where a workgroup could not be scheduled to a :doc:`CU ` due to a bottleneck within the workgroup manager rather than a lack of a @@ -646,7 +646,7 @@ Workgroup Manager - Resource Allocation: description. unit: Percent Not-scheduled Rate (Scheduler-Pipe): - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where a workgroup could not be scheduled to a :doc:`CU ` due to a bottleneck within the scheduler-pipes rather than a lack of a CU @@ -655,7 +655,7 @@ Workgroup Manager - Resource Allocation: description. unit: Percent Scheduler-Pipe Stall Rate: - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where a workgroup could not be scheduled to a :doc:`CU ` due to occupancy limitations (like a lack of a CU or :ref:`SIMD ` @@ -719,7 +719,7 @@ Wavefront Launch Stats: block size. unit: Work-Items Total Wavefronts: - rst: |- + rst: >- The total number of wavefronts launched as part of the kernel dispatch. On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront size is always 64 work-items. Thus, the total number of wavefronts should @@ -734,25 +734,25 @@ Wavefront Launch Stats: `_. unit: Wavefronts VGPRs: - rst: |- + rst: >- The number of architected vector general-purpose registers allocated for the kernel, see :ref:`VALU `. Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. unit: VGPRs AGPRs: - rst: |- + rst: >- The number of accumulation vector general-purpose registers allocated for the kernel, see :ref:`AGPRs `. Note: this may not exactly match the number of AGPRs requested by the compiler due to allocation granularity. 
unit: AGPRs SGPRs: - rst: |- + rst: >- The number of scalar general-purpose registers allocated for the kernel, see :ref:`SALU `. Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. unit: SGPRs LDS Allocation: - rst: |- + rst: >- The number of bytes of :doc:`LDS ` memory (or, shared memory) allocated for this kernel. Note: This may also be larger than what was requested at compile time due to both allocation granularity and dynamic per-dispatch @@ -775,7 +775,7 @@ Wavefront Runtime Stats: This is averaged over all wavefronts in a kernel dispatch. unit: Instructions per wavefront Wave Cycles: - rst: |- + rst: >- The number of cycles a wavefront in the kernel dispatch spent resident on a compute unit per :ref:`normalization unit `. This is averaged over all wavefronts in a kernel dispatch. Note: this should not @@ -813,7 +813,7 @@ Wavefront Runtime Stats: the total Wave Cycles metric. unit: Cycles per normalization unit Wavefront Occupancy: - rst: |- + rst: >- The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). @@ -915,7 +915,7 @@ VALU Arithmetic Instruction Mix: unit `. unit: Instructions per normalization unit Conversion: - rst: |- + rst: >- The total number of type conversion instructions (such as converting data to or from F32\u2194F64) issued to the VALU per :ref:`normalization unit `. @@ -989,14 +989,14 @@ MFMA Arithmetic Instruction Mix: unit: Instructions per normalization unit Compute Speed-of-Light: VALU FLOPs: - rst: |- + rst: >- The total floating-point operations executed per second on the :ref:`VALU `. This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this does not include any floating-point operations from :ref:`MFMA ` instructions. unit: GFLOPs VALU IOPs: - rst: |- + rst: >- The total integer operations executed per second on the :ref:`VALU `. This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations @@ -1006,7 +1006,7 @@ Compute Speed-of-Light: rst: '' unit: Unknown MFMA FLOPs (BF16): - rst: |- + rst: >- The total number of 16-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. This is also @@ -1014,7 +1014,7 @@ Compute Speed-of-Light: on the specific accelerator. unit: GFLOPs MFMA FLOPs (F16): - rst: |- + rst: >- The total number of 16-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -1022,7 +1022,7 @@ Compute Speed-of-Light: specific accelerator. unit: GFLOPs MFMA FLOPs (F32): - rst: |- + rst: >- The total number of 32-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 32-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -1030,7 +1030,7 @@ Compute Speed-of-Light: specific accelerator. unit: GFLOPs MFMA FLOPs (F64): - rst: |- + rst: >- The total number of 64-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 64-bit floating point operations from :ref:`VALU ` instructions. 
This is also presented @@ -1042,7 +1042,7 @@ Compute Speed-of-Light: achievable on the specific accelerator. unit: GFLOPs MFMA IOPs (INT8): - rst: |- + rst: >- The total number of 8-bit integer :ref:`MFMA ` operations executed per second. Note: this does not include any 8-bit integer operations from :ref:`VALU ` instructions. This is also presented as a percent @@ -1131,7 +1131,7 @@ Arithmetic Operations: unit `. unit: FLOP per normalization unit BF16 OPs: - rst: |- + rst: >- The total number of 16-bit brain floating-point operations executed on either the :ref:`VALU ` or :ref:`MFMA ` units, per :ref:`normalization unit `. Note: on current CDNA accelerators, the VALU @@ -1148,7 +1148,7 @@ Arithmetic Operations: unit `. unit: FLOP per normalization unit INT8 OPs: - rst: |- + rst: >- The total number of 8-bit integer operations executed on either the :ref:`VALU ` or :ref:`MFMA ` units, per :ref:`normalization unit `. Note: on current CDNA accelerators, the VALU has @@ -1225,7 +1225,7 @@ LDS Statistics: to stalls from non-dword aligned addresses per :ref:`normalization unit `. unit: Cycles per normalization unit Mem Violations: - rst: |- + rst: >- The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization unit `. This is unused and expected to be zero in most configurations for modern CDNA\u2122 accelerators. @@ -1242,7 +1242,7 @@ L1I Speed-of-Light: over the number of all L1I requests. unit: Percent L1I-L2 Bandwidth Utilization: - rst: |- + rst: >- The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth achieved. Calculated as the ratio of the total number of requests from the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles `. @@ -1349,7 +1349,7 @@ Scalar L1D cache accesses: unit: Requests per normalization unit Scalar L1D Cache - L2 Interface: sL1D-L2 BW: - rst: |- + rst: >- The total number of bytes read from, written to, or atomically updated across the sL1D\u2194:doc:`L2 ` interface, divided by total duration. Note that sL1D writes and atomics are typically @@ -1371,7 +1371,7 @@ Scalar L1D Cache - L2 Interface: CDNA accelerators. unit: Requests per normalization unit Stall Cycles: - rst: |- + rst: >- The total number of cycles the sL1D\u2194 :doc:`L2 ` interface was stalled, per :ref:`normalization unit `. unit: Cycles per normalization unit @@ -1681,7 +1681,7 @@ L1 Unified Translation Cache (UTCL1): translation not being present in the cache, per :ref:`normalization unit `. unit: unit Permission Misses: - rst: |- + rst: >- The total number of translation requests that missed in the UTCL1 due to a permission error, per :ref:`normalization unit `. This is unused and expected to be zero in most configurations for modern diff --git a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx941_metrics_description.yaml b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx941_metrics_description.yaml index 5a978ccee55..f208484a66a 100644 --- a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx941_metrics_description.yaml +++ b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx941_metrics_description.yaml @@ -1,20 +1,20 @@ System Speed-of-Light: VALU FLOPs: - rst: |- + rst: >- The total floating-point operations executed per second on the :ref:`VALU `. This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this does not include any floating-point operations from :ref:`MFMA ` instructions. 
unit: GFLOPs VALU IOPs: - rst: |- + rst: >- The total integer operations executed per second on the :ref:`VALU `. This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations from :ref:`MFMA ` instructions. unit: GOIPs MFMA FLOPs (F8): - rst: |- + rst: >- The total number of 8-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. This @@ -23,7 +23,7 @@ System Speed-of-Light: series and later only. unit: GFLOPs MFMA FLOPs (BF16): - rst: |- + rst: >- The total number of 16-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. This @@ -31,7 +31,7 @@ System Speed-of-Light: achievable on the specific accelerator. unit: GFLOPs MFMA FLOPs (F16): - rst: |- + rst: >- The total number of 16-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -39,7 +39,7 @@ System Speed-of-Light: specific accelerator. unit: GFLOPs MFMA FLOPs (F32): - rst: |- + rst: >- The total number of 32-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 32-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -47,7 +47,7 @@ System Speed-of-Light: specific accelerator. unit: GFLOPs MFMA FLOPs (F64): - rst: |- + rst: >- The total number of 64-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 64-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -55,7 +55,7 @@ System Speed-of-Light: specific accelerator. unit: GFLOPs MFMA IOPs (Int8): - rst: |- + rst: >- The total number of 8-bit integer :ref:`MFMA ` operations executed per second. Note: this does not include any 8-bit integer operations from :ref:`VALU ` instructions. This is also presented as a percent @@ -108,7 +108,7 @@ System Speed-of-Light: over the :ref:`total active CU cycles `. unit: Instructions per-cycle Wavefront Occupancy: - rst: |- + rst: >- The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). This is also presented as a percent of the peak theoretical @@ -153,7 +153,7 @@ System Speed-of-Light: peak theoretical bandwidth achievable on the specific accelerator. unit: GB/s L2-Fabric Read BW: - rst: |- + rst: >- The number of bytes read by the L2 over the :ref:`Infinity Fabric\u2122 interface ` per unit time. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. @@ -238,19 +238,19 @@ Memory Chart: rst: Total number of compute units (CUs) on the accelerator. unit: CUs VGPR: - rst: |- + rst: >- The number of architected vector general-purpose registers allocated for the kernel, see :ref:`VALU `. Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. unit: VGPRs SGPR: - rst: |- + rst: >- The number of scalar general-purpose registers allocated for the kernel, see :ref:`SALU `. Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. 
unit: SGPRs LDS Allocation: - rst: |- + rst: >- The number of bytes of :doc:`LDS ` memory (or, shared memory) allocated for this kernel. Note: This may also be larger than what was requested at compile time due to both allocation granularity and dynamic per-dispatch @@ -419,28 +419,28 @@ Memory Chart: unit: Requests per normalization unit Roofline Performance Rates: VALU FLOPs (F16): - rst: |- + rst: >- The total 16-bit floating-point operations executed per second on the :ref:`VALU `. This is presented with the value of the peak empirical F16 FLOPs achievable on the specific accelerator. Note: this does not include any F16 operations from :ref:`MFMA ` instructions. unit: GFLOPs VALU FLOPs (F32): - rst: |- + rst: >- The total 32-bit floating-point operations executed per second on the :ref:`VALU `. This is presented with the value of the peak empirical F32 FLOPs achievable on the specific accelerator. Note: this does not include any F32 operations from :ref:`MFMA ` instructions. unit: GFLOPs VALU FLOPs (F64): - rst: |- + rst: >- The total 64-bit floating-point operations executed per second on the :ref:`VALU `. This is presented with the value of the peak empirical F64 FLOPs achievable on the specific accelerator. Note: this does not include any F64 operations from :ref:`MFMA ` instructions. unit: GFLOPs MFMA FLOPs (F64): - rst: |- + rst: >- The total number of 64-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 64-bit floating point operations from :ref:`VALU ` instructions. The peak empirically @@ -448,7 +448,7 @@ Roofline Performance Rates: displayed alongside for comparison. unit: GFLOPs MFMA FLOPs (F32): - rst: |- + rst: >- The total number of 32-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 32-bit floating point operations from :ref:`VALU ` instructions. The peak empirically @@ -456,7 +456,7 @@ Roofline Performance Rates: displayed alongside for comparison. unit: GFLOPs MFMA FLOPs (F16): - rst: |- + rst: >- The total number of 16-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit floating point operations from :ref:`VALU ` instructions. The peak empirically @@ -464,7 +464,7 @@ Roofline Performance Rates: displayed alongside for comparison. unit: GFLOPs MFMA FLOPs (BF16): - rst: |- + rst: >- The total number of 16-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. The @@ -472,7 +472,7 @@ Roofline Performance Rates: accelerator is displayed alongside for comparison. unit: GFLOPs MFMA FLOPs (F8): - rst: |- + rst: >- The total number of 8-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. The @@ -481,7 +481,7 @@ Roofline Performance Rates: Instinct MI300 series and later only. unit: GFLOPs MFMA IOPs (Int8): - rst: |- + rst: >- The total number of 8-bit integer :ref:`MFMA ` operations executed per second. Note: this does not include any 8-bit integer operations from :ref:`VALU ` instructions. The peak empirically measured INT8 MFMA @@ -489,7 +489,7 @@ Roofline Performance Rates: for comparison. unit: GIOPs HBM Bandwidth: - rst: |- + rst: >- The total number of bytes read from and written to High-Bandwidth Memory (HBM) per second. 
The peak empirically measured bandwidth achievable on the specific accelerator is displayed alongside for comparison. @@ -520,28 +520,28 @@ Roofline Performance Rates: unit: GB/s Roofline Plot Points: AI HBM: - rst: |- + rst: >- The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM). It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between HBM and the L2 cache. This value is used as the x-coordinate for the HBM roofline. unit: FLOPs/Byte AI L2: - rst: |- + rst: >- The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between the L2 cache and the L1 cache. This value is used as the x-coordinate for the L2 roofline. unit: FLOPs/Byte AI L1: - rst: |- + rst: >- The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between the L1 cache and the processing units. This value is used as the x-coordinate for the L1 roofline. unit: FLOPs/Byte Performance (GFLOPs): - rst: |- + rst: >- The overall achieved performance, measured in GigaFLOPs per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point operations divided by the total execution time. This value is used as the y-coordinate @@ -599,7 +599,7 @@ Workgroup manager utilizations: any work. unit: Percent Scheduler-Pipe Utilization: - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where the scheduler-pipes were actively doing any work. Note: this value is expected to range between 0% and 25%. See :ref:`desc-spi`. @@ -637,7 +637,7 @@ Workgroup manager utilizations: unit: Cycles/wave Workgroup Manager - Resource Allocation: Not-scheduled Rate (Workgroup Manager): - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where a workgroup could not be scheduled to a :doc:`CU ` due to a bottleneck within the workgroup manager rather than a lack of a @@ -646,7 +646,7 @@ Workgroup Manager - Resource Allocation: description. unit: Percent Not-scheduled Rate (Scheduler-Pipe): - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where a workgroup could not be scheduled to a :doc:`CU ` due to a bottleneck within the scheduler-pipes rather than a lack of a CU @@ -655,7 +655,7 @@ Workgroup Manager - Resource Allocation: description. unit: Percent Scheduler-Pipe Stall Rate: - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where a workgroup could not be scheduled to a :doc:`CU ` due to occupancy limitations (like a lack of a CU or :ref:`SIMD ` @@ -719,7 +719,7 @@ Wavefront Launch Stats: block size. unit: Work-Items Total Wavefronts: - rst: |- + rst: >- The total number of wavefronts launched as part of the kernel dispatch. On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront size is always 64 work-items. Thus, the total number of wavefronts should @@ -734,25 +734,25 @@ Wavefront Launch Stats: `_. unit: Wavefronts VGPRs: - rst: |- + rst: >- The number of architected vector general-purpose registers allocated for the kernel, see :ref:`VALU `. Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. unit: VGPRs AGPRs: - rst: |- + rst: >- The number of accumulation vector general-purpose registers allocated for the kernel, see :ref:`AGPRs `. 
Note: this may not exactly match the number of AGPRs requested by the compiler due to allocation granularity. unit: AGPRs SGPRs: - rst: |- + rst: >- The number of scalar general-purpose registers allocated for the kernel, see :ref:`SALU `. Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. unit: SGPRs LDS Allocation: - rst: |- + rst: >- The number of bytes of :doc:`LDS ` memory (or, shared memory) allocated for this kernel. Note: This may also be larger than what was requested at compile time due to both allocation granularity and dynamic per-dispatch @@ -775,7 +775,7 @@ Wavefront Runtime Stats: This is averaged over all wavefronts in a kernel dispatch. unit: Instructions per wavefront Wave Cycles: - rst: |- + rst: >- The number of cycles a wavefront in the kernel dispatch spent resident on a compute unit per :ref:`normalization unit `. This is averaged over all wavefronts in a kernel dispatch. Note: this should not @@ -813,7 +813,7 @@ Wavefront Runtime Stats: the total Wave Cycles metric. unit: Cycles per normalization unit Wavefront Occupancy: - rst: |- + rst: >- The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). @@ -915,7 +915,7 @@ VALU Arithmetic Instruction Mix: unit `. unit: Instructions per normalization unit Conversion: - rst: |- + rst: >- The total number of type conversion instructions (such as converting data to or from F32\u2194F64) issued to the VALU per :ref:`normalization unit `. @@ -989,14 +989,14 @@ MFMA Arithmetic Instruction Mix: unit: Instructions per normalization unit Compute Speed-of-Light: VALU FLOPs: - rst: |- + rst: >- The total floating-point operations executed per second on the :ref:`VALU `. This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this does not include any floating-point operations from :ref:`MFMA ` instructions. unit: GFLOPs VALU IOPs: - rst: |- + rst: >- The total integer operations executed per second on the :ref:`VALU `. This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations @@ -1006,7 +1006,7 @@ Compute Speed-of-Light: rst: '' unit: Unknown MFMA FLOPs (BF16): - rst: |- + rst: >- The total number of 16-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. This is also @@ -1014,7 +1014,7 @@ Compute Speed-of-Light: on the specific accelerator. unit: GFLOPs MFMA FLOPs (F16): - rst: |- + rst: >- The total number of 16-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -1022,7 +1022,7 @@ Compute Speed-of-Light: specific accelerator. unit: GFLOPs MFMA FLOPs (F32): - rst: |- + rst: >- The total number of 32-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 32-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -1030,7 +1030,7 @@ Compute Speed-of-Light: specific accelerator. unit: GFLOPs MFMA FLOPs (F64): - rst: |- + rst: >- The total number of 64-bit floating point :ref:`MFMA ` operations executed per second. 
Note: this does not include any 64-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -1042,7 +1042,7 @@ Compute Speed-of-Light: achievable on the specific accelerator. unit: GFLOPs MFMA IOPs (INT8): - rst: |- + rst: >- The total number of 8-bit integer :ref:`MFMA ` operations executed per second. Note: this does not include any 8-bit integer operations from :ref:`VALU ` instructions. This is also presented as a percent @@ -1131,7 +1131,7 @@ Arithmetic Operations: unit `. unit: FLOP per normalization unit BF16 OPs: - rst: |- + rst: >- The total number of 16-bit brain floating-point operations executed on either the :ref:`VALU ` or :ref:`MFMA ` units, per :ref:`normalization unit `. Note: on current CDNA accelerators, the VALU @@ -1148,7 +1148,7 @@ Arithmetic Operations: unit `. unit: FLOP per normalization unit INT8 OPs: - rst: |- + rst: >- The total number of 8-bit integer operations executed on either the :ref:`VALU ` or :ref:`MFMA ` units, per :ref:`normalization unit `. Note: on current CDNA accelerators, the VALU has @@ -1225,7 +1225,7 @@ LDS Statistics: to stalls from non-dword aligned addresses per :ref:`normalization unit `. unit: Cycles per normalization unit Mem Violations: - rst: |- + rst: >- The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization unit `. This is unused and expected to be zero in most configurations for modern CDNA\u2122 accelerators. @@ -1242,7 +1242,7 @@ L1I Speed-of-Light: over the number of all L1I requests. unit: Percent L1I-L2 Bandwidth Utilization: - rst: |- + rst: >- The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth achieved. Calculated as the ratio of the total number of requests from the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles `. @@ -1349,7 +1349,7 @@ Scalar L1D cache accesses: unit: Requests per normalization unit Scalar L1D Cache - L2 Interface: sL1D-L2 BW: - rst: |- + rst: >- The total number of bytes read from, written to, or atomically updated across the sL1D\u2194:doc:`L2 ` interface, divided by total duration. Note that sL1D writes and atomics are typically @@ -1371,7 +1371,7 @@ Scalar L1D Cache - L2 Interface: CDNA accelerators. unit: Requests per normalization unit Stall Cycles: - rst: |- + rst: >- The total number of cycles the sL1D\u2194 :doc:`L2 ` interface was stalled, per :ref:`normalization unit `. unit: Cycles per normalization unit @@ -1681,7 +1681,7 @@ L1 Unified Translation Cache (UTCL1): translation not being present in the cache, per :ref:`normalization unit `. unit: unit Permission Misses: - rst: |- + rst: >- The total number of translation requests that missed in the UTCL1 due to a permission error, per :ref:`normalization unit `. This is unused and expected to be zero in most configurations for modern diff --git a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx942_metrics_description.yaml b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx942_metrics_description.yaml index 4d27ec667aa..be9b4719e06 100644 --- a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx942_metrics_description.yaml +++ b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx942_metrics_description.yaml @@ -1,20 +1,20 @@ System Speed-of-Light: VALU FLOPs: - rst: |- + rst: >- The total floating-point operations executed per second on the :ref:`VALU `. This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. 
Note: this does not include any floating-point operations from :ref:`MFMA ` instructions. unit: GFLOPs VALU IOPs: - rst: |- + rst: >- The total integer operations executed per second on the :ref:`VALU `. This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations from :ref:`MFMA ` instructions. unit: GOIPs MFMA FLOPs (F8): - rst: |- + rst: >- The total number of 8-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. This @@ -23,7 +23,7 @@ System Speed-of-Light: series and later only. unit: GFLOPs MFMA FLOPs (BF16): - rst: |- + rst: >- The total number of 16-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. This @@ -31,7 +31,7 @@ System Speed-of-Light: achievable on the specific accelerator. unit: GFLOPs MFMA FLOPs (F16): - rst: |- + rst: >- The total number of 16-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -39,7 +39,7 @@ System Speed-of-Light: specific accelerator. unit: GFLOPs MFMA FLOPs (F32): - rst: |- + rst: >- The total number of 32-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 32-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -47,7 +47,7 @@ System Speed-of-Light: specific accelerator. unit: GFLOPs MFMA FLOPs (F64): - rst: |- + rst: >- The total number of 64-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 64-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -55,7 +55,7 @@ System Speed-of-Light: specific accelerator. unit: GFLOPs MFMA IOPs (Int8): - rst: |- + rst: >- The total number of 8-bit integer :ref:`MFMA ` operations executed per second. Note: this does not include any 8-bit integer operations from :ref:`VALU ` instructions. This is also presented as a percent @@ -108,7 +108,7 @@ System Speed-of-Light: over the :ref:`total active CU cycles `. unit: Instructions per-cycle Wavefront Occupancy: - rst: |- + rst: >- The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). This is also presented as a percent of the peak theoretical @@ -153,7 +153,7 @@ System Speed-of-Light: peak theoretical bandwidth achievable on the specific accelerator. unit: GB/s L2-Fabric Read BW: - rst: |- + rst: >- The number of bytes read by the L2 over the :ref:`Infinity Fabric\u2122 interface ` per unit time. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. @@ -238,19 +238,19 @@ Memory Chart: rst: Total number of compute units (CUs) on the accelerator. unit: CUs VGPR: - rst: |- + rst: >- The number of architected vector general-purpose registers allocated for the kernel, see :ref:`VALU `. Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. unit: VGPRs SGPR: - rst: |- + rst: >- The number of scalar general-purpose registers allocated for the kernel, see :ref:`SALU `. 
Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. unit: SGPRs LDS Allocation: - rst: |- + rst: >- The number of bytes of :doc:`LDS ` memory (or, shared memory) allocated for this kernel. Note: This may also be larger than what was requested at compile time due to both allocation granularity and dynamic per-dispatch @@ -419,28 +419,28 @@ Memory Chart: unit: Requests per normalization unit Roofline Performance Rates: VALU FLOPs (F16): - rst: |- + rst: >- The total 16-bit floating-point operations executed per second on the :ref:`VALU `. This is presented with the value of the peak empirical F16 FLOPs achievable on the specific accelerator. Note: this does not include any F16 operations from :ref:`MFMA ` instructions. unit: GFLOPs VALU FLOPs (F32): - rst: |- + rst: >- The total 32-bit floating-point operations executed per second on the :ref:`VALU `. This is presented with the value of the peak empirical F32 FLOPs achievable on the specific accelerator. Note: this does not include any F32 operations from :ref:`MFMA ` instructions. unit: GFLOPs VALU FLOPs (F64): - rst: |- + rst: >- The total 64-bit floating-point operations executed per second on the :ref:`VALU `. This is presented with the value of the peak empirical F64 FLOPs achievable on the specific accelerator. Note: this does not include any F64 operations from :ref:`MFMA ` instructions. unit: GFLOPs MFMA FLOPs (F64): - rst: |- + rst: >- The total number of 64-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 64-bit floating point operations from :ref:`VALU ` instructions. The peak empirically @@ -448,7 +448,7 @@ Roofline Performance Rates: displayed alongside for comparison. unit: GFLOPs MFMA FLOPs (F32): - rst: |- + rst: >- The total number of 32-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 32-bit floating point operations from :ref:`VALU ` instructions. The peak empirically @@ -456,7 +456,7 @@ Roofline Performance Rates: displayed alongside for comparison. unit: GFLOPs MFMA FLOPs (F16): - rst: |- + rst: >- The total number of 16-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit floating point operations from :ref:`VALU ` instructions. The peak empirically @@ -464,7 +464,7 @@ Roofline Performance Rates: displayed alongside for comparison. unit: GFLOPs MFMA FLOPs (BF16): - rst: |- + rst: >- The total number of 16-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. The @@ -472,7 +472,7 @@ Roofline Performance Rates: accelerator is displayed alongside for comparison. unit: GFLOPs MFMA FLOPs (F8): - rst: |- + rst: >- The total number of 8-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. The @@ -481,7 +481,7 @@ Roofline Performance Rates: Instinct MI300 series and later only. unit: GFLOPs MFMA IOPs (Int8): - rst: |- + rst: >- The total number of 8-bit integer :ref:`MFMA ` operations executed per second. Note: this does not include any 8-bit integer operations from :ref:`VALU ` instructions. The peak empirically measured INT8 MFMA @@ -489,7 +489,7 @@ Roofline Performance Rates: for comparison. 
unit: GIOPs HBM Bandwidth: - rst: |- + rst: >- The total number of bytes read from and written to High-Bandwidth Memory (HBM) per second. The peak empirically measured bandwidth achievable on the specific accelerator is displayed alongside for comparison. @@ -520,28 +520,28 @@ Roofline Performance Rates: unit: GB/s Roofline Plot Points: AI HBM: - rst: |- + rst: >- The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM). It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between HBM and the L2 cache. This value is used as the x-coordinate for the HBM roofline. unit: FLOPs/Byte AI L2: - rst: |- + rst: >- The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between the L2 cache and the L1 cache. This value is used as the x-coordinate for the L2 roofline. unit: FLOPs/Byte AI L1: - rst: |- + rst: >- The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between the L1 cache and the processing units. This value is used as the x-coordinate for the L1 roofline. unit: FLOPs/Byte Performance (GFLOPs): - rst: |- + rst: >- The overall achieved performance, measured in GigaFLOPs per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point operations divided by the total execution time. This value is used as the y-coordinate @@ -599,7 +599,7 @@ Workgroup manager utilizations: any work. unit: Percent Scheduler-Pipe Utilization: - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where the scheduler-pipes were actively doing any work. Note: this value is expected to range between 0% and 25%. See :ref:`desc-spi`. @@ -637,7 +637,7 @@ Workgroup manager utilizations: unit: Cycles/wave Workgroup Manager - Resource Allocation: Not-scheduled Rate (Workgroup Manager): - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where a workgroup could not be scheduled to a :doc:`CU ` due to a bottleneck within the workgroup manager rather than a lack of a @@ -646,7 +646,7 @@ Workgroup Manager - Resource Allocation: description. unit: Percent Not-scheduled Rate (Scheduler-Pipe): - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where a workgroup could not be scheduled to a :doc:`CU ` due to a bottleneck within the scheduler-pipes rather than a lack of a CU @@ -655,7 +655,7 @@ Workgroup Manager - Resource Allocation: description. unit: Percent Scheduler-Pipe Stall Rate: - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where a workgroup could not be scheduled to a :doc:`CU ` due to occupancy limitations (like a lack of a CU or :ref:`SIMD ` @@ -719,7 +719,7 @@ Wavefront Launch Stats: block size. unit: Work-Items Total Wavefronts: - rst: |- + rst: >- The total number of wavefronts launched as part of the kernel dispatch. On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront size is always 64 work-items. Thus, the total number of wavefronts should @@ -734,25 +734,25 @@ Wavefront Launch Stats: `_. unit: Wavefronts VGPRs: - rst: |- + rst: >- The number of architected vector general-purpose registers allocated for the kernel, see :ref:`VALU `. Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. 
unit: VGPRs AGPRs: - rst: |- + rst: >- The number of accumulation vector general-purpose registers allocated for the kernel, see :ref:`AGPRs `. Note: this may not exactly match the number of AGPRs requested by the compiler due to allocation granularity. unit: AGPRs SGPRs: - rst: |- + rst: >- The number of scalar general-purpose registers allocated for the kernel, see :ref:`SALU `. Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. unit: SGPRs LDS Allocation: - rst: |- + rst: >- The number of bytes of :doc:`LDS ` memory (or, shared memory) allocated for this kernel. Note: This may also be larger than what was requested at compile time due to both allocation granularity and dynamic per-dispatch @@ -775,7 +775,7 @@ Wavefront Runtime Stats: This is averaged over all wavefronts in a kernel dispatch. unit: Instructions per wavefront Wave Cycles: - rst: |- + rst: >- The number of cycles a wavefront in the kernel dispatch spent resident on a compute unit per :ref:`normalization unit `. This is averaged over all wavefronts in a kernel dispatch. Note: this should not @@ -813,7 +813,7 @@ Wavefront Runtime Stats: the total Wave Cycles metric. unit: Cycles per normalization unit Wavefront Occupancy: - rst: |- + rst: >- The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). @@ -915,7 +915,7 @@ VALU Arithmetic Instruction Mix: unit `. unit: Instructions per normalization unit Conversion: - rst: |- + rst: >- The total number of type conversion instructions (such as converting data to or from F32\u2194F64) issued to the VALU per :ref:`normalization unit `. @@ -989,14 +989,14 @@ MFMA Arithmetic Instruction Mix: unit: Instructions per normalization unit Compute Speed-of-Light: VALU FLOPs: - rst: |- + rst: >- The total floating-point operations executed per second on the :ref:`VALU `. This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this does not include any floating-point operations from :ref:`MFMA ` instructions. unit: GFLOPs VALU IOPs: - rst: |- + rst: >- The total integer operations executed per second on the :ref:`VALU `. This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations @@ -1006,7 +1006,7 @@ Compute Speed-of-Light: rst: '' unit: Unknown MFMA FLOPs (BF16): - rst: |- + rst: >- The total number of 16-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. This is also @@ -1014,7 +1014,7 @@ Compute Speed-of-Light: on the specific accelerator. unit: GFLOPs MFMA FLOPs (F16): - rst: |- + rst: >- The total number of 16-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -1022,7 +1022,7 @@ Compute Speed-of-Light: specific accelerator. unit: GFLOPs MFMA FLOPs (F32): - rst: |- + rst: >- The total number of 32-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 32-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -1030,7 +1030,7 @@ Compute Speed-of-Light: specific accelerator. 
unit: GFLOPs MFMA FLOPs (F64): - rst: |- + rst: >- The total number of 64-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 64-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -1042,7 +1042,7 @@ Compute Speed-of-Light: achievable on the specific accelerator. unit: GFLOPs MFMA IOPs (INT8): - rst: |- + rst: >- The total number of 8-bit integer :ref:`MFMA ` operations executed per second. Note: this does not include any 8-bit integer operations from :ref:`VALU ` instructions. This is also presented as a percent @@ -1131,7 +1131,7 @@ Arithmetic Operations: unit `. unit: FLOP per normalization unit BF16 OPs: - rst: |- + rst: >- The total number of 16-bit brain floating-point operations executed on either the :ref:`VALU ` or :ref:`MFMA ` units, per :ref:`normalization unit `. Note: on current CDNA accelerators, the VALU @@ -1148,7 +1148,7 @@ Arithmetic Operations: unit `. unit: FLOP per normalization unit INT8 OPs: - rst: |- + rst: >- The total number of 8-bit integer operations executed on either the :ref:`VALU ` or :ref:`MFMA ` units, per :ref:`normalization unit `. Note: on current CDNA accelerators, the VALU has @@ -1225,7 +1225,7 @@ LDS Statistics: to stalls from non-dword aligned addresses per :ref:`normalization unit `. unit: Cycles per normalization unit Mem Violations: - rst: |- + rst: >- The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization unit `. This is unused and expected to be zero in most configurations for modern CDNA\u2122 accelerators. @@ -1242,7 +1242,7 @@ L1I Speed-of-Light: over the number of all L1I requests. unit: Percent L1I-L2 Bandwidth Utilization: - rst: |- + rst: >- The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth achieved. Calculated as the ratio of the total number of requests from the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles `. @@ -1349,7 +1349,7 @@ Scalar L1D cache accesses: unit: Requests per normalization unit Scalar L1D Cache - L2 Interface: sL1D-L2 BW: - rst: |- + rst: >- The total number of bytes read from, written to, or atomically updated across the sL1D\u2194:doc:`L2 ` interface, divided by total duration. Note that sL1D writes and atomics are typically @@ -1371,7 +1371,7 @@ Scalar L1D Cache - L2 Interface: CDNA accelerators. unit: Requests per normalization unit Stall Cycles: - rst: |- + rst: >- The total number of cycles the sL1D\u2194 :doc:`L2 ` interface was stalled, per :ref:`normalization unit `. unit: Cycles per normalization unit @@ -1681,7 +1681,7 @@ L1 Unified Translation Cache (UTCL1): translation not being present in the cache, per :ref:`normalization unit `. unit: unit Permission Misses: - rst: |- + rst: >- The total number of translation requests that missed in the UTCL1 due to a permission error, per :ref:`normalization unit `. 
This is unused and expected to be zero in most configurations for modern diff --git a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx950_metrics_description.yaml b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx950_metrics_description.yaml index d0a1898da58..f2fb71f054e 100644 --- a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx950_metrics_description.yaml +++ b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx950_metrics_description.yaml @@ -1,20 +1,20 @@ System Speed-of-Light: VALU FLOPs: - rst: |- + rst: >- The total floating-point operations executed per second on the :ref:`VALU `. This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this does not include any floating-point operations from :ref:`MFMA ` instructions. unit: GFLOPs VALU IOPs: - rst: |- + rst: >- The total integer operations executed per second on the :ref:`VALU `. This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations from :ref:`MFMA ` instructions. unit: GOIPs MFMA FLOPs (F8): - rst: |- + rst: >- The total number of 8-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. This @@ -23,7 +23,7 @@ System Speed-of-Light: series and later only. unit: GFLOPs MFMA FLOPs (BF16): - rst: |- + rst: >- The total number of 16-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. This @@ -31,7 +31,7 @@ System Speed-of-Light: achievable on the specific accelerator. unit: GFLOPs MFMA FLOPs (F16): - rst: |- + rst: >- The total number of 16-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -39,7 +39,7 @@ System Speed-of-Light: specific accelerator. unit: GFLOPs MFMA FLOPs (F32): - rst: |- + rst: >- The total number of 32-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 32-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -47,7 +47,7 @@ System Speed-of-Light: specific accelerator. unit: GFLOPs MFMA FLOPs (F64): - rst: |- + rst: >- The total number of 64-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 64-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -58,7 +58,7 @@ System Speed-of-Light: rst: '' unit: Unknown MFMA IOPs (Int8): - rst: |- + rst: >- The total number of 8-bit integer :ref:`MFMA ` operations executed per second. Note: this does not include any 8-bit integer operations from :ref:`VALU ` instructions. This is also presented as a percent @@ -111,7 +111,7 @@ System Speed-of-Light: over the :ref:`total active CU cycles `. unit: Instructions per-cycle Wavefront Occupancy: - rst: |- + rst: >- The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). This is also presented as a percent of the peak theoretical @@ -156,7 +156,7 @@ System Speed-of-Light: peak theoretical bandwidth achievable on the specific accelerator. 
unit: GB/s L2-Fabric Read BW: - rst: |- + rst: >- The number of bytes read by the L2 over the :ref:`Infinity Fabric\u2122 interface ` per unit time. This is also presented as a percent of the peak theoretical bandwidth achievable on the specific accelerator. @@ -241,19 +241,19 @@ Memory Chart: rst: Total number of compute units (CUs) on the accelerator. unit: CUs VGPR: - rst: |- + rst: >- The number of architected vector general-purpose registers allocated for the kernel, see :ref:`VALU `. Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. unit: VGPRs SGPR: - rst: |- + rst: >- The number of scalar general-purpose registers allocated for the kernel, see :ref:`SALU `. Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. unit: SGPRs LDS Allocation: - rst: |- + rst: >- The number of bytes of :doc:`LDS ` memory (or, shared memory) allocated for this kernel. Note: This may also be larger than what was requested at compile time due to both allocation granularity and dynamic per-dispatch @@ -432,28 +432,28 @@ Memory Chart: unit: Requests per normalization unit Roofline Performance Rates: VALU FLOPs (F16): - rst: |- + rst: >- The total 16-bit floating-point operations executed per second on the :ref:`VALU `. This is presented with the value of the peak empirical F16 FLOPs achievable on the specific accelerator. Note: this does not include any F16 operations from :ref:`MFMA ` instructions. unit: GFLOPs VALU FLOPs (F32): - rst: |- + rst: >- The total 32-bit floating-point operations executed per second on the :ref:`VALU `. This is presented with the value of the peak empirical F32 FLOPs achievable on the specific accelerator. Note: this does not include any F32 operations from :ref:`MFMA ` instructions. unit: GFLOPs VALU FLOPs (F64): - rst: |- + rst: >- The total 64-bit floating-point operations executed per second on the :ref:`VALU `. This is presented with the value of the peak empirical F64 FLOPs achievable on the specific accelerator. Note: this does not include any F64 operations from :ref:`MFMA ` instructions. unit: GFLOPs MFMA FLOPs (F64): - rst: |- + rst: >- The total number of 64-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 64-bit floating point operations from :ref:`VALU ` instructions. The peak empirically @@ -461,7 +461,7 @@ Roofline Performance Rates: displayed alongside for comparison. unit: GFLOPs MFMA FLOPs (F32): - rst: |- + rst: >- The total number of 32-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 32-bit floating point operations from :ref:`VALU ` instructions. The peak empirically @@ -469,7 +469,7 @@ Roofline Performance Rates: displayed alongside for comparison. unit: GFLOPs MFMA FLOPs (F16): - rst: |- + rst: >- The total number of 16-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit floating point operations from :ref:`VALU ` instructions. The peak empirically @@ -477,7 +477,7 @@ Roofline Performance Rates: displayed alongside for comparison. unit: GFLOPs MFMA FLOPs (BF16): - rst: |- + rst: >- The total number of 16-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. The @@ -485,7 +485,7 @@ Roofline Performance Rates: accelerator is displayed alongside for comparison. 
unit: GFLOPs MFMA FLOPs (F8): - rst: |- + rst: >- The total number of 8-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. The @@ -494,7 +494,7 @@ Roofline Performance Rates: Instinct MI300 series and later only. unit: GFLOPs MFMA FLOPs (F6F4): - rst: |- + rst: >- The total number of 4-bit and 6-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any floating point operations from :ref:`VALU ` instructions. The peak empirically @@ -503,7 +503,7 @@ Roofline Performance Rates: series (gfx950) and later only. unit: GFLOPs MFMA IOPs (Int8): - rst: |- + rst: >- The total number of 8-bit integer :ref:`MFMA ` operations executed per second. Note: this does not include any 8-bit integer operations from :ref:`VALU ` instructions. The peak empirically measured INT8 MFMA @@ -511,7 +511,7 @@ Roofline Performance Rates: for comparison. unit: GIOPs HBM Bandwidth: - rst: |- + rst: >- The total number of bytes read from and written to High-Bandwidth Memory (HBM) per second. The peak empirically measured bandwidth achievable on the specific accelerator is displayed alongside for comparison. @@ -542,28 +542,28 @@ Roofline Performance Rates: unit: GB/s Roofline Plot Points: AI HBM: - rst: |- + rst: >- The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM). It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between HBM and the L2 cache. This value is used as the x-coordinate for the HBM roofline. unit: FLOPs/Byte AI L2: - rst: |- + rst: >- The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between the L2 cache and the L1 cache. This value is used as the x-coordinate for the L2 roofline. unit: FLOPs/Byte AI L1: - rst: |- + rst: >- The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio of total floating-point operations (FLOPs) to total bytes transferred between the L1 cache and the processing units. This value is used as the x-coordinate for the L1 roofline. unit: FLOPs/Byte Performance (GFLOPs): - rst: |- + rst: >- The overall achieved performance, measured in GigaFLOPs per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point operations divided by the total execution time. This value is used as the y-coordinate @@ -633,7 +633,7 @@ Workgroup manager utilizations: any work. unit: Percent Scheduler-Pipe Utilization: - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where the scheduler-pipes were actively doing any work. Note: this value is expected to range between 0% and 25%. See :ref:`desc-spi`. @@ -674,7 +674,7 @@ Workgroup manager utilizations: unit: Cycles/wave Workgroup Manager - Resource Allocation: Not-scheduled Rate (Workgroup Manager): - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where a workgroup could not be scheduled to a :doc:`CU ` due to a bottleneck within the workgroup manager rather than a lack of a @@ -683,7 +683,7 @@ Workgroup Manager - Resource Allocation: description. 
unit: Percent Not-scheduled Rate (Scheduler-Pipe): - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where a workgroup could not be scheduled to a :doc:`CU ` due to a bottleneck within the scheduler-pipes rather than a lack of a CU @@ -695,7 +695,7 @@ Workgroup Manager - Resource Allocation: rst: '' unit: Unknown Scheduler-Pipe Stall Rate: - rst: |- + rst: >- The percent of :ref:`total scheduler-pipe cycles ` in the kernel where a workgroup could not be scheduled to a :doc:`CU ` due to occupancy limitations (like a lack of a CU or :ref:`SIMD ` @@ -759,7 +759,7 @@ Wavefront Launch Stats: block size. unit: Work-Items Total Wavefronts: - rst: |- + rst: >- The total number of wavefronts launched as part of the kernel dispatch. On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront size is always 64 work-items. Thus, the total number of wavefronts should @@ -774,25 +774,25 @@ Wavefront Launch Stats: `_. unit: Wavefronts VGPRs: - rst: |- + rst: >- The number of architected vector general-purpose registers allocated for the kernel, see :ref:`VALU `. Note: this may not exactly match the number of VGPRs requested by the compiler due to allocation granularity. unit: VGPRs AGPRs: - rst: |- + rst: >- The number of accumulation vector general-purpose registers allocated for the kernel, see :ref:`AGPRs `. Note: this may not exactly match the number of AGPRs requested by the compiler due to allocation granularity. unit: AGPRs SGPRs: - rst: |- + rst: >- The number of scalar general-purpose registers allocated for the kernel, see :ref:`SALU `. Note: this may not exactly match the number of SGPRs requested by the compiler due to allocation granularity. unit: SGPRs LDS Allocation: - rst: |- + rst: >- The number of bytes of :doc:`LDS ` memory (or, shared memory) allocated for this kernel. Note: This may also be larger than what was requested at compile time due to both allocation granularity and dynamic per-dispatch @@ -815,7 +815,7 @@ Wavefront Runtime Stats: This is averaged over all wavefronts in a kernel dispatch. unit: Instructions per wavefront Wave Cycles: - rst: |- + rst: >- The number of cycles a wavefront in the kernel dispatch spent resident on a compute unit per :ref:`normalization unit `. This is averaged over all wavefronts in a kernel dispatch. Note: this should not @@ -853,7 +853,7 @@ Wavefront Runtime Stats: the total Wave Cycles metric. unit: Cycles per normalization unit Wavefront Occupancy: - rst: |- + rst: >- The time-averaged number of wavefronts resident on the accelerator over the lifetime of the kernel. Note: this metric may be inaccurate for short-running kernels (less than 1ms). @@ -955,7 +955,7 @@ VALU Arithmetic Instruction Mix: unit `. unit: Instructions per normalization unit Conversion: - rst: |- + rst: >- The total number of type conversion instructions (such as converting data to or from F32\u2194F64) issued to the VALU per :ref:`normalization unit `. @@ -1035,14 +1035,14 @@ MFMA Arithmetic Instruction Mix: unit: Unknown Compute Speed-of-Light: VALU FLOPs: - rst: |- + rst: >- The total floating-point operations executed per second on the :ref:`VALU `. This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this does not include any floating-point operations from :ref:`MFMA ` instructions. unit: GFLOPs VALU IOPs: - rst: |- + rst: >- The total integer operations executed per second on the :ref:`VALU `. 
This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations @@ -1052,7 +1052,7 @@ Compute Speed-of-Light: rst: '' unit: Unknown MFMA FLOPs (BF16): - rst: |- + rst: >- The total number of 16-bit brain floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit brain floating point operations from :ref:`VALU ` instructions. This is also @@ -1060,7 +1060,7 @@ Compute Speed-of-Light: on the specific accelerator. unit: GFLOPs MFMA FLOPs (F16): - rst: |- + rst: >- The total number of 16-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 16-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -1068,7 +1068,7 @@ Compute Speed-of-Light: specific accelerator. unit: GFLOPs MFMA FLOPs (F32): - rst: |- + rst: >- The total number of 32-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 32-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -1076,7 +1076,7 @@ Compute Speed-of-Light: specific accelerator. unit: GFLOPs MFMA FLOPs (F64): - rst: |- + rst: >- The total number of 64-bit floating point :ref:`MFMA ` operations executed per second. Note: this does not include any 64-bit floating point operations from :ref:`VALU ` instructions. This is also presented @@ -1091,7 +1091,7 @@ Compute Speed-of-Light: rst: '' unit: Unknown MFMA IOPs (INT8): - rst: |- + rst: >- The total number of 8-bit integer :ref:`MFMA ` operations executed per second. Note: this does not include any 8-bit integer operations from :ref:`VALU ` instructions. This is also presented as a percent @@ -1183,7 +1183,7 @@ Arithmetic Operations: unit `. unit: FLOP per normalization unit BF16 OPs: - rst: |- + rst: >- The total number of 16-bit brain floating-point operations executed on either the :ref:`VALU ` or :ref:`MFMA ` units, per :ref:`normalization unit `. Note: on current CDNA accelerators, the VALU @@ -1203,7 +1203,7 @@ Arithmetic Operations: rst: '' unit: Unknown INT8 OPs: - rst: |- + rst: >- The total number of 8-bit integer operations executed on either the :ref:`VALU ` or :ref:`MFMA ` units, per :ref:`normalization unit `. Note: on current CDNA accelerators, the VALU has @@ -1298,7 +1298,7 @@ LDS Statistics: to stalls from non-dword aligned addresses per :ref:`normalization unit `. unit: Cycles per normalization unit Mem Violations: - rst: |- + rst: >- The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization unit `. This is unused and expected to be zero in most configurations for modern CDNA\u2122 accelerators. @@ -1321,7 +1321,7 @@ L1I Speed-of-Light: over the number of all L1I requests. unit: Percent L1I-L2 Bandwidth Utilization: - rst: |- + rst: >- The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth achieved. Calculated as the ratio of the total number of requests from the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles `. @@ -1428,7 +1428,7 @@ Scalar L1D cache accesses: unit: Requests per normalization unit Scalar L1D Cache - L2 Interface: sL1D-L2 BW: - rst: |- + rst: >- The total number of bytes read from, written to, or atomically updated across the sL1D\u2194:doc:`L2 ` interface, divided by total duration. Note that sL1D writes and atomics are typically @@ -1450,7 +1450,7 @@ Scalar L1D Cache - L2 Interface: CDNA accelerators. 
unit: Requests per normalization unit Stall Cycles: - rst: |- + rst: >- The total number of cycles the sL1D\u2194 :doc:`L2 ` interface was stalled, per :ref:`normalization unit `. unit: Cycles per normalization unit @@ -1818,7 +1818,7 @@ L1 Unified Translation Cache (UTCL1): rst: '' unit: Unknown Permission Misses: - rst: |- + rst: >- The total number of translation requests that missed in the UTCL1 due to a permission error, per :ref:`normalization unit `. This is unused and expected to be zero in most configurations for modern @@ -1968,7 +1968,7 @@ L2-Fabric interface metrics: with return value) was returned to the L2. unit: Cycles Read Stall: - rst: |- + rst: >- The ratio of the total number of cycles the L2-Fabric interface was stalled on a read request to any destination (local HBM, remote PCIe\xAE connected accelerator or CPU, or remote Infinity Fabric connected accelerator [#inf]_ From 624611446d3d1d331977082cbd141ee98bf81a12 Mon Sep 17 00:00:00 2001 From: vedithal-amd Date: Fri, 21 Nov 2025 10:54:25 -0500 Subject: [PATCH 06/10] Bugfixes (#1971) * Implement AMDGPU driver info and GPU VRAM attributes in system info. section of analysis report. * Backward compatibility for rocprofiler-sdk avail module path migration * Fix roofline calculation where AI data points are N/A --- projects/rocprofiler-compute/CHANGELOG.md | 4 +- .../src/rocprof_compute_soc/soc_base.py | 86 +++++++++---------- .../src/utils/amdsmi_interface.py | 24 ++++++ .../src/utils/roofline_calc.py | 8 +- .../rocprofiler-compute/src/utils/specs.py | 50 +++++------ 5 files changed, 97 insertions(+), 75 deletions(-) diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md index 755e2e34c9b..33b66e1571a 100644 --- a/projects/rocprofiler-compute/CHANGELOG.md +++ b/projects/rocprofiler-compute/CHANGELOG.md @@ -2,8 +2,6 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.amd.com/projects/rocprofiler-compute/en/latest/](https://rocm.docs.amd.com/projects/rocprofiler-compute/en/latest/). -## Unreleased - ## ROCm Compute Profiler 3.4.0 for ROCm 7.2.0 ### Added @@ -17,6 +15,8 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. * Adds support for dispatch timeline analysis. * Shows duration as median in addition to mean in kernel view. +* Implement AMDGPU driver info and GPU VRAM attributes in system info. section of analysis report. + ### Changed * `-b/--block` accepts block alias(es). See block aliases using command-line option `--list-blocks `. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py b/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py index aaa27bb009c..8c83cd9ec09 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py @@ -409,55 +409,53 @@ def parse_counters_text(self, text: str) -> tuple[set[str], set[str]]: def get_rocprof_supported_counters(self) -> set[str]: args = self.get_args() - rocprof_cmd = detect_rocprof(args) - - if rocprof_cmd != "rocprofiler-sdk": - console_warning( - "rocprofv3 interface is deprecated and will be removed " - "in a future release." - ) - rocprof_counters: set[str] = set() - if not ( - str(rocprof_cmd).endswith("rocprofv3") - or str(rocprof_cmd) == "rocprofiler-sdk" - ): - console_error( - f"Incompatible profiler: {rocprof_cmd}. 
" - "Supported profilers include: " - f"{get_submodules('rocprof_compute_profile')}" - ) + # Point to counter definition + old_rocprofiler_metrics_path = os.environ.get("ROCPROFILER_METRICS_PATH") + os.environ["ROCPROFILER_METRICS_PATH"] = str( + config.rocprof_compute_home / "rocprof_compute_soc" / "profile_configs" + ) - # Point to counter definition - old_rocprofiler_metrics_path = os.environ.get("ROCPROFILER_METRICS_PATH") - os.environ["ROCPROFILER_METRICS_PATH"] = str( - config.rocprof_compute_home / "rocprof_compute_soc" / "profile_configs" - ) - sys.path.append( - str( - Path(self.get_args().rocprofiler_sdk_library_path).parent - / "python3/site-packages" - ) - ) + # Backward compatibility support for sdk avail module moved from + # /bin/rocprofv3_avail_module/avail.py to + # /lib/python3/site-packages/rocprofv3/avail.py + new_path = str( + Path(args.rocprofiler_sdk_tool_path).parents[1] / "python3/site-packages" + ) + old_path = str(Path(args.rocprofiler_sdk_tool_path).parents[2] / "bin") + try: + sys.path.append(new_path) from rocprofv3 import avail - - avail.loadLibrary.libname = str( - Path(args.rocprofiler_sdk_library_path).parent - / "rocprofiler-sdk" - / "librocprofv3-list-avail.so" + except ImportError: + console_debug( + f"Could not import rocprofiler-sdk avail module from {new_path}, " + f"trying {old_path}" ) - counters = avail.get_counters() - rocprof_counters = { - counter.name - for counter in counters[list(counters.keys())[0]] - if hasattr(counter, "block") or hasattr(counter, "expression") - } - # Reset env. var. - if old_rocprofiler_metrics_path is None: - del os.environ["ROCPROFILER_METRICS_PATH"] - else: - os.environ["ROCPROFILER_METRICS_PATH"] = old_rocprofiler_metrics_path + try: + sys.path.remove(new_path) + sys.path.append(old_path) + from rocprofv3_avail_module import avail + except ImportError: + console_error("Failed to import rocprofiler-sdk avail module.") + + + avail.loadLibrary.libname = str( + Path(args.rocprofiler_sdk_library_path).parent + / "rocprofiler-sdk" + / "librocprofv3-list-avail.so" + ) + counters = avail.get_counters() + rocprof_counters = { + counter.name + for counter in counters[list(counters.keys())[0]] + if hasattr(counter, "block") or hasattr(counter, "expression") + } + # Reset env. var. 
+        if old_rocprofiler_metrics_path is None:
+            del os.environ["ROCPROFILER_METRICS_PATH"]
+        else:
+            os.environ["ROCPROFILER_METRICS_PATH"] = old_rocprofiler_metrics_path
 
         return rocprof_counters
diff --git a/projects/rocprofiler-compute/src/utils/amdsmi_interface.py b/projects/rocprofiler-compute/src/utils/amdsmi_interface.py
index 1675e3abe98..3bbae989b92 100644
--- a/projects/rocprofiler-compute/src/utils/amdsmi_interface.py
+++ b/projects/rocprofiler-compute/src/utils/amdsmi_interface.py
@@ -134,3 +134,27 @@ def get_gpu_memory_partition() -> str:
     except Exception as e:
         console_warning(f"Error getting GPU memory partition: {e}")
         return "N/A"
+
+
+def get_amdgpu_driver_version() -> str:
+    """Get the AMDGPU driver version."""
+    try:
+        driver_info = amdsmi.amdsmi_get_gpu_driver_info(get_device_handle())
+        driver_version = driver_info["driver_version"]
+        console_debug(f"AMDGPU Driver Version: {driver_version}")
+        return driver_version
+    except Exception as e:
+        console_warning(f"Error getting AMDGPU driver version: {e}")
+        return "N/A"
+
+
+def get_gpu_vram_size() -> str:
+    """Get the GPU VRAM size in KB."""
+    try:
+        vram_info = amdsmi.amdsmi_get_gpu_vram_info(get_device_handle())
+        vram_size = str(int(vram_info["vram_size"]) * 1024)  # MB -> KB
+        console_debug(f"GPU VRAM Size: {vram_size} KB")
+        return vram_size
+    except Exception as e:
+        console_warning(f"Error getting GPU VRAM size: {e}")
+        return "N/A"
diff --git a/projects/rocprofiler-compute/src/utils/roofline_calc.py b/projects/rocprofiler-compute/src/utils/roofline_calc.py
index 187fb8dc51a..32379945346 100644
--- a/projects/rocprofiler-compute/src/utils/roofline_calc.py
+++ b/projects/rocprofiler-compute/src/utils/roofline_calc.py
@@ -412,13 +412,13 @@ def calc_ai_analyze(
         metric = row.get("Metric", "")
         value = row.get("Value", 0)
         if metric == "AI HBM":
-            ai_hbm = value if value and value != "" else 0
+            ai_hbm = value if value and value not in ("", "N/A") else 0
         elif metric == "AI L2":
-            ai_l2 = value if value and value != "" else 0
+            ai_l2 = value if value and value not in ("", "N/A") else 0
         elif metric == "AI L1":
-            ai_l1 = value if value and value != "" else 0
+            ai_l1 = value if value and value not in ("", "N/A") else 0
         elif metric == "Performance (GFLOPs)":
-            performance = value if value and value != "" else 0
+            performance = value if value and value not in ("", "N/A") else 0
 
     console_debug(
         "roofline",
diff --git a/projects/rocprofiler-compute/src/utils/specs.py b/projects/rocprofiler-compute/src/utils/specs.py
index cce4b7ddf26..3143864f049 100644
--- a/projects/rocprofiler-compute/src/utils/specs.py
+++ b/projects/rocprofiler-compute/src/utils/specs.py
@@ -43,9 +43,11 @@
 import config
 from utils.amdsmi_interface import (
     amdsmi_ctx,
+    get_amdgpu_driver_version,
     get_gpu_compute_partition,
     get_gpu_memory_partition,
     get_gpu_vbios_part_number,
+    get_gpu_vram_size,
 )
 from utils.logger import (
     console_debug,
@@ -166,25 +168,26 @@ def generate_machine_specs(
     soc_info = extract_soc_info()
 
     # Combine all specifications
-    specs = MachineSpecs(
-        version=specs_version,
-        timestamp=timestamp,
-        rocminfo_lines=soc_info["rocminfo_lines"],
-        hostname=socket.gethostname(),
-        cpu_model=machine_info["cpu_model"],
-        sbios=machine_info["sbios"],
-        linux_kernel_version=machine_info["linux_kernel_version"],
-        amd_gpu_kernel_version="",
-        cpu_memory=machine_info["cpu_memory"],
-        gpu_memory="",
-        linux_distro=machine_info["linux_distro"],
-        rocm_version=get_rocm_ver().strip(),
-        vbios=gpu_info["vbios"],
-        compute_partition=gpu_info["compute_partition"],
-        memory_partition=gpu_info["memory_partition"],
-        gpu_arch=soc_info["gpu_arch"],
-        gpu_chip_id=soc_info["gpu_chip_id"],
-    )
+    with amdsmi_ctx():
+        specs = MachineSpecs(
+            version=specs_version,
+            timestamp=timestamp,
+            rocminfo_lines=soc_info["rocminfo_lines"],
+            hostname=socket.gethostname(),
+            cpu_model=machine_info["cpu_model"],
+            sbios=machine_info["sbios"],
+            linux_kernel_version=machine_info["linux_kernel_version"],
+            amd_gpu_kernel_version=get_amdgpu_driver_version(),
+            cpu_memory=machine_info["cpu_memory"],
+            gpu_memory=get_gpu_vram_size(),
+            linux_distro=machine_info["linux_distro"],
+            rocm_version=get_rocm_ver().strip(),
+            vbios=gpu_info["vbios"],
+            compute_partition=gpu_info["compute_partition"],
+            memory_partition=gpu_info["memory_partition"],
+            gpu_arch=soc_info["gpu_arch"],
+            gpu_chip_id=soc_info["gpu_chip_id"],
+        )
 
     # Load above SoC specs via module import
     try:
@@ -420,10 +423,7 @@ class MachineSpecs:
     amd_gpu_kernel_version: Optional[str] = field(
         default=None,
         metadata={
-            "doc": (
-                "[RESERVED] The version of the AMDGPU driver installed on the machine. "
-                "Unimplemented."
-            ),
+            "doc": ("The version of the AMDGPU driver installed on the machine."),
             "name": "AMD GPU Kernel Version",
             "show_in_table": True,
         },
@@ -441,8 +441,8 @@ class MachineSpecs:
         default=None,
         metadata={
             "doc": (
-                "[RESERVED] The total amount of memory available to accelerators/GPUs "
-                "in the system. Unimplemented."
+                "The total amount of memory available to accelerators/GPUs "
+                "in the system."
             ),
             "unit": "KB",
             "name": "GPU Memory",

From 313a629aefece5466400e122475bebad31b6934c Mon Sep 17 00:00:00 2001
From: vedithal-amd
Date: Thu, 27 Nov 2025 09:13:19 -0500
Subject: [PATCH 07/10] Fix sL1D values in memory chart (#2037)

---
 projects/rocprofiler-compute/CHANGELOG.md           | 2 ++
 projects/rocprofiler-compute/src/utils/mem_chart.py | 8 ++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md
index 33b66e1571a..c5a1296c3f1 100644
--- a/projects/rocprofiler-compute/CHANGELOG.md
+++ b/projects/rocprofiler-compute/CHANGELOG.md
@@ -38,6 +38,8 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
 ### Resolved issues
 
+* Fixed sL1D metric values showing up as N/A in the memory chart diagram.
+
 ### Known issues
 
 ### Upcoming changes
diff --git a/projects/rocprofiler-compute/src/utils/mem_chart.py b/projects/rocprofiler-compute/src/utils/mem_chart.py
index 22e6e105738..8389d73e2f8 100644
--- a/projects/rocprofiler-compute/src/utils/mem_chart.py
+++ b/projects/rocprofiler-compute/src/utils/mem_chart.py
@@ -1075,7 +1075,7 @@ def draw(
         wires_E_GLV.vl1_rd = metric_dict.get("VL1 Rd", "n/a")
         wires_E_GLV.vl1_wr = metric_dict.get("VL1 Wr", "n/a")
         wires_E_GLV.vl1_atomic = metric_dict.get("VL1 Atomic", "n/a")
-        wires_E_GLV.sl1_rd = metric_dict.get("VL1D Rd", "n/a")
+        wires_E_GLV.sl1_rd = metric_dict.get("sL1D Rd", "n/a")
 
         wires_E_GLV.draw(canvas)
 
@@ -1170,9 +1170,9 @@ def draw(
         wires_L1_L2.vl1_l2_rd = metric_dict.get("VL1_L2 Rd", "n/a")
         wires_L1_L2.vl1_l2_wr = metric_dict.get("VL1_L2 Wr", "n/a")
         wires_L1_L2.vl1_l2_atomic = metric_dict.get("VL1_L2 Atomic", "n/a")
-        wires_L1_L2.sl1_l2_rd = metric_dict.get("VL1D_L2 Rd", "n/a")
-        wires_L1_L2.sl1_l2_wr = metric_dict.get("VL1D_L2 Wr", "n/a")
-        wires_L1_L2.sl1_l2_atomic = metric_dict.get("VL1D_L2 Atomic", "n/a")
+        wires_L1_L2.sl1_l2_rd = metric_dict.get("sL1D_L2 Rd", "n/a")
+        wires_L1_L2.sl1_l2_wr = metric_dict.get("sL1D_L2 Wr", "n/a")
+        wires_L1_L2.sl1_l2_atomic = metric_dict.get("sL1D_L2 Atomic", "n/a")
         wires_L1_L2.il1_l2_req = metric_dict.get("IL1_L2 Rd", "n/a")
 
         wires_L1_L2.draw(canvas)

From 67b81a3f54fd5c92576df042259d49c07e40a823 Mon Sep 17 00:00:00 2001
From: Pratik Basyal
Date: Thu, 27 Nov 2025 18:55:45 -0500
Subject: [PATCH 08/10] Formatting fixed (#1691)

---
 .../docs/how-to/analyze/cli.rst               | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/projects/rocprofiler-compute/docs/how-to/analyze/cli.rst b/projects/rocprofiler-compute/docs/how-to/analyze/cli.rst
index c6013455d3c..2316e6bdea5 100644
--- a/projects/rocprofiler-compute/docs/how-to/analyze/cli.rst
+++ b/projects/rocprofiler-compute/docs/how-to/analyze/cli.rst
@@ -10,17 +10,13 @@ This section provides an overview of ROCm Compute Profiler's CLI analysis featur
 
 * :ref:`Derived metrics `: All of ROCm Compute Profiler's built-in metrics.
 
-* :ref:`Baseline comparison `: Compare multiple
-  runs in a side-by-side manner.
+* :ref:`Baseline comparison `: Compare multiple runs in a side-by-side manner.
 
-* :ref:`Metric customization `: Isolate a subset of
-  built-in metrics or build your own profiling configuration.
+* :ref:`Metric customization `: Isolate a subset of built-in metrics or build your own profiling configuration.
 
-* :ref:`Filtering `: Hone in on a particular kernel,
-  GPU ID, or dispatch ID via post-process filtering.
-
-* :ref:`Per-kernel roofline analysis `: Detailed arithmetic
-  intensity and performance analysis for individual kernels.
+* :ref:`Filtering `: Hone in on a particular kernel, GPU ID, or dispatch ID via post-process filtering.
+
+* :ref:`Per-kernel roofline analysis `: Detailed arithmetic intensity and performance analysis for individual kernels.
 
 Run ``rocprof-compute analyze -h`` for more details.
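A note on the sL1D fix in [PATCH 07/10] above: the memory chart populates its wires with `metric_dict.get(key, "n/a")`, and `dict.get()` with a default silently masks a key that does not exist rather than raising. The old code asked for "VL1D"-style keys while the analysis tables emit "sL1D"-style keys, so every sL1D field fell through to the default. A minimal standalone sketch of that failure mode (the metric values below are made up):

    # Hypothetical slice of the metric table; the keys mirror those in mem_chart.py.
    metric_dict = {"sL1D Rd": 128, "sL1D_L2 Rd": 64}

    # Pre-fix lookup: the misspelled key never matches, so the default always wins.
    print(metric_dict.get("VL1D Rd", "n/a"))  # -> n/a
    # Post-fix lookup: the key matches what the analysis emits.
    print(metric_dict.get("sL1D Rd", "n/a"))  # -> 128

The same key correction applies to the sL1D-to-L2 read, write, and atomic wires.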
From c94ecf9920ba4bdb20c274a75ec8e6d3c1577412 Mon Sep 17 00:00:00 2001
From: vedithal-amd
Date: Fri, 28 Nov 2025 11:32:00 -0500
Subject: [PATCH 09/10] [rocprofiler-compute] Only depend on amdsmi in profile phase (#2044)

* Only depend on amdsmi in profile phase

* amdsmi interface tests should have a common prefix for easier testing
---
 .../src/utils/amdsmi_interface.py             | 40 +++++++++++++++----
 .../rocprofiler-compute/tests/test_utils.py   | 40 +++++++++++++------
 2 files changed, 60 insertions(+), 20 deletions(-)

diff --git a/projects/rocprofiler-compute/src/utils/amdsmi_interface.py b/projects/rocprofiler-compute/src/utils/amdsmi_interface.py
index 3bbae989b92..fa915734d64 100644
--- a/projects/rocprofiler-compute/src/utils/amdsmi_interface.py
+++ b/projects/rocprofiler-compute/src/utils/amdsmi_interface.py
@@ -34,18 +34,35 @@
     console_warning,
 )
 
-sys.path.insert(0, os.getenv("ROCM_PATH", "/opt/rocm") + "/share/amd_smi")
+_amdsmi_module = None
 
-try:
-    import amdsmi
-except ImportError as e:
-    console_warning(f"Unhandled import error: {e}")
-    console_error("Failed to import the amdsmi Python library.")
+
+# Ignore undefined name amdsmi since it's dynamically imported
+def import_amdsmi_module() -> "amdsmi":  # noqa: F821
+    """
+    Dynamically import the amdsmi module so that amdsmi is only
+    a profile-time dependency.
+    Uses a global cache to avoid repeated imports.
+    """
+    global _amdsmi_module
+
+    if not _amdsmi_module:
+        sys.path.insert(0, os.getenv("ROCM_PATH", "/opt/rocm") + "/share/amd_smi")
+        try:
+            import amdsmi
+
+            _amdsmi_module = amdsmi
+        except ImportError as e:
+            console_warning(f"Unhandled import error: {e}")
+            console_error("Failed to import the amdsmi Python library.")
+
+    return _amdsmi_module
 
 
 @contextmanager
 def amdsmi_ctx() -> Iterator[None]:
     """Context manager to initialize and shutdown amdsmi."""
+    amdsmi = import_amdsmi_module()
     try:
         amdsmi.amdsmi_init()
         yield
@@ -58,8 +75,10 @@ def amdsmi_ctx() -> Iterator[None]:
         console_warning(f"amd-smi shutdown failed: {e}")
 
 
-def get_device_handle() -> "amdsmi.ProcessorHandle | None":
+# Ignore undefined name amdsmi since it's dynamically imported
+def get_device_handle() -> "amdsmi.ProcessorHandle | None":  # noqa: F821
     """Get the first AMD device handle."""
+    amdsmi = import_amdsmi_module()
     try:
         devices = amdsmi.amdsmi_get_processor_handles()
         if len(devices) == 0:
@@ -74,6 +93,7 @@ def get_device_handle() -> "amdsmi.ProcessorHandle | None":
 
 def get_mem_max_clock() -> float:
     """Get the maximum memory clock of the device."""
+    amdsmi = import_amdsmi_module()
     try:
         return amdsmi.amdsmi_get_clock_info(
             get_device_handle(), amdsmi.AmdSmiClkType.GFX
@@ -85,6 +105,7 @@ def get_mem_max_clock() -> float:
 
 def get_gpu_model() -> str:
     """Get the GPU model name."""
+    amdsmi = import_amdsmi_module()
    try:
         gpu_model_info = (
             # board -> product_name
@@ -103,6 +124,7 @@ def get_gpu_model() -> str:
 
 def get_gpu_vbios_part_number() -> str:
     """Get the GPU VBIOS part number."""
+    amdsmi = import_amdsmi_module()
     try:
         vbios_part_number = amdsmi.amdsmi_get_gpu_vbios_info(get_device_handle())[
             "part_number"
@@ -116,6 +138,7 @@ def get_gpu_vbios_part_number() -> str:
 
 def get_gpu_compute_partition() -> str:
     """Get the GPU compute partition."""
+    amdsmi = import_amdsmi_module()
     try:
         compute_partition = amdsmi.amdsmi_get_gpu_compute_partition(get_device_handle())
         console_debug(f"GPU Compute Partition: {compute_partition}")
@@ -127,6 +150,7 @@ def get_gpu_compute_partition() -> str:
 
 def get_gpu_memory_partition() -> str:
     """Get the GPU memory partition."""
+    amdsmi = import_amdsmi_module()
     try:
         memory_partition = amdsmi.amdsmi_get_gpu_memory_partition(get_device_handle())
         console_debug(f"GPU Memory Partition: {memory_partition}")
@@ -138,6 +162,7 @@ def get_gpu_memory_partition() -> str:
 
 def get_amdgpu_driver_version() -> str:
     """Get the AMDGPU driver version."""
+    amdsmi = import_amdsmi_module()
     try:
         driver_info = amdsmi.amdsmi_get_gpu_driver_info(get_device_handle())
         driver_version = driver_info["driver_version"]
@@ -150,6 +175,7 @@ def get_amdgpu_driver_version() -> str:
 
 def get_gpu_vram_size() -> str:
     """Get the GPU VRAM size in KB."""
+    amdsmi = import_amdsmi_module()
     try:
         vram_info = amdsmi.amdsmi_get_gpu_vram_info(get_device_handle())
         vram_size = str(int(vram_info["vram_size"]) * 1024)  # MB -> KB
diff --git a/projects/rocprofiler-compute/tests/test_utils.py b/projects/rocprofiler-compute/tests/test_utils.py
index 48bd0e02ce2..09086679cba 100644
--- a/projects/rocprofiler-compute/tests/test_utils.py
+++ b/projects/rocprofiler-compute/tests/test_utils.py
@@ -8677,7 +8677,9 @@ def test_list_metrics(binary_handler_analyze_rocprof_compute, capsys):
 
 def test_amdsmi_ctx():
-    from utils.amdsmi_interface import amdsmi_ctx
+    from utils.amdsmi_interface import amdsmi_ctx, import_amdsmi_module
+
+    _ = import_amdsmi_module()
 
     with mock.patch("amdsmi.amdsmi_init") as amdsmi_init_mock:
         with mock.patch("amdsmi.amdsmi_shut_down") as amdsmi_shutdown_mock:
@@ -8686,8 +8688,10 @@ def test_amdsmi_ctx():
             amdsmi_shutdown_mock.assert_called_once()
 
 
-def test_get_device_handle():
-    from utils.amdsmi_interface import get_device_handle
+def test_amdsmi_get_device_handle():
+    from utils.amdsmi_interface import get_device_handle, import_amdsmi_module
+
+    _ = import_amdsmi_module()
 
     with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock:
         device_handles_mock.return_value = [12345]
@@ -8701,8 +8705,10 @@
         assert handle is None
 
 
-def test_get_mem_max_clock():
-    from utils.amdsmi_interface import get_mem_max_clock
+def test_amdsmi_get_mem_max_clock():
+    from utils.amdsmi_interface import get_mem_max_clock, import_amdsmi_module
+
+    _ = import_amdsmi_module()
 
     with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock:
         device_handles_mock.return_value = [12345]
@@ -8713,8 +8719,10 @@
     assert clk == 100
 
 
-def test_get_gpu_model():
-    from utils.amdsmi_interface import get_gpu_model
+def test_amdsmi_get_gpu_model():
+    from utils.amdsmi_interface import get_gpu_model, import_amdsmi_module
+
+    _ = import_amdsmi_module()
 
     with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock:
         device_handles_mock.return_value = [12345]
@@ -8735,8 +8743,10 @@
     assert model == "N/A"
 
 
-def test_get_gpu_vbios_part_number():
-    from utils.amdsmi_interface import get_gpu_vbios_part_number
+def test_amdsmi_get_gpu_vbios_part_number():
+    from utils.amdsmi_interface import get_gpu_vbios_part_number, import_amdsmi_module
+
+    _ = import_amdsmi_module()
 
     with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock:
         device_handles_mock.return_value = [12345]
@@ -8755,8 +8765,10 @@
     assert part_number == "N/A"
 
 
-def test_get_gpu_compute_partition():
-    from utils.amdsmi_interface import get_gpu_compute_partition
+def test_amdsmi_get_gpu_compute_partition():
+    from utils.amdsmi_interface import get_gpu_compute_partition, import_amdsmi_module
+
+    _ = import_amdsmi_module()
 
     with mock.patch("amdsmi.amdsmi_get_processor_handles") as
device_handles_mock: device_handles_mock.return_value = [12345] @@ -8776,8 +8788,10 @@ def test_get_gpu_compute_partition(): assert partition == "N/A" -def test_get_gpu_memory_partition(): - from utils.amdsmi_interface import get_gpu_memory_partition +def test_amdsmi_get_gpu_memory_partition(): + from utils.amdsmi_interface import get_gpu_memory_partition, import_amdsmi_module + + _ = import_amdsmi_module() with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock: device_handles_mock.return_value = [12345] From 1214857ce881e5a1f6ef27c234d844b28b01c5cb Mon Sep 17 00:00:00 2001 From: abchoudh-amd Date: Fri, 28 Nov 2025 22:02:25 +0530 Subject: [PATCH 10/10] Add CU Utilization and deprecate Active CUs (#1822) * ChangeLog * Deprecation notice in old arch * Deprecation notice current arch * New config hash * Added Config deltas * Added metric description --- projects/rocprofiler-compute/CHANGELOG.md | 6 + .../docs/data/metrics_description.yaml | 10 +- .../gfx908/0200_system_speed_of_light.yaml | 15 +- .../gfx908/0300_memory_chart.yaml | 6 +- .../gfx908/config_delta/gfx950_diff.yaml | 1043 +++++++++++------ .../gfx90a/0200_system_speed_of_light.yaml | 15 +- .../gfx90a/0300_memory_chart.yaml | 6 +- .../gfx90a/config_delta/gfx950_diff.yaml | 840 +++++++------ .../gfx940/0200_system_speed_of_light.yaml | 15 +- .../gfx940/0300_memory_chart.yaml | 6 +- .../gfx940/config_delta/gfx950_diff.yaml | 598 ++++++---- .../gfx941/0200_system_speed_of_light.yaml | 15 +- .../gfx941/0300_memory_chart.yaml | 6 +- .../gfx941/config_delta/gfx950_diff.yaml | 564 +++++---- .../gfx942/0200_system_speed_of_light.yaml | 15 +- .../gfx942/0300_memory_chart.yaml | 6 +- .../gfx942/config_delta/gfx950_diff.yaml | 586 +++++---- .../gfx950/0200_system_speed_of_light.yaml | 15 +- .../gfx950/0300_memory_chart.yaml | 6 +- .../config_management/.config_hashes.json | 36 +- 20 files changed, 2387 insertions(+), 1422 deletions(-) diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md index c5a1296c3f1..39ce7428207 100644 --- a/projects/rocprofiler-compute/CHANGELOG.md +++ b/projects/rocprofiler-compute/CHANGELOG.md @@ -17,6 +17,8 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. * Implement AMDGPU driver info and GPU VRAM attributes in system info. section of analysis report. +* Added `CU Utilization` metric to display the percentage of CUs utilized during kernel execution. + ### Changed * `-b/--block` accepts block alias(es). See block aliases using command-line option `--list-blocks `. @@ -28,6 +30,10 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. * Empty cells replaced with `N/A` for unavailable metrics in analysis. +### Deprecated + +* `Active CUs` metric has been deprecated and replaced by `CU Utilization`. + ### Removed * Removed `database` mode from ROCm Compute Profiler in favor of other visualization methods, rather than Grafana and MongoDB integration, such as the upcoming Analysis DB-based Visualizer. diff --git a/projects/rocprofiler-compute/docs/data/metrics_description.yaml b/projects/rocprofiler-compute/docs/data/metrics_description.yaml index 4c60cf24a18..8d2d13354ba 100644 --- a/projects/rocprofiler-compute/docs/data/metrics_description.yaml +++ b/projects/rocprofiler-compute/docs/data/metrics_description.yaml @@ -1423,9 +1423,9 @@ Command processor packet processor (CPC): manager `. 
unit: Percent System Speed-of-Light: - Active CUs: + Active CUs (deprecated): rst: Total number of active compute units (CUs) on the accelerator during the - kernel execution. + kernel execution. (Deprecated - See CU Utilization instead) unit: Number Branch Utilization: rst: Indicates what percent of the kernel's duration the :ref:`branch ` @@ -1618,3 +1618,9 @@ System Speed-of-Light: rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache over the total number of cache line requests to the :ref:`vL1D cache RAM `. unit: Percent + CU Utilization: + rst: The percent of :ref:`total SIMD cycles ` in the kernel + where any :ref:`SIMD ` on a CU was actively doing any work, summed + over all CUs. Low values (less than 100%) indicate that the accelerator was + not fully saturated by the kernel, or a potential load-imbalance issue. + unit: Percent diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml index 6fca0c579c0..481409c22bf 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml @@ -48,7 +48,7 @@ Panel Config: unit: GIOP/s peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) pop: None - Active CUs: + Active CUs (deprecated): value: $numActiveCUs unit: CUs peak: $cu_per_gpu @@ -199,6 +199,11 @@ Panel Config: peak: None pop: None coll_level: SQ_IFETCH_LEVEL + CU Utilization: + value: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + peak: 100 + pop: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) metrics_description: VALU FLOPs: >- The total floating-point operations executed per second on the VALU. @@ -235,8 +240,8 @@ Panel Config: Note: this does not include any 8-bit integer operations from VALU instructions. This is also presented as a percent of the peak theoretical INT8 MFMA operations achievable on the specific accelerator. - Active CUs: Total number of active compute units (CUs) on the accelerator during - the kernel execution. + Active CUs (deprecated): Total number of active compute units (CUs) on the accelerator during + the kernel execution. (Deprecated - See CU Utilization instead) SALU Utilization: Indicates what percent of the kernel's duration the SALU was busy executing instructions. Computed as the ratio of the total number of cycles spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles. @@ -321,3 +326,7 @@ Panel Config: of all L1I requests. L1I Fetch Latency: The average number of cycles spent to fetch instructions to a CU. + CU Utilization: The percent of total SIMD cycles in the kernel + where any SIMD on a CU was actively doing any work, summed + over all CUs. Low values (less than 100%) indicate that the accelerator was + not fully saturated by the kernel, or a potential load-imbalance issue. 
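To make the new metric concrete, here is a small worked example of the CU Utilization formula defined above. The counter names follow the YAML; the numbers are purely illustrative, not real profile data:

    # Illustrative counter values for one kernel dispatch (made up for this sketch).
    SQ_BUSY_CU_CYCLES = 7_500_000      # cycles where any SIMD on a CU was busy, summed over all CUs
    GRBM_GUI_ACTIVE_PER_XCD = 100_000  # GPU-busy cycles, per XCD
    cu_per_gpu = 104                   # CU count for the accelerator (illustrative)

    # CU Utilization = 100 * SQ_BUSY_CU_CYCLES / (GRBM_GUI_ACTIVE_PER_XCD * cu_per_gpu)
    cu_utilization = 100 * SQ_BUSY_CU_CYCLES / (GRBM_GUI_ACTIVE_PER_XCD * cu_per_gpu)
    print(f"CU Utilization: {cu_utilization:.1f}%")  # -> 72.1%

A value near 100% means every CU had at least one busy SIMD for essentially the whole kernel; markedly lower values indicate the accelerator was not saturated, or that work was imbalanced across CUs, as the metric description notes.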
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml index db190f03ef6..8ca0ddc7b94 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml @@ -33,7 +33,7 @@ Panel Config: value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) BR: value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) - Active CUs: + Active CUs (deprecated): value: $numActiveCUs Num CUs: value: $cu_per_gpu @@ -167,8 +167,8 @@ Panel Config: GWS: Total number of GDS (global data sync) instructions issued per normalization unit. BR: Total number of BRANCH instructions issued per normalization unit. - Active CUs: Total number of active compute units (CUs) on the accelerator during - the kernel execution. + Active CUs (deprecated): Total number of active compute units (CUs) on the accelerator during + the kernel execution. (Deprecated - See CU Utilization instead) Num CUs: Total number of compute units (CUs) on the accelerator. VGPR: >- The number of architected vector general-purpose registers allocated diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/config_delta/gfx950_diff.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/config_delta/gfx950_diff.yaml index b90fd37e868..700ea25c660 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/config_delta/gfx950_diff.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/config_delta/gfx950_diff.yaml @@ -20,6 +20,12 @@ Addition: peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + metric_descriptions: + MFMA FLOPs (F8): + plain: | + The total number of 8-bit brain floating point MFMA operations executed per second. This does not include any 16-bit brain floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F8 MFMA operations achievable on the specific accelerator. It is supported on AMD Instinct MI300 series and later only. + rst: | + The total number of 8-bit brain floating point MFMA operations executed per second. This does not include any 16-bit brain floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F8 MFMA operations achievable on the specific accelerator. It is supported on AMD Instinct MI300 series and later only. - Panel Config: id: 400 title: Roofline @@ -38,6 +44,17 @@ Addition: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) unit: GFLOP/s peak: $MFMAF8Flops_empirical_peak + metric_descriptions: + MFMA FLOPs (F6F4): + plain: | + The total number of 4-bit and 6-bit floating point MFMA operations executed per second. Note: this does not include any floating point operations from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. It is supported on AMD Instinct MI350 series (gfx950) and later only. + rst: | + The total number of 4-bit and 6-bit floating point MFMA operations executed per second. 
Note: this does not include any floating point operations from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. It is supported on AMD Instinct MI350 series (gfx950) and later only.
+      MFMA FLOPs (F8):
+        plain: |
+          The total number of 8-bit floating point MFMA operations executed per second. This does not include any 8-bit floating point operations from VALU instructions. The peak empirically measured F8 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. It is supported on AMD Instinct MI300 series and later only.
+        rst: |
+          The total number of 8-bit floating point MFMA operations executed per second. This does not include any 8-bit floating point operations from VALU instructions. The peak empirically measured F8 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. It is supported on AMD Instinct MI300 series and later only.
 - Panel Config:
     id: 500
     title: Command Processor (CPC/CPF)
@@ -51,6 +68,11 @@ Addition:
         min: MIN((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None)
         max: MAX((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None)
         unit: pct
+      - CPC CANE Stall Rate:
+          avg: AVG((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None)
+          min: MIN((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None)
+          max: MAX((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None)
+          unit: pct
       - CPC SYNC FIFO Full Rate:
          avg: |
            AVG((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None)
@@ -59,11 +81,6 @@ Addition:
          max: |
            MAX((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None)
          unit: pct
-      - CPC CANE Stall Rate:
-          avg: AVG((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None)
-          min: MIN((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None)
-          max: MAX((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None)
-          unit: pct
 - Panel Config:
     id: 600
     title: Workgroup Manager (SPI)
@@ -127,20 +144,15 @@ Addition:
         id: 1002
         title: VALU Arithmetic Instruction Mix
         metrics:
-          - INT32:
-              avg: AVG((SQ_INSTS_VALU_INT32 / $denom))
-              min: MIN((SQ_INSTS_VALU_INT32 / $denom))
-              max: MAX((SQ_INSTS_VALU_INT32 / $denom))
-              unit: (instr + $normUnit)
-          - F32-Trans:
-              avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom))
-              min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom))
-              max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom))
+          - Conversion:
+              avg: AVG((SQ_INSTS_VALU_CVT / $denom))
+              min: MIN((SQ_INSTS_VALU_CVT / $denom))
+              max: MAX((SQ_INSTS_VALU_CVT / $denom))
               unit: (instr + $normUnit)
-          - F64-FMA:
-              avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom))
-              min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom))
-              max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom))
+          - F16-ADD:
+              avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom))
+              min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom))
+              max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom))
               unit: (instr + $normUnit)
           - F16-FMA:
               avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom))
@@ -152,55 +164,60 @@ Addition:
               min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom))
               max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom))
               unit: (instr + $normUnit)
-          - INT64:
-              avg: AVG((SQ_INSTS_VALU_INT64 / $denom))
-              min: MIN((SQ_INSTS_VALU_INT64 / $denom))
-              max: MAX((SQ_INSTS_VALU_INT64 / $denom))
-              unit: (instr + $normUnit)
-          - F32-MUL:
-              avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom))
-              min:
MIN((SQ_INSTS_VALU_MUL_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom)) + - F16-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom)) unit: (instr + $normUnit) - - F64-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom)) + - F32-ADD: + avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom)) unit: (instr + $normUnit) - F32-FMA: avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom)) min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom)) max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom)) unit: (instr + $normUnit) + - F32-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom)) + unit: (instr + $normUnit) + - F32-Trans: + avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom)) + unit: (instr + $normUnit) - F64-ADD: avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom)) min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom)) max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom)) unit: (instr + $normUnit) - - F16-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom)) + - F64-FMA: + avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom)) + unit: (instr + $normUnit) + - F64-MUL: + avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom)) + min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom)) + max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom)) unit: (instr + $normUnit) - F64-Trans: avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom)) min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom)) max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom)) unit: (instr + $normUnit) - - F16-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom)) - unit: (instr + $normUnit) - - F32-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom)) + - INT32: + avg: AVG((SQ_INSTS_VALU_INT32 / $denom)) + min: MIN((SQ_INSTS_VALU_INT32 / $denom)) + max: MAX((SQ_INSTS_VALU_INT32 / $denom)) unit: (instr + $normUnit) - - Conversion: - avg: AVG((SQ_INSTS_VALU_CVT / $denom)) - min: MIN((SQ_INSTS_VALU_CVT / $denom)) - max: MAX((SQ_INSTS_VALU_CVT / $denom)) + - INT64: + avg: AVG((SQ_INSTS_VALU_INT64 / $denom)) + min: MIN((SQ_INSTS_VALU_INT64 / $denom)) + max: MAX((SQ_INSTS_VALU_INT64 / $denom)) unit: (instr + $normUnit) - metric_table: id: 1003 @@ -215,26 +232,26 @@ Addition: id: 1004 title: MFMA Arithmetic Instruction Mix metrics: + - MFMA-BF16: + avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom)) + unit: (instr + $normUnit) - MFMA-F16: avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom)) max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom)) unit: (instr + $normUnit) - - MFMA-I8: - avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom)) + - MFMA-F32: + avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom)) unit: (instr + $normUnit) - 
MFMA-F64:
              avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom))
              min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom))
              max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom))
              unit: (instr + $normUnit)
-          - MFMA-F32:
-              avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom))
-              min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom))
-              max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom))
-              unit: (instr + $normUnit)
           - MFMA-F6F4:
              avg: AVG((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
              min: MIN((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
@@ -245,11 +262,130 @@ Addition:
              min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom))
              max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom))
              unit: (instr + $normUnit)
-          - MFMA-BF16:
-              avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom))
-              min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom))
-              max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom))
+          - MFMA-I8:
+              avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom))
+              min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom))
+              max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom))
               unit: (instr + $normUnit)
+    metric_descriptions:
+      Conversion:
+        plain: |
+          The total number of type conversion instructions (such as converting data between F32 and F64) issued to the VALU per normalization unit.
+        rst: |
+          The total number of type conversion instructions (such as converting data between F32 and F64) issued to the VALU per normalization unit.
+      F16-ADD:
+        plain: |
+          The total number of addition instructions operating on 16-bit floating-point operands issued to the VALU per normalization unit.
+        rst: |
+          The total number of addition instructions operating on 16-bit floating-point operands issued to the VALU per normalization unit.
+      F16-FMA:
+        plain: |
+          The total number of fused multiply-add instructions operating on 16-bit floating-point operands issued to the VALU per normalization unit.
+        rst: |
+          The total number of fused multiply-add instructions operating on 16-bit floating-point operands issued to the VALU per normalization unit.
+      F16-MUL:
+        plain: |
+          The total number of multiplication instructions operating on 16-bit floating-point operands issued to the VALU per normalization unit.
+        rst: |
+          The total number of multiplication instructions operating on 16-bit floating-point operands issued to the VALU per normalization unit.
+      F16-Trans:
+        plain: |
+          The total number of transcendental instructions (e.g., sqrt) operating on 16-bit floating-point operands issued to the VALU per normalization unit.
+        rst: |
+          The total number of transcendental instructions (e.g., sqrt) operating on 16-bit floating-point operands issued to the VALU per normalization unit.
+      F32-ADD:
+        plain: |
+          The total number of addition instructions operating on 32-bit floating-point operands issued to the VALU per normalization unit.
+        rst: |
+          The total number of addition instructions operating on 32-bit floating-point operands issued to the VALU per normalization unit.
+      F32-FMA:
+        plain: |
+          The total number of fused multiply-add instructions operating on 32-bit floating-point operands issued to the VALU per normalization unit.
+        rst: |
+          The total number of fused multiply-add instructions operating on 32-bit floating-point operands issued to the VALU per normalization unit.
+      F32-MUL:
+        plain: |
+          The total number of multiplication instructions operating on 32-bit floating-point operands issued to the VALU per normalization unit.
+        rst: |
+          The total number of multiplication instructions operating on 32-bit floating-point operands issued to the VALU per normalization unit.
+ F32-Trans: + plain: | + The total number of transcendental instructions (such as sqrt) operating on 32-bit floating-point operands issued to the VALU per normalization unit. + rst: | + The total number of transcendental instructions (such as sqrt) operating on 32-bit floating-point operands issued to the VALU per normalization unit. + F64-ADD: + plain: | + The total number of addition instructions operating on 64-bit floating-point operands issued to the VALU per normalization unit. + rst: | + The total number of addition instructions operating on 64-bit floating-point operands issued to the VALU per normalization unit. + F64-FMA: + plain: | + The total number of fused multiply-add instructions operating on 64-bit floating-point operands issued to the VALU per normalization unit. + rst: | + The total number of fused multiply-add instructions operating on 64-bit floating-point operands issued to the VALU per normalization unit. + F64-MUL: + plain: | + The total number of multiplication instructions operating on 64-bit floating-point operands issued to the VALU per normalization unit. + rst: | + The total number of multiplication instructions operating on 64-bit floating-point operands issued to the VALU per normalization unit. + F64-Trans: + plain: | + The total number of transcendental instructions (such as sqrt) operating on 64-bit floating-point operands issued to the VALU per normalization unit. + rst: | + The total number of transcendental instructions (such as sqrt) operating on 64-bit floating-point operands issued to the VALU per normalization unit. + INT32: + plain: | + The total number of instructions operating on 32-bit integer operands issued to the VALU per normalization unit. + rst: | + The total number of instructions operating on 32-bit integer operands issued to the VALU per normalization unit. + INT64: + plain: | + The total number of instructions operating on 64-bit integer operands issued to the VALU per normalization unit. + rst: | + The total number of instructions operating on 64-bit integer operands issued to the VALU per normalization unit. + MFMA: + plain: The total number of matrix fused multiply-add instructions issued. + rst: The total number of matrix fused multiply-add instructions issued. + MFMA-BF16: + plain: | + The total number of 16-bit brain floating point MFMA instructions issued per normalization unit. + rst: | + The total number of 16-bit brain floating point MFMA instructions issued per normalization unit. + MFMA-F16: + plain: | + The total number of 16-bit floating point MFMA instructions issued per normalization unit. + rst: | + The total number of 16-bit floating point MFMA instructions issued per normalization unit. + MFMA-F32: + plain: | + The total number of 32-bit floating-point MFMA instructions issued per normalization unit. + rst: | + The total number of 32-bit floating-point MFMA instructions issued per normalization unit. + MFMA-F64: + plain: | + The total number of 64-bit floating-point MFMA instructions issued per normalization unit. + rst: | + The total number of 64-bit floating-point MFMA instructions issued per normalization unit. + MFMA-F8: + plain: | + The total number of 8-bit floating point MFMA instructions issued per normalization unit. This is supported in AMD Instinct MI300 series and later only. + rst: | + The total number of 8-bit floating point MFMA instructions issued per normalization unit. This is supported in AMD Instinct MI300 series and later only. 
+ MFMA-I8: + plain: | + The total number of 8-bit integer MFMA instructions issued per normalization unit. + rst: | + The total number of 8-bit integer MFMA instructions issued per normalization unit. + VALU: + plain: | + The total number of vector arithmetic logic unit (VALU) operations issued. These are the workhorses of the compute unit, and are used to execute a wide range of instruction types including floating point operations, non-uniform address calculations, transcendental operations, integer operations, shifts, conditional evaluation, etc. + rst: | + The total number of vector arithmetic logic unit (VALU) operations issued. These are the workhorses of the compute unit, and are used to execute a wide range of instruction types including floating point operations, non-uniform address calculations, transcendental operations, integer operations, shifts, conditional evaluation, etc. + VMEM: + plain: | + The total number of vector memory operations issued. These include most loads, stores and atomic operations and all accesses to generic, global, private and texture memory. + rst: | + The total number of vector memory operations issued. These include most loads, stores and atomic operations and all accesses to generic, global, private and texture memory. - Panel Config: id: 1100 title: Compute Units - Compute Pipeline @@ -258,37 +394,30 @@ Addition: id: 1101 title: Compute Speed-of-Light metrics: - - MFMA IOPs (INT8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GIOP - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - pop: | - ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - MFMA FLOPs (BF16): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - - VALU FLOPs: - value: | - AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) + - MFMA FLOPs (F16): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: | - ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + - MFMA FLOPs (F32): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + pop: | + ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / 
(End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - MFMA FLOPs (F64): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) - - MFMA FLOPs (F16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: | - ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - MFMA FLOPs (F6F4): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP @@ -301,12 +430,19 @@ Addition: peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - - MFMA FLOPs (F32): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) + - MFMA IOPs (INT8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GIOP + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + pop: | + ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + - VALU FLOPs: + value: | + AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) + peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) pop: | - ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) - VALU IOPs: value: | AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - Start_Timestamp))) @@ -318,28 +454,11 @@ Addition: id: 1102 title: Pipeline Statistics metrics: - - MFMA Utilization: - avg: | - AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - min: | - MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - max: | - MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - unit: pct - - VMEM Utilization: - avg: | - AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: | - MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: | - MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + - Branch Utilization: + avg: AVG((((100 
* SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) unit: pct - - VMEM Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_VMEM - MFMA Instruction Cycles: avg: | AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != 0) else None)) @@ -348,48 +467,55 @@ Addition: max: | MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != 0) else None)) unit: cycles/instr + - MFMA Utilization: + avg: | + AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + min: | + MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + max: | + MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) + unit: pct - SMEM Latency: avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) else None)) min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) else None)) max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) else None)) unit: Cycles coll_level: SQ_INST_LEVEL_SMEM - - Branch Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - VALU Co-Issue Efficiency: avg: AVG((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2)) min: MIN((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2)) max: MAX((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2)) unit: pct + - VMEM Latency: + avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) else None)) + min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) else None)) + max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) else None)) + unit: Cycles + coll_level: SQ_INST_LEVEL_VMEM + - VMEM Utilization: + avg: | + AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + min: | + MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + max: | + MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + unit: pct - metric_table: id: 1103 title: Arithmetic Operations metrics: - - F8 OPs: - avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - unit: (OPs + $normUnit) - BF16 OPs: avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) unit: (OPs + $normUnit) - - INT8 OPs: - avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - unit: (OPs + $normUnit) - - IOPs (Total): + - F16 OPs: avg: | - AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + 
(SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom) + AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) min: | - MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom) + MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) max: | - MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom) + MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) unit: (OPs + $normUnit) - F32 OPs: avg: | @@ -399,19 +525,24 @@ Addition: max: | MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) / $denom)) unit: (OPs + $normUnit) - - F16 OPs: + - F64 OPs: avg: | - AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) min: | - MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) max: | - MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) + MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) unit: (OPs + $normUnit) - F6F4 OPs: avg: AVG((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom) min: MIN((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom) max: MAX((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom) unit: (OPs + $normUnit) + - F8 OPs: + avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + unit: (OPs + $normUnit) - FLOPs (Total): avg: | AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) @@ -420,14 +551,120 @@ Addition: max: | MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * 
SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) unit: (OPs + $normUnit) - - F64 OPs: + - INT8 OPs: + avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) + unit: (OPs + $normUnit) + - IOPs (Total): avg: | - AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) + AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom) min: | - MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) + MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom) max: | - MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) + MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom) unit: (OPs + $normUnit) + metric_descriptions: + BF16 OPs: + plain: | + The total number of 16-bit brain floating-point operations executed on either the VALU or MFMA units, per normalization unit. + rst: | + The total number of 16-bit brain floating-point operations executed on either the VALU or MFMA units, per normalization unit. + Branch Utilization: + plain: | + Indicates what percent of the kernel's duration the branch unit was busy executing instructions. Computed as the ratio of the total number of cycles spent by the scheduler issuing branch instructions over the total CU cycles. + rst: | + Indicates what percent of the kernel's duration the branch unit was busy executing instructions. Computed as the ratio of the total number of cycles spent by the scheduler issuing branch instructions over the total CU cycles. + F16 OPs: + plain: | + The total number of 16-bit floating-point operations executed on either the VALU or MFMA units, per normalization unit. + rst: | + The total number of 16-bit floating-point operations executed on either the VALU or MFMA units, per normalization unit. + F32 OPs: + plain: | + The total number of 32-bit floating-point operations executed on either the VALU or MFMA units, per normalization unit. + rst: | + The total number of 32-bit floating-point operations executed on either the VALU or MFMA units, per normalization unit. + F64 OPs: + plain: | + The total number of 64-bit floating-point operations executed on either the VALU or MFMA units, per normalization unit. + rst: | + The total number of 64-bit floating-point operations executed on either the VALU or MFMA units, per normalization unit. + FLOPs (Total): + plain: | + The total number of floating-point operations executed on either the VALU or MFMA units, per normalization unit. + rst: | + The total number of floating-point operations executed on either the VALU or MFMA units, per normalization unit. 
+ INT8 OPs: + plain: | + The total number of 8-bit integer operations executed on either the VALU or MFMA units, per normalization unit. + rst: | + The total number of 8-bit integer operations executed on either the VALU or MFMA units, per normalization unit. + IOPs (Total): + plain: | + The total number of integer operations executed on either the VALU or MFMA units, per normalization unit. + rst: | + The total number of integer operations executed on either the VALU or MFMA units, per normalization unit. + MFMA FLOPs (BF16): + plain: | + The total number of 16-bit brain floating point MFMA operations executed per second. Note: this does not include any 16-bit brain floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical BF16 MFMA operations achievable on the specific accelerator. + rst: | + The total number of 16-bit brain floating point MFMA operations executed per second. Note: this does not include any 16-bit brain floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical BF16 MFMA operations achievable on the specific accelerator. + MFMA FLOPs (F16): + plain: | + The total number of 16-bit floating point MFMA operations executed per second. Note: this does not include any 16-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F16 MFMA operations achievable on the specific accelerator. + rst: | + The total number of 16-bit floating point MFMA operations executed per second. Note: this does not include any 16-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F16 MFMA operations achievable on the specific accelerator. + MFMA FLOPs (F32): + plain: | + The total number of 32-bit floating point MFMA operations executed per second. Note: this does not include any 32-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F32 MFMA operations achievable on the specific accelerator. + rst: | + The total number of 32-bit floating point MFMA operations executed per second. Note: this does not include any 32-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F32 MFMA operations achievable on the specific accelerator. + MFMA FLOPs (F64): + plain: | + The total number of 64-bit floating point MFMA operations executed per second. Note: this does not include any 64-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F64 MFMA operations achievable on the specific accelerator. + rst: | + The total number of 64-bit floating point MFMA operations executed per second. Note: this does not include any 64-bit floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F64 MFMA operations achievable on the specific accelerator. + MFMA IOPs (INT8): + plain: | + The total number of 8-bit integer MFMA operations executed per second. Note: this does not include any 8-bit integer operations from VALU instructions. This is also presented as a percent of the peak theoretical INT8 MFMA operations achievable on the specific accelerator. + rst: | + The total number of 8-bit integer MFMA operations executed per second. Note: this does not include any 8-bit integer operations from VALU instructions. 
This is also presented as a percent of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.
+      MFMA Instruction Cycles:
+        plain: |
+          The average duration of MFMA instructions in this kernel in cycles. Computed as the ratio of the total number of cycles the MFMA unit was busy over the total number of MFMA instructions.
+        rst: |
+          The average duration of MFMA instructions in this kernel in cycles. Computed as the ratio of the total number of cycles the MFMA unit was busy over the total number of MFMA instructions.
+      MFMA Utilization:
+        plain: |
+          Indicates what percent of the kernel's duration the MFMA unit was busy executing instructions. Computed as the ratio of the total number of cycles the MFMA unit was busy over the total CU cycles.
+        rst: |
+          Indicates what percent of the kernel's duration the MFMA unit was busy executing instructions. Computed as the ratio of the total number of cycles the MFMA unit was busy over the total CU cycles.
+      SMEM Latency:
+        plain: |
+          The average number of round-trip cycles (that is, from issue to data return / acknowledgment) required for a SMEM instruction to complete.
+        rst: |
+          The average number of round-trip cycles (that is, from issue to data return / acknowledgment) required for a SMEM instruction to complete.
+      VALU FLOPs:
+        plain: |
+          The total floating-point operations executed per second on the VALU. This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this does not include any floating-point operations from MFMA instructions.
+        rst: |
+          The total floating-point operations executed per second on the VALU. This is also presented as a percent of the peak theoretical FLOPs achievable on the specific accelerator. Note: this does not include any floating-point operations from MFMA instructions.
+      VALU IOPs:
+        plain: |
+          The total integer operations executed per second on the VALU. This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations from MFMA instructions.
+        rst: |
+          The total integer operations executed per second on the VALU. This is also presented as a percent of the peak theoretical IOPs achievable on the specific accelerator. Note: this does not include any integer operations from MFMA instructions.
+      VMEM Latency:
+        plain: |
+          The average number of round-trip cycles (that is, from issue to data return / acknowledgment) required for a VMEM instruction to complete.
+        rst: |
+          The average number of round-trip cycles (that is, from issue to data return / acknowledgment) required for a VMEM instruction to complete.
+      VMEM Utilization:
+        plain: |
+          Indicates what percent of the kernel's duration the VMEM unit was busy executing instructions, including both global/generic and spill/scratch operations (see the VMEM instruction count metrics for more detail). Does not include VALU operations. Computed as the ratio of the total number of cycles spent by the scheduler issuing VMEM instructions over the total CU cycles.
+        rst: |
+          Indicates what percent of the kernel's duration the VMEM unit was busy executing instructions, including both global/generic and spill/scratch operations (see the VMEM instruction count metrics for more detail). Does not include VALU operations. Computed as the ratio of the total number of cycles spent by the scheduler issuing VMEM instructions over the total CU cycles.
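As a cross-check on the speed-of-light arithmetic above: each SQ_INSTS_VALU_MFMA_MOPS_* increment represents 512 operations, timestamps are in nanoseconds (so operations per nanosecond land directly in GFLOP/s), and pop is the achieved rate as a percent of the per-precision peak. A Python sketch with hypothetical numbers; the 2100 MHz clock and 304 CUs are assumptions for illustration, not measured values.

# Hypothetical counters for one dispatch; 8192 ops/cycle/CU is the F8 peak
# factor used in the configs above.
mops_f8 = 2_000_000                     # SQ_INSTS_VALU_MFMA_MOPS_F8
start_ns, end_ns = 0, 1_000_000         # a 1 ms kernel
max_sclk_mhz, cu_per_gpu = 2_100, 304   # $max_sclk, $cu_per_gpu (assumed)

value_gflops = mops_f8 * 512 / (end_ns - start_ns)      # achieved GFLOP/s
peak_gflops = max_sclk_mhz * cu_per_gpu * 8192 / 1000   # theoretical F8 peak
pop_pct = 100 * value_gflops / peak_gflops
print(f"{value_gflops:,.0f} GFLOP/s ({pop_pct:.1f}% of peak)")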
- Panel Config: id: 1200 title: Local Data Share (LDS) @@ -436,16 +673,31 @@ Addition: id: 1202 title: LDS Statistics metrics: - - LDS STORE Bandwidth: - avg: AVG(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - min: MIN(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - max: MAX(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + - LDS ATOMIC: + avg: AVG((SQ_INSTS_LDS_ATOMIC / $denom)) + min: MIN((SQ_INSTS_LDS_ATOMIC / $denom)) + max: MAX((SQ_INSTS_LDS_ATOMIC / $denom)) + unit: (instr + $normUnit) + - LDS ATOMIC Bandwidth: + avg: AVG(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + min: MIN(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + max: MAX(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) units: Gbps + - LDS Command FIFO Full Rate: + avg: AVG((SQ_LDS_CMD_FIFO_FULL / $denom)) + min: MIN((SQ_LDS_CMD_FIFO_FULL / $denom)) + max: MAX((SQ_LDS_CMD_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) - LDS Data FIFO Full Rate: avg: AVG((SQ_LDS_DATA_FIFO_FULL / $denom)) min: MIN((SQ_LDS_DATA_FIFO_FULL / $denom)) max: MAX((SQ_LDS_DATA_FIFO_FULL / $denom)) unit: (Cycles + $normUnit) + - LDS LOAD: + avg: AVG((SQ_INSTS_LDS_LOAD / $denom)) + min: MIN((SQ_INSTS_LDS_LOAD / $denom)) + max: MAX((SQ_INSTS_LDS_LOAD / $denom)) + unit: (instr + $normUnit) - LDS LOAD Bandwidth: avg: AVG(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) min: MIN(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) @@ -456,62 +708,33 @@ Addition: min: MIN((SQ_INSTS_LDS_STORE / $denom)) max: MAX((SQ_INSTS_LDS_STORE / $denom)) unit: (instr + $normUnit) - - LDS ATOMIC: - avg: AVG((SQ_INSTS_LDS_ATOMIC / $denom)) - min: MIN((SQ_INSTS_LDS_ATOMIC / $denom)) - max: MAX((SQ_INSTS_LDS_ATOMIC / $denom)) - unit: (instr + $normUnit) - - LDS Command FIFO Full Rate: - avg: AVG((SQ_LDS_CMD_FIFO_FULL / $denom)) - min: MIN((SQ_LDS_CMD_FIFO_FULL / $denom)) - max: MAX((SQ_LDS_CMD_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) - - LDS LOAD: - avg: AVG((SQ_INSTS_LDS_LOAD / $denom)) - min: MIN((SQ_INSTS_LDS_LOAD / $denom)) - max: MAX((SQ_INSTS_LDS_LOAD / $denom)) - unit: (instr + $normUnit) - - LDS ATOMIC Bandwidth: - avg: AVG(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - min: MIN(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - max: MAX(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + - LDS STORE Bandwidth: + avg: AVG(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + min: MIN(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + max: MAX(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) units: Gbps - Panel Config: id: 1500 title: Address Processing Unit and Data Return Path (TA/TD) metric_tables: - - metric_table: - id: 1504 - title: Vector L1 data-return path or Texture Data (TD) - metrics: - - Workgroup manager → Data-Return Stall: - avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - - Write Ack Instructions: - avg: AVG((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) - min: MIN((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) - max: MAX((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - metric_table: id: 1501 title: 
Busy and stall metrics
        metrics:
-          - Sequencer → TA Data Stall:
-              avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
-              min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
-              max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
+          - Sequencer → TA Address Stall:
+              avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
+              min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
+              max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
               unit: (Cycles + $normUnit)
           - Sequencer → TA Command Stall:
              avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
              min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
              max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
              unit: (Cycles + $normUnit)
-          - Sequencer → TA Address Stall:
-              avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
-              min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
-              max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
+          - Sequencer → TA Data Stall:
+              avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
+              min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
+              max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
               unit: (Cycles + $normUnit)
       - metric_table:
           id: 1502
@@ -527,6 +750,31 @@ Addition:
              min: MIN((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
              max: MAX((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
              unit: (Instructions + $normUnit)
+      - metric_table:
+          id: 1504
+          title: Vector L1 data-return path or Texture Data (TD)
+          metrics:
+            - Workgroup manager → Data-Return Stall:
+                avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
+                min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
+                max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
+                unit: pct
+            - Write Ack Instructions:
+                avg: AVG((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
+                min: MIN((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
+                max: MAX((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
+                unit: (Instructions + $normUnit)
+    metric_descriptions:
+      Workgroup manager → Data-Return Stall:
+        plain: |
+          Percent of the total CU cycles the data-return unit was stalled by the workgroup manager due to initialization of registers as part of launching new workgroups.
+        rst: |
+          Percent of the total CU cycles the data-return unit was stalled by the workgroup manager due to initialization of registers as part of launching new workgroups.
+      Write Ack Instructions:
+        plain: |
+          The total number of write acknowledgements submitted by the data-return unit to the SQ, summed over all compute units on the accelerator, per normalization unit.
+        rst: |
+          The total number of write acknowledgements submitted by the data-return unit to the SQ, summed over all compute units on the accelerator, per normalization unit.
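Two conventions in the stall metrics above are worth making explicit: percentages are normalized by total CU cycles, computed as $GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu, and divisions elsewhere in these configs are guarded with an "if (x != 0) else None" clause so an idle counter yields no value rather than an error. A small Python sketch of the Workgroup manager → Data-Return Stall calculation, using invented counter values:

def td_spi_stall_pct(td_spi_stall_sum, grbm_gui_active_per_xcd, cu_per_gpu):
    # (100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu),
    # guarded the same way the YAML expressions guard their divisions.
    denom = grbm_gui_active_per_xcd * cu_per_gpu
    return 100 * td_spi_stall_sum / denom if denom != 0 else None

print(td_spi_stall_pct(3_200_000, 1_000_000, 104))  # ~3.1% of CU cycles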
- Panel Config: id: 1600 title: Vector L1 Data Cache @@ -538,15 +786,15 @@ Addition: - Stalled on Address: expr: | (((100 * TCP_TCP_TA_ADDR_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None) - - Stalled on Read Return: - expr: | - (((100 * TCP_TCR_RDRET_STALL_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None) - Stalled on Data: expr: | (((100 * TCP_TCP_TA_DATA_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None) - Stalled on Latency FIFO: expr: | (((100 * TCP_LFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None) + - Stalled on Read Return: + expr: | + (((100 * TCP_TCR_RDRET_STALL_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None) - Stalled on Request FIFO: expr: | (((100 * TCP_RFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None) @@ -554,6 +802,16 @@ Addition: id: 1603 title: vL1D cache access metrics metrics: + - Tag RAM 0 Req: + avg: AVG((TCP_TAGRAM0_REQ_sum / $denom)) + min: MIN((TCP_TAGRAM0_REQ_sum / $denom)) + max: MAX((TCP_TAGRAM0_REQ_sum / $denom)) + unit: (Req + $normUnit) + - Tag RAM 1 Req: + avg: AVG((TCP_TAGRAM1_REQ_sum / $denom)) + min: MIN((TCP_TAGRAM1_REQ_sum / $denom)) + max: MAX((TCP_TAGRAM1_REQ_sum / $denom)) + unit: (Req + $normUnit) - Tag RAM 2 Req: avg: AVG((TCP_TAGRAM2_REQ_sum / $denom)) min: MIN((TCP_TAGRAM2_REQ_sum / $denom)) @@ -564,69 +822,59 @@ Addition: min: MIN((TCP_TAGRAM3_REQ_sum / $denom)) max: MAX((TCP_TAGRAM3_REQ_sum / $denom)) unit: (Req + $normUnit) - - Tag RAM 1 Req: - avg: AVG((TCP_TAGRAM1_REQ_sum / $denom)) - min: MIN((TCP_TAGRAM1_REQ_sum / $denom)) - max: MAX((TCP_TAGRAM1_REQ_sum / $denom)) - unit: (Req + $normUnit) - - Tag RAM 0 Req: - avg: AVG((TCP_TAGRAM0_REQ_sum / $denom)) - min: MIN((TCP_TAGRAM0_REQ_sum / $denom)) - max: MAX((TCP_TAGRAM0_REQ_sum / $denom)) - unit: (Req + $normUnit) - metric_table: id: 1605 title: L1 Unified Translation Cache (UTCL1) metrics: - - Misses under Translation Miss: - avg: AVG((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) - units: (Req + $normUnit) - Inflight Req: avg: AVG((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom)) min: MIN((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom)) max: MAX((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom)) units: (Req + $normUnit) + - Misses under Translation Miss: + avg: AVG((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) + units: (Req + $normUnit) - metric_table: id: 1606 title: L1D Addr Translation Stalls metrics: - - Thrashing Stall: - avg: AVG((TCP_UTCL1_THRASHING_STALL_sum / $denom)) - min: MIN((TCP_UTCL1_THRASHING_STALL_sum / $denom)) - max: MAX((TCP_UTCL1_THRASHING_STALL_sum / $denom)) - units: (Cycles + $normUnit) - - Serialization Stall: - avg: AVG((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) - min: MIN((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) - max: MAX((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) + - Cache Full Stall: + avg: AVG((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) + min: MIN((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) + max: MAX((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) units: (Cycles + $normUnit) - Cache Miss Stall: avg: AVG((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) min: MIN((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) max: MAX((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) units: (Cycles + 
$normUnit) + - Latency FIFO Stall: + avg: AVG((TCP_UTCL1_LFIFO_FULL_sum / $denom)) + min: MIN((TCP_UTCL1_LFIFO_FULL_sum / $denom)) + max: MAX((TCP_UTCL1_LFIFO_FULL_sum / $denom)) + units: (Cycles + $normUnit) - Resident Page Full Stall: avg: AVG((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) min: MIN((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) max: MAX((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) units: (Cycles + $normUnit) - - Cache Full Stall: - avg: AVG((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) - min: MIN((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) - max: MAX((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) + - Serialization Stall: + avg: AVG((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) + min: MIN((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) + max: MAX((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) + units: (Cycles + $normUnit) + - Thrashing Stall: + avg: AVG((TCP_UTCL1_THRASHING_STALL_sum / $denom)) + min: MIN((TCP_UTCL1_THRASHING_STALL_sum / $denom)) + max: MAX((TCP_UTCL1_THRASHING_STALL_sum / $denom)) units: (Cycles + $normUnit) - UTCL2 Stall: avg: AVG((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) min: MIN((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) max: MAX((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) units: (Cycles + $normUnit) - - Latency FIFO Stall: - avg: AVG((TCP_UTCL1_LFIFO_FULL_sum / $denom)) - min: MIN((TCP_UTCL1_LFIFO_FULL_sum / $denom)) - max: MAX((TCP_UTCL1_LFIFO_FULL_sum / $denom)) - units: (Cycles + $normUnit) - Panel Config: id: 1700 title: L2 Cache @@ -655,11 +903,6 @@ Addition: id: 1703 title: L2 Cache Accesses metrics: - - Read Bandwidth: - avg: AVG(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - unit: Gbps - Atomic Bandwidth: avg: AVG(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) min: MIN(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) @@ -675,6 +918,11 @@ Addition: min: MIN((TCC_IB_REQ_sum / $denom)) max: MAX((TCC_IB_REQ_sum / $denom)) unit: (Req + $normUnit) + - Read Bandwidth: + avg: AVG(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) + min: MIN(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) + max: MAX(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) + unit: Gbps - Write Bandwidth: avg: AVG(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) min: MIN(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) @@ -683,17 +931,17 @@ Addition: - metric_table: id: 1704 title: L2 Cache Stalls - metrics: - - Stalled on Latency FIFO: - avg: AVG(TCC_LATENCY_FIFO_FULL_sum / $denom) - min: MIN(TCC_LATENCY_FIFO_FULL_sum / $denom) - max: MAX(TCC_LATENCY_FIFO_FULL_sum / $denom) - unit: (Cycles + $normUnit) + metrics: - Input Buffer Stalled on L2: avg: AVG(TCC_IB_STALL_sum / $denom) min: MIN(TCC_IB_STALL_sum / $denom) max: MAX(TCC_IB_STALL_sum / $denom) unit: (Cycles + $normUnit) + - Stalled on Latency FIFO: + avg: AVG(TCC_LATENCY_FIFO_FULL_sum / $denom) + min: MIN(TCC_LATENCY_FIFO_FULL_sum / $denom) + max: MAX(TCC_LATENCY_FIFO_FULL_sum / $denom) + unit: (Cycles + $normUnit) - Stalled on Write Data FIFO: avg: AVG(TCC_SRC_FIFO_FULL_sum / $denom) min: MIN(TCC_SRC_FIFO_FULL_sum / $denom) @@ -713,15 +961,15 @@ Addition: max: | MAX(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) unit: pct - - Write - HBM Stall: - type: HBM Stall - transaction: Write 
+ - Read - Infinity Fabric Stall: + type: Infinity Fabric™ Stall + transaction: Read avg: | - AVG(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + AVG(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) min: | - MIN(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + MIN(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) max: | - MAX(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + MAX(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) unit: pct - Read - PCIe Stall: type: PCIe Stall @@ -733,6 +981,16 @@ Addition: max: | MAX(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) unit: pct + - Write - HBM Stall: + type: HBM Stall + transaction: Write + avg: | + AVG(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: | + MIN(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: | + MAX(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct - Write - Infinity Fabric Stall: type: Infinity Fabric™ Stall transaction: Write @@ -753,16 +1011,6 @@ Addition: max: | MAX(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) unit: pct - - Read - Infinity Fabric Stall: - type: Infinity Fabric™ Stall - transaction: Read - avg: | - AVG(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - min: | - MIN(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - max: | - MAX(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - unit: pct - metric_table: id: 1706 title: L2 - Fabric interface detailed metrics @@ -772,21 +1020,26 @@ Addition: min: MIN((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom)) max: MAX((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom)) unit: (Req + $normUnit) + - Atomic Bandwidth - HBM: + avg: AVG(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + min: MIN(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + max: MAX(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + unit: Gbps + - Atomic Bandwidth - Infinity Fabric™: + avg: AVG(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + min: MIN(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + max: MAX(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + unit: Gbps - Atomic Bandwidth - PCIe: avg: AVG(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) min: MIN(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) max: MAX(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) unit: Gbps - - Write Bandwidth - PCIe: - avg: AVG(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - unit: Gbps - - Write Bandwidth - HBM: - avg: AVG(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 
32/ (End_Timestamp - Start_Timestamp))
-          max: MAX(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-          unit: Gbps
+      - Read (128B):
+          avg: AVG((TCC_EA0_RDREQ_128B_sum / $denom))
+          min: MIN((TCC_EA0_RDREQ_128B_sum / $denom))
+          max: MAX((TCC_EA0_RDREQ_128B_sum / $denom))
+          unit: (Req + $normUnit)
       - Read Bandwidth - HBM:
          avg: AVG(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
          min: MIN(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
@@ -797,31 +1050,127 @@ Addition:
          min: MIN(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
          max: MAX(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
          unit: Gbps
-      - Atomic Bandwidth - Infinity Fabric™:
-          avg: AVG(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-          min: MIN(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-          max: MAX(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-          unit: Gbps
-      - Atomic Bandwidth - HBM:
-          avg: AVG(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-          min: MIN(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-          max: MAX(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-          unit: Gbps
       - Read Bandwidth - PCIe:
          avg: AVG(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
          min: MIN(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
          max: MAX(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
          unit: Gbps
+      - Write Bandwidth - HBM:
+          avg: AVG(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          min: MIN(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          max: MAX(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          unit: Gbps
       - Write Bandwidth - Infinity Fabric™:
          avg: AVG(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
          min: MIN(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
          max: MAX(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
          unit: Gbps
-      - Read (128B):
-          avg: AVG((TCC_EA0_RDREQ_128B_sum / $denom))
-          min: MIN((TCC_EA0_RDREQ_128B_sum / $denom))
-          max: MAX((TCC_EA0_RDREQ_128B_sum / $denom))
-          unit: (Req + $normUnit)
+      - Write Bandwidth - PCIe:
+          avg: AVG(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          min: MIN(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          max: MAX(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          unit: Gbps
+    metric_descriptions:
+      Atomic Bandwidth:
+        plain: |
+          Total number of bytes looked up in the L2 cache for atomic requests, divided by total duration.
+        rst: |
+          Total number of bytes looked up in the L2 cache for atomic requests, divided by total duration.
+      Atomic Bandwidth - HBM:
+        plain: |
+          Total number of bytes of HBM traffic due to L2 atomic requests, divided by total duration.
+        rst: |
+          Total number of bytes of HBM traffic due to L2 atomic requests, divided by total duration.
+      Atomic Bandwidth - Infinity Fabric™:
+        plain: |
+          Total number of bytes of Infinity Fabric traffic due to L2 atomic requests, divided by total duration.
+        rst: |
+          Total number of bytes of Infinity Fabric traffic due to L2 atomic requests, divided by total duration.
+      Atomic Bandwidth - PCIe:
+        plain: |
+          Total number of bytes of PCIe traffic due to L2 atomic requests, divided by total duration.
+ rst: | + Total number of bytes due to L2 atomic requests due to PCIe traffic, divided by total duration. + Read - HBM Stall: + plain: | + The number of cycles the L2-Fabric interface was stalled on read requests to the accelerator's local HBM as a percent of the total active L2 cycles. + rst: | + The number of cycles the L2-Fabric interface was stalled on read requests to the accelerator's local HBM as a percent of the total active L2 cycles. + Read - Infinity Fabric Stall: + plain: | + The number of cycles the L2-Fabric interface was stalled on read requests to remote Infinity Fabric connected accelerators or CPUs as a percent of the total active L2 cycles. + rst: | + The number of cycles the L2-Fabric interface was stalled on read requests to remote Infinity Fabric connected accelerators or CPUs as a percent of the total active L2 cycles. + Read - PCIe Stall: + plain: | + The number of cycles the L2-Fabric interface was stalled on read requests to remote PCIe connected accelerators or CPUs as a percent of the total active L2 cycles. + rst: | + The number of cycles the L2-Fabric interface was stalled on read requests to remote PCIe connected accelerators or CPUs as a percent of the total active L2 cycles. + Read Bandwidth: + plain: | + Total number of bytes looked up in the L2 cache for read requests, divided by total duration. + rst: | + Total number of bytes looked up in the L2 cache for read requests, divided by total duration. + Read Bandwidth - HBM: + plain: | + Total number of bytes due to L2 read requests due to HBM traffic, divided by total duration. + rst: | + Total number of bytes due to L2 read requests due to HBM traffic, divided by total duration. + Read Bandwidth - Infinity Fabric™: + plain: | + Total number of bytes due to L2 read requests due to Infinity Fabric traffic, divided by total duration. + rst: | + Total number of bytes due to L2 read requests due to Infinity Fabric traffic, divided by total duration. + Read Bandwidth - PCIe: + plain: | + Total number of bytes due to L2 read requests due to PCIe traffic, divided by total duration. + rst: | + Total number of bytes due to L2 read requests due to PCIe traffic, divided by total duration. + Read Stall: + plain: | + The ratio of the total number of cycles the L2-Fabric interface was stalled on a read request to any destination (local HBM, remote PCIe® connected accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU) over the total active L2 cycles. + rst: | + The ratio of the total number of cycles the L2-Fabric interface was stalled on a read request to any destination (local HBM, remote PCIe® connected accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU) over the total active L2 cycles. + Write - HBM Stall: + plain: | + The number of cycles the L2-Fabric interface was stalled on write or atomic requests to the accelerator's local HBM as a percent of the total active L2 cycles. + rst: | + The number of cycles the L2-Fabric interface was stalled on write or atomic requests to the accelerator's local HBM as a percent of the total active L2 cycles. + Write - Infinity Fabric Stall: + plain: | + The number of cycles the L2-Fabric interface was stalled on write or atomic requests to remote Infinity Fabric connected accelerators or CPUs as a percent of the total active L2 cycles. + rst: | + The number of cycles the L2-Fabric interface was stalled on write or atomic requests to remote Infinity Fabric connected accelerators or CPUs as a percent of the total active L2 cycles.
+ Write - PCIe Stall: + plain: | + The number of cycles the L2-Fabric interface was stalled on write or atomic requests to remote PCIe connected accelerators or CPUs as a percent of the total active L2 cycles. + rst: | + The number of cycles the L2-Fabric interface was stalled on write or atomic requests to remote PCIe connected accelerators or CPUs as a percent of the total active L2 cycles. + Write Bandwidth: + plain: | + Total number of bytes looked up in the L2 cache for write requests, divided by total duration. + rst: | + Total number of bytes looked up in the L2 cache for write requests, divided by total duration. + Write Bandwidth - HBM: + plain: | + Total number of bytes due to L2 write requests due to HBM traffic, divided by total duration. + rst: | + Total number of bytes due to L2 write requests due to HBM traffic, divided by total duration. + Write Bandwidth - Infinity Fabric™: + plain: | + Total number of bytes due to L2 write requests due to Infinity Fabric traffic, divided by total duration. + rst: | + Total number of bytes due to L2 write requests due to Infinity Fabric traffic, divided by total duration. + Write Bandwidth - PCIe: + plain: | + Total number of bytes due to L2 write requests due to PCIe traffic, divided by total duration. + rst: | + Total number of bytes due to L2 write requests due to PCIe traffic, divided by total duration. + Write Stall: + plain: | + The ratio of the total number of cycles the L2-Fabric interface was stalled on a write or atomic request to any destination (local HBM, remote PCIe® connected accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU) over the total active L2 cycles. + rst: | + The ratio of the total number of cycles the L2-Fabric interface was stalled on a write or atomic request to any destination (local HBM, remote PCIe® connected accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU) over the total active L2 cycles.
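Every stall percentage defined above follows the same guarded pattern: scale a per-destination credit-stall cycle counter by 100, divide by TCC_BUSY_sum, and yield None when a dispatch had no active L2 cycles, so that idle dispatches drop out of the AVG/MIN/MAX aggregates instead of being counted as 0%. Below is a minimal sketch of that evaluation over per-dispatch rows, assuming the counter values sit in a pandas DataFrame; the frame and its layout are illustrative, not the tool's internal schema.

import pandas as pd

# Per-dispatch counter values; the middle dispatch has TCC_BUSY_sum == 0
# to exercise the `... if (TCC_BUSY_sum != 0) else None` guard.
df = pd.DataFrame({
    "TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum": [120, 0, 300],
    "TCC_BUSY_sum": [1000, 0, 1500],
})

def stall_pct(row):
    # Guarded division: dispatches with no active L2 cycles yield None.
    if row["TCC_BUSY_sum"] == 0:
        return None
    return 100 * row["TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum"] / row["TCC_BUSY_sum"]

pct = df.apply(stall_pct, axis=1).dropna()  # None rows drop out of the stats
print(pct.mean(), pct.min(), pct.max())     # the avg / min / max columns

Returning None rather than 0 matters for the aggregates: a dispatch that never touched the L2-Fabric interface would otherwise drag the average stall rate toward zero.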
Deletion: [] @@ -835,68 +1184,68 @@ Modification: id: 201 title: System Speed-of-Light metrics: - - VALU FLOPs: + - Branch Utilization: + pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + - L2 Cache BW: + pop: | + ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) + value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) + peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) + - L2-Fabric Read BW: + pop: | + ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) value: | - AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) + AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)) + - MFMA FLOPs (BF16): pop: | - ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) + ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - MFMA FLOPs (F16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + - MFMA FLOPs (F32): + pop: | + ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) - MFMA FLOPs (F64): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) - - VMEM Utilization: + - MFMA IOPs (Int8): + pop: | + ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + - MFMA Utilization: + pop: | + 
AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) * 4))) value: | - AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) + AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) * 4))) + - VALU FLOPs: pop: | - AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - - VALU IOPs: + ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) value: | - AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - Start_Timestamp))) + AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) + - VALU IOPs: pop: | ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) - - Branch Utilization: - value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - - L2 Cache BW: - value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) - pop: | - ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) - peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) - - MFMA Utilization: value: | - AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) * 4))) + AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - Start_Timestamp))) + - VMEM Utilization: pop: | - AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) * 4))) - - L2-Fabric Read BW: + AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) value: | - AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)) - pop: | - ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) + AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - vL1D Cache BW: - value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) pop: | ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) + value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) - - MFMA IOPs (Int8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) - pop: | - ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / 
(End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - - MFMA FLOPs (F32): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) - pop: | - ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - - MFMA FLOPs (BF16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) - pop: | - ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - Panel Config: id: 300 title: Memory Chart @@ -905,10 +1254,10 @@ Modification: id: 301 title: Memory Chart metrics: - - Wavefronts: - value: ROUND(AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE), 0) - MFMA: value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0) + - Wavefronts: + value: ROUND(AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE), 0) - Workgroups: value: | ROUND(AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS), 0) @@ -923,15 +1272,15 @@ Modification: - AI HBM: value: | ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) ) + - AI L1: + value: | + ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) ) - AI L2: value: | ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64 ) ) - Performance (GFLOPs): value: | ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * 
SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / (SUM(End_Timestamp - Start_Timestamp) / 1e9) ) / 1e9 - - AI L1: - value: | - ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) ) - Panel Config: id: 600 title: Workgroup Manager (SPI) @@ -940,13 +1289,6 @@ Modification: id: 601 title: Workgroup manager utilizations metrics: - - SGPR Writes: - max: | - MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - avg: | - AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - min: | - MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - Dispatched Wavefronts: max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) @@ -958,6 +1300,13 @@ Modification: AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) min: | MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) + - SGPR Writes: + max: | + MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) + avg: | + AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) + min: | + MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - Scheduler-Pipe Utilization: max: | MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) @@ -999,11 +1348,15 @@ Modification: id: 1603 title: vL1D cache access metrics metrics: - - L1-L2 Write Latency: - max: MAX((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) - avg: AVG((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) - min: MIN((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) + - Cache BW: + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) + min: 
MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) + - L1 Access Latency: + max: MAX((TCP_TCP_LATENCY_sum / $denom)) unit: (Cycles + $normUnit) + avg: AVG((TCP_TCP_LATENCY_sum / $denom)) + min: MIN((TCP_TCP_LATENCY_sum / $denom)) - L1-L2 BW: max: | MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp))) @@ -1011,20 +1364,16 @@ Modification: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp))) min: | MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp))) - - L1 Access Latency: - max: MAX((TCP_TCP_LATENCY_sum / $denom)) - avg: AVG((TCP_TCP_LATENCY_sum / $denom)) - min: MIN((TCP_TCP_LATENCY_sum / $denom)) - unit: (Cycles + $normUnit) - L1-L2 Read Latency: max: MAX((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) + unit: (Cycles + $normUnit) avg: AVG((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) min: MIN((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) + - L1-L2 Write Latency: + max: MAX((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) unit: (Cycles + $normUnit) - - Cache BW: - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) + avg: AVG((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) + min: MIN((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) - Panel Config: id: 1700 title: L2 Cache @@ -1043,21 +1392,14 @@ Modification: id: 1702 title: L2-Fabric interface metrics metrics: - - Remote Write and Atomic Traffic: - max: | - MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - avg: | - AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: | - MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - - Write and Atomic BW: + - Read BW: max: | - MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) + MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) + unit: Gbps avg: | - AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) + AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) min: | - MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) - unit: Gbps + MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) - Remote Read Traffic: max: | MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) @@ -1065,14 +1407,21 @@ Modification: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) min: | MIN((100 * (MAX((TCC_EA0_RDREQ_sum - 
TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - - Read BW: + - Remote Write and Atomic Traffic: max: | - MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) + MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) avg: | - AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) + AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) min: | - MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) + MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + - Write and Atomic BW: + max: | + MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) unit: Gbps + avg: | + AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) + min: | + MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) - metric_table: id: 1703 title: L2 Cache Accesses @@ -1115,14 +1464,14 @@ Modification: title: L2-Fabric Read Stall (Cycles per normUnit) metrics: - ::_1: - ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom)) ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) + ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom)) ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1]) / $denom)) - metric_table: id: 1810 title: L2-Fabric Write and Atomic Stall (Cycles per normUnit) metrics: - ::_1: - ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) - ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) / $denom)) ea write stall - if: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1]) / $denom)) + ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) / $denom)) + ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml index 34eb6972cac..955a461c736 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml @@ -64,7 +64,7 @@ Panel Config: peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) - Active CUs: + Active CUs (deprecated): value: $numActiveCUs unit: CUs peak: $cu_per_gpu @@ -217,6 +217,11 @@ Panel Config: peak: None pop: None coll_level: SQ_IFETCH_LEVEL + CU Utilization: + value: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + peak: 100 + pop: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) metrics_description: VALU 
FLOPs: >- The total floating-point operations executed per second on the VALU. @@ -253,8 +258,8 @@ Panel Config: Note: this does not include any 8-bit integer operations from VALU instructions. This is also presented as a percent of the peak theoretical INT8 MFMA operations achievable on the specific accelerator. - Active CUs: Total number of active compute units (CUs) on the accelerator during - the kernel execution. + Active CUs (deprecated): Total number of active compute units (CUs) on the accelerator during + the kernel execution. (Deprecated - See CU Utilization instead) SALU Utilization: Indicates what percent of the kernel's duration the SALU was busy executing instructions. Computed as the ratio of the total number of cycles spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles. @@ -339,3 +344,7 @@ Panel Config: of all L1I requests. L1I Fetch Latency: The average number of cycles spent to fetch instructions to a CU. + CU Utilization: The percent of total SIMD cycles in the kernel + where any SIMD on a CU was actively doing any work, summed + over all CUs. Low values (less than 100%) indicate that the accelerator was + not fully saturated by the kernel, or a potential load-imbalance issue. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml index 8349f4f8fd8..b393bdf3d51 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml @@ -33,7 +33,7 @@ Panel Config: value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) BR: value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) - Active CUs: + Active CUs (deprecated): value: $numActiveCUs Num CUs: value: $cu_per_gpu @@ -167,8 +167,8 @@ Panel Config: GWS: Total number of GDS (global data sync) instructions issued per normalization unit. BR: Total number of BRANCH instructions issued per normalization unit. - Active CUs: Total number of active compute units (CUs) on the accelerator during - the kernel execution. + Active CUs (deprecated): Total number of active compute units (CUs) on the accelerator during + the kernel execution. (Deprecated - See CU Utilization instead) Num CUs: Total number of compute units (CUs) on the accelerator. 
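The CU Utilization metric added above, which supersedes the deprecated Active CUs value, boils down to one ratio: busy CU cycles over kernel-active cycles scaled by the CU count. A rough sketch with made-up numbers follows; in the tool itself the $-prefixed terms are resolved from the SoC specification and the counter comes from the profile.

def cu_utilization(sq_busy_cu_cycles: float,
                   grbm_gui_active_per_xcd: float,
                   cu_per_gpu: int) -> float:
    # 100 * SQ_BUSY_CU_CYCLES / (GRBM_GUI_ACTIVE_PER_XCD * cu_per_gpu):
    # the share of kernel-active cycles, summed over all CUs, in which
    # any SIMD on a CU was doing work.
    return 100.0 * sq_busy_cu_cycles / (grbm_gui_active_per_xcd * cu_per_gpu)

# e.g. 104 CUs, 1.0e6 GPU-active cycles per XCD, 7.8e7 busy CU-cycles:
print(f"{cu_utilization(7.8e7, 1.0e6, 104):.1f}%")  # -> 75.0%

Anything below 100% signals that the kernel did not fully saturate the accelerator, or that work was imbalanced across CUs, which is a finer diagnostic than the raw Active CUs count it replaces.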
VGPR: >- The number of architected vector general-purpose registers allocated diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/config_delta/gfx950_diff.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/config_delta/gfx950_diff.yaml index 72d6adce5f5..88605b3020b 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/config_delta/gfx950_diff.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/config_delta/gfx950_diff.yaml @@ -8,18 +8,24 @@ Addition: id: 201 title: System Speed-of-Light metrics: - - MFMA FLOPs (F8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - pop: | - ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - MFMA FLOPs (F6F4): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP/s peak: ((($max_sclk * $cu_per_gpu) * 16834) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 16834) / 1000)) + - MFMA FLOPs (F8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + pop: | + ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + metric_descriptions: + MFMA FLOPs (F8): + plain: | + The total number of 8-bit floating point MFMA operations executed per second. This does not include any floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F8 MFMA operations achievable on the specific accelerator. It is supported on AMD Instinct MI300 series and later only. + rst: | + The total number of 8-bit floating point MFMA operations executed per second. This does not include any floating point operations from VALU instructions. This is also presented as a percent of the peak theoretical F8 MFMA operations achievable on the specific accelerator. It is supported on AMD Instinct MI300 series and later only. - Panel Config: id: 400 title: Roofline @@ -28,16 +34,27 @@ Addition: id: 401 title: Roofline Performance Rates metrics: - - MFMA FLOPs (F8): - value: | - AVG((((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMAF8Flops_empirical_peak - MFMA FLOPs (F6F4): value: | AVG((((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) unit: GFLOP/s peak: $MFMA_FLOPs_F6F4_empirical_peak + - MFMA FLOPs (F8): + value: | + AVG((((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) + unit: GFLOP/s + peak: $MFMAF8Flops_empirical_peak + metric_descriptions: + MFMA FLOPs (F6F4): + plain: | + The total number of 4-bit and 6-bit floating point MFMA operations executed per second. Note: this does not include any floating point operations from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. It is supported on AMD Instinct MI350 series (gfx950) and later only.
+ rst: | + The total number of 4-bit and 6-bit floating point MFMA operations executed per second. Note: this does not include any floating point operations from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. It is supported on AMD Instinct MI350 series (gfx950) and later only. + MFMA FLOPs (F8): + plain: | + The total number of 8-bit floating point MFMA operations executed per second. This does not include any floating point operations from VALU instructions. The peak empirically measured F8 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. It is supported on AMD Instinct MI300 series and later only. + rst: | + The total number of 8-bit floating point MFMA operations executed per second. This does not include any floating point operations from VALU instructions. The peak empirically measured F8 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. It is supported on AMD Instinct MI300 series and later only. - Panel Config: id: 500 title: Command Processor (CPC/CPF) @@ -46,6 +63,11 @@ Addition: id: 502 title: Command processor packet processor (CPC) metrics: + - CPC ADC Utilization: + avg: AVG((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None) + min: MIN((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None) + max: MAX((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None) + unit: pct - CPC CANE Stall Rate: avg: AVG((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None) min: MIN((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None) @@ -59,11 +81,6 @@ Addition: max: | MAX((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None) unit: pct - - CPC ADC Utilization: - avg: AVG((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None) - min: MIN((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None) - max: MAX((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None) - unit: pct - Panel Config: id: 600 title: Workgroup Manager (SPI) @@ -72,14 +89,6 @@ Addition: id: 601 title: Workgroup manager utilizations metrics: - - Scheduler-Pipe Wave Utilization: - avg: | - AVG(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - min: | - MIN(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - max: | - MAX(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - unit: Pct - Schedule-Pipe Wave Occupancy: avg: | AVG(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + SPI_CSQ_P3_OCCUPANCY) min: | MIN(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + SPI_CSQ_P3_OCCUPANCY) max: | MAX(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + SPI_CSQ_P3_OCCUPANCY) unit: Wave + - Scheduler-Pipe Wave Utilization: + avg: | + AVG(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) + min: | + MIN(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) + max: | + MAX(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) + unit: Pct - metric_table: id: 602 title: Workgroup Manager - Resource Allocation @@ -117,16 +134,22 @@ Addition: id: 1004 title: MFMA Arithmetic Instruction Mix metrics: - - MFMA-F8: - avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom)) - min:
MIN((SQ_INSTS_VALU_MFMA_F8 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom)) - unit: (instr + $normUnit) - MFMA-F6F4: avg: AVG((SQ_INSTS_VALU_MFMA_F6F4 / $denom)) min: MIN((SQ_INSTS_VALU_MFMA_F6F4 / $denom)) max: MAX((SQ_INSTS_VALU_MFMA_F6F4 / $denom)) unit: (instr + $normUnit) + - MFMA-F8: + avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom)) + min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom)) + max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom)) + unit: (instr + $normUnit) + metric_descriptions: + MFMA-F8: + plain: | + The total number of 8-bit floating point MFMA instructions issued per normalization unit. This is supported in AMD Instinct MI300 series and later only. + rst: | + The total number of 8-bit floating point MFMA instructions issued per normalization unit. This is supported in AMD Instinct MI300 series and later only. - Panel Config: id: 1100 title: Compute Units - Compute Pipeline @@ -135,18 +158,18 @@ Addition: id: 1101 title: Compute Speed-of-Light metrics: - - MFMA FLOPs (F8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - pop: | - ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - MFMA FLOPs (F6F4): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP peak: ((($max_sclk * $cu_per_gpu) * 16834) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 16834) / 1000)) + - MFMA FLOPs (F8): + value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) + unit: GFLOP + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + pop: | + ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - metric_table: id: 1102 title: Pipeline Statistics @@ -160,16 +183,16 @@ Addition: id: 1103 title: Arithmetic Operations metrics: - - F8 OPs: - avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - unit: (OPs + $normUnit) - F6F4 OPs: avg: AVG((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom) min: MIN((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom) max: MAX((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom) unit: (OPs + $normUnit) + - F8 OPs: + avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) + unit: (OPs + $normUnit) - Panel Config: id: 1200 title: Local Data Share (LDS) @@ -178,73 +201,79 @@ Addition: id: 1202 title: LDS Statistics metrics: - - LDS LOAD: - avg: AVG((SQ_INSTS_LDS_LOAD / $denom)) - min: MIN((SQ_INSTS_LDS_LOAD / $denom)) - max: MAX((SQ_INSTS_LDS_LOAD / $denom)) + - LDS ATOMIC: + avg: AVG((SQ_INSTS_LDS_ATOMIC / $denom)) + min: MIN((SQ_INSTS_LDS_ATOMIC / $denom)) + max: MAX((SQ_INSTS_LDS_ATOMIC / $denom)) unit: (instr + $normUnit) - - LDS LOAD Bandwidth: - avg: AVG(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - min: MIN(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - max: MAX(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + - LDS ATOMIC Bandwidth: + avg: AVG(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + min: MIN(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / 
(End_Timestamp - Start_Timestamp)) + max: MAX(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) units: Gbps - LDS Command FIFO Full Rate: avg: AVG((SQ_LDS_CMD_FIFO_FULL / $denom)) min: MIN((SQ_LDS_CMD_FIFO_FULL / $denom)) max: MAX((SQ_LDS_CMD_FIFO_FULL / $denom)) unit: (Cycles + $normUnit) - - LDS ATOMIC Bandwidth: - avg: AVG(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - min: MIN(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - max: MAX(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - units: Gbps - - LDS ATOMIC: - avg: AVG((SQ_INSTS_LDS_ATOMIC / $denom)) - min: MIN((SQ_INSTS_LDS_ATOMIC / $denom)) - max: MAX((SQ_INSTS_LDS_ATOMIC / $denom)) - unit: (instr + $normUnit) - - LDS STORE Bandwidth: - avg: AVG(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - min: MIN(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - max: MAX(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - units: Gbps - LDS Data FIFO Full Rate: avg: AVG((SQ_LDS_DATA_FIFO_FULL / $denom)) min: MIN((SQ_LDS_DATA_FIFO_FULL / $denom)) max: MAX((SQ_LDS_DATA_FIFO_FULL / $denom)) unit: (Cycles + $normUnit) + - LDS LOAD: + avg: AVG((SQ_INSTS_LDS_LOAD / $denom)) + min: MIN((SQ_INSTS_LDS_LOAD / $denom)) + max: MAX((SQ_INSTS_LDS_LOAD / $denom)) + unit: (instr + $normUnit) + - LDS LOAD Bandwidth: + avg: AVG(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + min: MIN(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + max: MAX(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + units: Gbps - LDS STORE: avg: AVG((SQ_INSTS_LDS_STORE / $denom)) min: MIN((SQ_INSTS_LDS_STORE / $denom)) max: MAX((SQ_INSTS_LDS_STORE / $denom)) unit: (instr + $normUnit) + - LDS STORE Bandwidth: + avg: AVG(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + min: MIN(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + max: MAX(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + units: Gbps - Panel Config: id: 1500 title: Address Processing Unit and Data Return Path (TA/TD) metric_tables: - - metric_table: - id: 1504 - title: Vector L1 data-return path or Texture Data (TD) - metrics: - - Write Ack Instructions: - avg: AVG((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) - min: MIN((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) - max: MAX((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - metric_table: id: 1502 title: Instruction counts metrics: + - Global/Generic Read Instructions for LDS: + avg: AVG((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) - Spill/Stack Read Instructions for LDS: avg: AVG((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - - Global/Generic Read Instructions for LDS: - avg: AVG((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) + - metric_table: + id: 1504 + title: Vector L1 data-return path or Texture Data (TD) + metrics: + - Write Ack Instructions: + avg: AVG((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) + min: MIN((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) + max: 
MAX((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) unit: (Instructions + $normUnit) + metric_descriptions: + Write Ack Instructions: + plain: | + The total number of write acknowledgements submitted by data-return unit to SQ, summed over all compute units on the accelerator, per normalization unit. + rst: | + The total number of write acknowledgements submitted by data-return unit to SQ, summed over all compute units on the accelerator, per normalization unit. - Panel Config: id: 1600 title: Vector L1 Data Cache @@ -259,12 +288,12 @@ Addition: - Stalled on Data: expr: | (((100 * TCP_TCP_TA_DATA_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None) - - Stalled on Read Return: - expr: | - (((100 * TCP_TCR_RDRET_STALL_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None) - Stalled on Latency FIFO: expr: | (((100 * TCP_LFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None) + - Stalled on Read Return: + expr: | + (((100 * TCP_TCR_RDRET_STALL_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None) - Stalled on Request FIFO: expr: | (((100 * TCP_RFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None) @@ -277,11 +306,6 @@ Addition: min: MIN((TCP_TAGRAM0_REQ_sum / $denom)) max: MAX((TCP_TAGRAM0_REQ_sum / $denom)) unit: (Req + $normUnit) - - Tag RAM 3 Req: - avg: AVG((TCP_TAGRAM3_REQ_sum / $denom)) - min: MIN((TCP_TAGRAM3_REQ_sum / $denom)) - max: MAX((TCP_TAGRAM3_REQ_sum / $denom)) - unit: (Req + $normUnit) - Tag RAM 1 Req: avg: AVG((TCP_TAGRAM1_REQ_sum / $denom)) min: MIN((TCP_TAGRAM1_REQ_sum / $denom)) @@ -292,6 +316,11 @@ Addition: min: MIN((TCP_TAGRAM2_REQ_sum / $denom)) max: MAX((TCP_TAGRAM2_REQ_sum / $denom)) unit: (Req + $normUnit) + - Tag RAM 3 Req: + avg: AVG((TCP_TAGRAM3_REQ_sum / $denom)) + min: MIN((TCP_TAGRAM3_REQ_sum / $denom)) + max: MAX((TCP_TAGRAM3_REQ_sum / $denom)) + unit: (Req + $normUnit) - metric_table: id: 1605 title: L1 Unified Translation Cache (UTCL1) @@ -310,35 +339,35 @@ Addition: id: 1606 title: L1D Addr Translation Stalls metrics: - - Resident Page Full Stall: - avg: AVG((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) - min: MIN((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) - max: MAX((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) + - Cache Full Stall: + avg: AVG((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) + min: MIN((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) + max: MAX((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) units: (Cycles + $normUnit) - Cache Miss Stall: avg: AVG((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) min: MIN((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) max: MAX((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) units: (Cycles + $normUnit) - - Thrashing Stall: - avg: AVG((TCP_UTCL1_THRASHING_STALL_sum / $denom)) - min: MIN((TCP_UTCL1_THRASHING_STALL_sum / $denom)) - max: MAX((TCP_UTCL1_THRASHING_STALL_sum / $denom)) + - Latency FIFO Stall: + avg: AVG((TCP_UTCL1_LFIFO_FULL_sum / $denom)) + min: MIN((TCP_UTCL1_LFIFO_FULL_sum / $denom)) + max: MAX((TCP_UTCL1_LFIFO_FULL_sum / $denom)) + units: (Cycles + $normUnit) + - Resident Page Full Stall: + avg: AVG((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) + min: MIN((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) + max: MAX((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) units: (Cycles + $normUnit) - Serialization Stall: avg: AVG((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) min: MIN((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) max: MAX((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) units: (Cycles + $normUnit) - - Latency FIFO Stall: - avg: 
AVG((TCP_UTCL1_LFIFO_FULL_sum / $denom)) - min: MIN((TCP_UTCL1_LFIFO_FULL_sum / $denom)) - max: MAX((TCP_UTCL1_LFIFO_FULL_sum / $denom)) - units: (Cycles + $normUnit) - - Cache Full Stall: - avg: AVG((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) - min: MIN((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) - max: MAX((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) + - Thrashing Stall: + avg: AVG((TCP_UTCL1_THRASHING_STALL_sum / $denom)) + min: MIN((TCP_UTCL1_THRASHING_STALL_sum / $denom)) + max: MAX((TCP_UTCL1_THRASHING_STALL_sum / $denom)) units: (Cycles + $normUnit) - UTCL2 Stall: avg: AVG((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) @@ -373,40 +402,35 @@ Addition: id: 1703 title: L2 Cache Accesses metrics: - - Read Bandwidth: - avg: AVG(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) + - Atomic Bandwidth: + avg: AVG(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) + min: MIN(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) + max: MAX(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) unit: Gbps - Bypasss Req: avg: AVG((TCC_BYPASS_REQ_sum / $denom)) min: MIN((TCC_BYPASS_REQ_sum / $denom)) max: MAX((TCC_BYPASS_REQ_sum / $denom)) unit: (Req + $normUnit) - - Write Bandwidth: - avg: AVG(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - unit: Gbps - - Atomic Bandwidth: - avg: AVG(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - unit: Gbps - Input Buffer Req: avg: AVG((TCC_IB_REQ_sum / $denom)) min: MIN((TCC_IB_REQ_sum / $denom)) max: MAX((TCC_IB_REQ_sum / $denom)) unit: (Req + $normUnit) + - Read Bandwidth: + avg: AVG(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) + min: MIN(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) + max: MAX(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) + unit: Gbps + - Write Bandwidth: + avg: AVG(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) + min: MIN(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) + max: MAX(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) + unit: Gbps - metric_table: id: 1704 title: L2 Cache Stalls metrics: - - Stalled on Write Data FIFO: - avg: AVG(TCC_SRC_FIFO_FULL_sum / $denom) - min: MIN(TCC_SRC_FIFO_FULL_sum / $denom) - max: MAX(TCC_SRC_FIFO_FULL_sum / $denom) - unit: (Cycles + $normUnit) - Input Buffer Stalled on L2: avg: AVG(TCC_IB_STALL_sum / $denom) min: MIN(TCC_IB_STALL_sum / $denom) @@ -417,6 +441,11 @@ Addition: min: MIN(TCC_LATENCY_FIFO_FULL_sum / $denom) max: MAX(TCC_LATENCY_FIFO_FULL_sum / $denom) unit: (Cycles + $normUnit) + - Stalled on Write Data FIFO: + avg: AVG(TCC_SRC_FIFO_FULL_sum / $denom) + min: MIN(TCC_SRC_FIFO_FULL_sum / $denom) + max: MAX(TCC_SRC_FIFO_FULL_sum / $denom) + unit: (Cycles + $normUnit) - metric_table: id: 1705 title: L2 - Fabric Interface stalls @@ -431,25 +460,25 @@ Addition: max: | MAX(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) unit: pct - - Write - PCIe Stall: - type: PCIe Stall - transaction: Write + - Read - Infinity Fabric Stall: + 
type: Infinity Fabric™ Stall + transaction: Read avg: | - AVG(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + AVG(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) min: | - MIN(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + MIN(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) max: | - MAX(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + MAX(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) unit: pct - - Write - Infinity Fabric Stall: - type: Infinity Fabric™ Stall - transaction: Write - avg: | - AVG(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + - Read - PCIe Stall: + type: PCIe Stall + transaction: Read + avg: | + AVG(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) min: | - MIN(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + MIN(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) max: | - MAX(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + MAX(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) unit: pct - Write - HBM Stall: type: HBM Stall @@ -461,54 +490,59 @@ Addition: max: | MAX(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) unit: pct - - Read - PCIe Stall: - type: PCIe Stall - transaction: Read + - Write - Infinity Fabric Stall: + type: Infinity Fabric™ Stall + transaction: Write avg: | - AVG(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + AVG(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) min: | - MIN(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + MIN(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) max: | - MAX(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + MAX(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) unit: pct - - Read - Infinity Fabric Stall: - type: Infinity Fabric™ Stall - transaction: Read + - Write - PCIe Stall: + type: PCIe Stall + transaction: Write avg: | - AVG(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + AVG(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) min: | - MIN(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + MIN(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) max: | - MAX(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + MAX(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) unit: pct - metric_table: id: 1706 title: L2 - Fabric interface detailed metrics metrics: - - Read Bandwidth - HBM: - avg: AVG(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: 
MAX(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + - Atomic - HBM: + avg: AVG((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom)) + unit: (Req + $normUnit) + - Atomic Bandwidth - HBM: + avg: AVG(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + min: MIN(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + max: MAX(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + unit: Gbps + - Atomic Bandwidth - Infinity Fabric™: + avg: AVG(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + min: MIN(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + max: MAX(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) unit: Gbps - Atomic Bandwidth - PCIe: avg: AVG(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) min: MIN(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) max: MAX(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) unit: Gbps - - Atomic Bandwidth - HBM: - avg: AVG(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - unit: Gbps - Read (128B): avg: AVG((TCC_EA0_RDREQ_128B_sum / $denom)) min: MIN((TCC_EA0_RDREQ_128B_sum / $denom)) max: MAX((TCC_EA0_RDREQ_128B_sum / $denom)) unit: (Req + $normUnit) - - Write Bandwidth - Infinity Fabric™: - avg: AVG(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + - Read Bandwidth - HBM: + avg: AVG(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + min: MIN(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + max: MAX(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) unit: Gbps - Read Bandwidth - Infinity Fabric™: avg: AVG(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) @@ -520,26 +554,122 @@ Addition: min: MIN(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) max: MAX(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) unit: Gbps - - Write Bandwidth - PCIe: - avg: AVG(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - unit: Gbps - Write Bandwidth - HBM: avg: AVG(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) min: MIN(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) max: MAX(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) unit: Gbps - - Atomic - HBM: - avg: AVG((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom)) - min: MIN((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom)) - max: MAX((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom)) - unit: (Req + $normUnit) - - Atomic Bandwidth - Infinity Fabric™: - avg: AVG(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ 
(End_Timestamp - Start_Timestamp)) + - Write Bandwidth - Infinity Fabric™: + avg: AVG(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + min: MIN(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + max: MAX(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) unit: Gbps + - Write Bandwidth - PCIe: + avg: AVG(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + min: MIN(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + max: MAX(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + unit: Gbps + metric_descriptions: + Atomic Bandwidth: + plain: | + Total number of bytes looked up in the L2 cache for atomic requests, divided by total duration. + rst: | + Total number of bytes looked up in the L2 cache for atomic requests, divided by total duration. + Atomic Bandwidth - HBM: + plain: | + Total number of bytes due to L2 atomic requests due to HBM traffic, divided by total duration. + rst: | + Total number of bytes due to L2 atomic requests due to HBM traffic, divided by total duration. + Atomic Bandwidth - Infinity Fabric™: + plain: | + Total number of bytes due to L2 atomic requests due to Infinity Fabric traffic, divided by total duration. + rst: | + Total number of bytes due to L2 atomic requests due to Infinity Fabric traffic, divided by total duration. + Atomic Bandwidth - PCIe: + plain: | + Total number of bytes due to L2 atomic requests due to PCIe traffic, divided by total duration. + rst: | + Total number of bytes due to L2 atomic requests due to PCIe traffic, divided by total duration. + Read - HBM Stall: + plain: | + The number of cycles the L2-Fabric interface was stalled on read requests to the accelerator's local HBM as a percent of the total active L2 cycles. + rst: | + The number of cycles the L2-Fabric interface was stalled on read requests to the accelerator's local HBM as a percent of the total active L2 cycles. + Read - Infinity Fabric Stall: + plain: | + The number of cycles the L2-Fabric interface was stalled on read requests to remote Infinity Fabric connected accelerators or CPUs as a percent of the total active L2 cycles. + rst: | + The number of cycles the L2-Fabric interface was stalled on read requests to remote Infinity Fabric connected accelerators or CPUs as a percent of the total active L2 cycles. + Read - PCIe Stall: + plain: | + The number of cycles the L2-Fabric interface was stalled on read requests to remote PCIe connected accelerators or CPUs as a percent of the total active L2 cycles. + rst: | + The number of cycles the L2-Fabric interface was stalled on read requests to remote PCIe connected accelerators or CPUs as a percent of the total active L2 cycles. + Read Bandwidth: + plain: | + Total number of bytes looked up in the L2 cache for read requests, divided by total duration. + rst: | + Total number of bytes looked up in the L2 cache for read requests, divided by total duration. + Read Bandwidth - HBM: + plain: | + Total number of bytes due to L2 read requests due to HBM traffic, divided by total duration. + rst: | + Total number of bytes due to L2 read requests due to HBM traffic, divided by total duration. + Read Bandwidth - Infinity Fabric™: + plain: | + Total number of bytes due to L2 read requests due to Infinity Fabric traffic, divided by total duration. + rst: | + Total number of bytes due to L2 read requests due to Infinity Fabric traffic, divided by total duration. 
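A quick note on how these bandwidth entries evaluate, since the pattern repeats throughout this table: each *_32B counter tallies 32-byte transactions, so counter * 32 is bytes moved, and dividing by (End_Timestamp - Start_Timestamp) yields a per-dispatch bandwidth that AVG/MIN/MAX then reduce across dispatches. The Python sketch below is illustrative only; the counter values are invented and the nanosecond timestamp convention is an assumption, not something this config asserts.

dispatches = [
    # (TCC_EA0_RDREQ_DRAM_32B_sum, Start_Timestamp, End_Timestamp) -- ns assumed
    (1_500_000, 0, 250_000),
    (2_000_000, 0, 400_000),
]

def hbm_read_bw(counter_32b, start_ns, end_ns):
    # Each RDREQ_DRAM_32B event accounts for 32 bytes of HBM read traffic.
    return counter_32b * 32 / (end_ns - start_ns)

samples = [hbm_read_bw(c, s, e) for c, s, e in dispatches]
print(sum(samples) / len(samples), min(samples), max(samples))  # AVG / MIN / MAX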
+ Read Bandwidth - PCIe: + plain: | + Total number of bytes due to L2 read requests due to PCIe traffic, divided by total duration. + rst: | + Total number of bytes due to L2 read requests due to PCIe traffic, divided by total duration. + Read Stall: + plain: | + The ratio of the total number of cycles the L2-Fabric interface was stalled on a read request to any destination (local HBM, remote PCIe\xAE connected accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU) over the total active L2 cycles. + rst: | + The ratio of the total number of cycles the L2-Fabric interface was stalled on a read request to any destination (local HBM, remote PCIe\xAE connected accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU) over the total active L2 cycles. + Write - HBM Stall: + plain: | + The number of cycles the L2-Fabric interface was stalled on write or atomic requests to accelerator's local HBM as a percent of the total active L2 cycles. + rst: | + The number of cycles the L2-Fabric interface was stalled on write or atomic requests to accelerator's local HBM as a percent of the total active L2 cycles. + Write - Infinity Fabric Stall: + plain: | + The number of cycles the L2-Fabric interface was stalled on write or atomic requests to remote Infinity Fabric connected accelerators or CPUs as a percent of the total active L2 cycles. + rst: | + The number of cycles the L2-Fabric interface was stalled on write or atomic requests to remote Infinity Fabric connected accelerators or CPUs as a percent of the total active L2 cycles. + Write - PCIe Stall: + plain: | + The number of cycles the L2-Fabric interface was stalled on write or atomic requests to remote PCIe connected accelerators or CPUs as a percent of the total active L2 cycles. + rst: | + The number of cycles the L2-Fabric interface was stalled on write or atomic requests to remote PCIe connected accelerators or CPUs as a percent of the total active L2 cycles. + Write Bandwidth: + plain: | + Total number of bytes looked up in the L2 cache for write requests, divided by total duration. + rst: | + Total number of bytes looked up in the L2 cache for write requests, divided by total duration. + Write Bandwidth - HBM: + plain: | + Total number of bytes due to L2 write requests due to HBM traffic, divided by total duration. + rst: | + Total number of bytes due to L2 write requests due to HBM traffic, divided by total duration. + Write Bandwidth - Infinity Fabric™: + plain: | + Total number of bytes due to L2 write requests due to Infinity Fabric traffic, divided by total duration. + rst: | + Total number of bytes due to L2 write requests due to Infinity Fabric traffic, divided by total duration. + Write Bandwidth - PCIe: + plain: | + Total number of bytes due to L2 write requests due to PCIe traffic, divided by total duration. + rst: | + Total number of bytes due to L2 write requests due to PCIe traffic, divided by total duration. + Write Stall: + plain: | + The ratio of the total number of cycles the L2-Fabric interface was stalled on a write or atomic request to any destination (local HBM, remote accelerator or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU) over the total active L2 cycles. 
+ rst: | + The ratio of the total number of cycles the L2-Fabric interface was stalled on a write or atomic request to any destination (local HBM, remote accelerator or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU) over the total active L2 cycles. Deletion: [] @@ -553,51 +683,51 @@ Modification: id: 201 title: System Speed-of-Light metrics: - - L2-Fabric Write BW: + - L2 Cache BW: + peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) pop: | - ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) + ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) + - L2-Fabric Read BW: value: | - AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) - - VALU Active Threads: + AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)) pop: | - (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size) if (SQ_ACTIVE_INST_VALU != 0) else None)) - peak: $wave_size + ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) - L2-Fabric Read Latency: value: | AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - - L2 Cache BW: - pop: | - ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) - peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) - - MFMA FLOPs (F64): - pop: | - ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) - - vL1D Cache BW: + - L2-Fabric Write BW: + value: | + AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) pop: | - ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) - peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) - value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) + ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) - L2-Fabric Write Latency: value: | AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - - MFMA IOPs (Int8): + - MFMA FLOPs (BF16): + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: | - ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - MFMA FLOPs (F16): + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - - MFMA FLOPs (BF16): + - MFMA FLOPs (F64): + peak: ((($max_sclk * 
$cu_per_gpu) * 128) / 1000) pop: | - ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - - L2-Fabric Read BW: + ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) + - MFMA IOPs (Int8): + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) pop: | - ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth) - value: | - AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)) + ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + - VALU Active Threads: + peak: $wave_size + pop: | + (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size) if (SQ_ACTIVE_INST_VALU != 0) else None)) + - vL1D Cache BW: + peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) + value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) + pop: | + ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) - Panel Config: id: 300 title: Memory Chart @@ -606,30 +736,30 @@ Modification: id: 301 title: Memory Chart metrics: - - Fabric_L2 Wr: - value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0) - - Wavefronts: - value: ROUND(AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE), 0) - - HBM Wr: - value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0) - - Fabric_L2 Atomic: - value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0) - Fabric Atomic Lat: value: | ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else 0)), 0) - - HBM Rd: - value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0) - - Workgroups: - value: | - ROUND(AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS), 0) - - Fabric_L2 Rd: - value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0) - Fabric Rd Lat: value: | ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else 0)), 0) - Fabric Wr Lat: value: | ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else 0)), 0) + - Fabric_L2 Atomic: + value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0) + - Fabric_L2 Rd: + value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0) + - Fabric_L2 Wr: + value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0) + - HBM Rd: + value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0) + - HBM Wr: + value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0) + - Wavefronts: + value: ROUND(AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE), 0) + - Workgroups: + value: | + ROUND(AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS), 0) - Panel Config: id: 400 title: Roofline @@ -645,18 +775,18 @@ Modification: id: 402 title: Roofline Plot Points metrics: + - AI HBM: + value: | + ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + 
SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) ) + - AI L1: + value: | + ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) ) - AI L2: value: | ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64 ) ) - Performance (GFLOPs): value: | ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / (SUM(End_Timestamp - Start_Timestamp) / 1e9) ) / 1e9 - - AI L1: - value: | - ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) ) - - AI HBM: - value: | - ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + 
SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) ) - Panel Config: id: 600 title: Workgroup Manager (SPI) @@ -665,38 +795,38 @@ Modification: id: 601 title: Workgroup manager utilizations metrics: - - VGPR Writes: - min: | - MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - avg: | - AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - max: | - MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) + - Dispatched Wavefronts: + avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - Dispatched Workgroups: - min: | - MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) avg: | AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) + min: | + MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) max: | MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) - SGPR Writes: - min: | - MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) avg: | AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) + min: | + MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) max: | MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - - Dispatched Wavefronts: - min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - Scheduler-Pipe Utilization: - min: | - MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) avg: | AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) + min: | + MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) max: | MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / 
($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) + - VGPR Writes: + avg: | + AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) + min: | + MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) + max: | + MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - Panel Config: id: 700 title: Wavefront @@ -706,8 +836,8 @@ Modification: title: Wavefront Launch Stats metrics: - Total Wavefronts: - min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - Panel Config: id: 1000 @@ -718,8 +848,8 @@ Modification: title: Overall Instruction Mix metrics: - VMEM: - min: MIN(((SQ_INSTS_VMEM) / $denom)) avg: AVG(((SQ_INSTS_VMEM) / $denom)) + min: MIN(((SQ_INSTS_VMEM) / $denom)) max: MAX(((SQ_INSTS_VMEM) / $denom)) - Panel Config: id: 1100 @@ -729,31 +859,31 @@ Modification: id: 1101 title: Compute Speed-of-Light metrics: - - MFMA IOPs (INT8): - pop: | - ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - MFMA FLOPs (BF16): + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + - MFMA FLOPs (F16): peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: | + ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - MFMA FLOPs (F64): + peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) - - MFMA FLOPs (F16): + - MFMA IOPs (INT8): + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) pop: | - ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - metric_table: id: 1103 title: Arithmetic Operations metrics: - FLOPs (Total): - min: | - MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) avg: | 
AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) + min: | + MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) max: | MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) - Panel Config: @@ -781,32 +911,32 @@ Modification: id: 1603 title: vL1D cache access metrics metrics: - - L1-L2 Write Latency: - min: MIN((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) - avg: AVG((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) - unit: (Cycles + $normUnit) - max: MAX((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) + - Cache BW: + avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) + min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) + max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - L1 Access Latency: - min: MIN((TCP_TCP_LATENCY_sum / $denom)) avg: AVG((TCP_TCP_LATENCY_sum / $denom)) + min: MIN((TCP_TCP_LATENCY_sum / $denom)) unit: (Cycles + $normUnit) max: MAX((TCP_TCP_LATENCY_sum / $denom)) - - L1-L2 Read Latency: - min: MIN((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) - avg: AVG((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) - unit: (Cycles + $normUnit) - max: MAX((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) - - Cache BW: - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - L1-L2 BW: - min: | - MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp))) avg: | AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp))) + min: | + MIN(((128 * TCP_TCC_READ_REQ_sum + 64 
* (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp))) max: | MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp))) + - L1-L2 Read Latency: + avg: AVG((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) + min: MIN((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) + unit: (Cycles + $normUnit) + max: MAX((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) + - L1-L2 Write Latency: + avg: AVG((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) + min: MIN((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) + unit: (Cycles + $normUnit) + max: MAX((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) - Panel Config: id: 1700 title: L2 Cache @@ -815,149 +945,149 @@ Modification: id: 1701 title: L2 Speed-of-Light metrics: - - L2-Fabric Write and Atomic BW: - value: | - AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) - L2-Fabric Read BW: value: | AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) + - L2-Fabric Write and Atomic BW: + value: | + AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) - metric_table: id: 1702 title: L2-Fabric interface metrics metrics: - - Write and Atomic Latency: - min: | - MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + - Atomic Latency: avg: | - AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: | - MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - - Read BW: + AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None)) min: | - MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) - avg: | - AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) + MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None)) max: | - MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) - - Remote Read Traffic: - min: | - MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None)) + - Atomic Traffic: avg: | - AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + min: | + MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) max: | - MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - HBM Read Traffic: - min: | - MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) avg: | AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + min: | + MIN((100 * 
(TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) max: | MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - - Uncached Read Traffic: - min: | - MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + - HBM Write and Atomic Traffic: avg: | - AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: | - MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - - Uncached Write and Atomic Traffic: + AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) min: | - MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - avg: | - AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) max: | - MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - - Atomic Traffic: - min: | - MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + - Read BW: avg: | - AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) + min: | + MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) max: | - MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - - Atomic Latency: + MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) + - Read Latency: + avg: | + AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) min: | - MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None)) + MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + max: | + MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + - Remote Read Traffic: avg: | - AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None)) + AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + min: | + MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) max: | - MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None)) + MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - Remote Write and Atomic Traffic: - min: | - MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) avg: | AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + min: | + MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) max: | MAX((100 * 
(MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - - HBM Write and Atomic Traffic: - min: | - MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + - Uncached Read Traffic: avg: | - AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: | - MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - - Read Latency: + AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) min: | - MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + max: | + MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + - Uncached Write and Atomic Traffic: avg: | - AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + min: | + MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) max: | - MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - Write and Atomic BW: - min: | - MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) avg: | AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) + min: | + MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) unit: Gbps max: | MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))) + - Write and Atomic Latency: + avg: | + AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + min: | + MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) + max: | + MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - metric_table: id: 1706 title: L2 - Fabric interface detailed metrics metrics: - - HBM Write and Atomic: - min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) - avg: AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) - max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) - Atomic: - min: MIN((TCC_EA0_ATOMIC_sum / $denom)) avg: AVG((TCC_EA0_ATOMIC_sum / $denom)) + min: MIN((TCC_EA0_ATOMIC_sum / $denom)) max: MAX((TCC_EA0_ATOMIC_sum / $denom)) - - Write and Atomic (32B): - min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) + - HBM Read: + avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) + - HBM Write and Atomic: + avg: AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) - Read (32B): - min: MIN((TCC_EA0_RDREQ_32B_sum / $denom)) avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom)) + min: 
MIN((TCC_EA0_RDREQ_32B_sum / $denom)) max: MAX((TCC_EA0_RDREQ_32B_sum / $denom)) + - Read (64B): + avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_64B_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_64B_sum / $denom)) - Read (Uncached): - min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - Remote Read: - min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - Remote Write and Atomic: - min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) + min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - - Write and Atomic (Uncached): - min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - - Read (64B): - min: MIN((TCC_EA0_RDREQ_64B_sum / $denom)) - avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom)) - max: MAX((TCC_EA0_RDREQ_64B_sum / $denom)) - - HBM Read: - min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom)) - avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) + - Write and Atomic (32B): + avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) + min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) + max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - Write and Atomic (64B): - min: MIN((TCC_EA0_WRREQ_64B_sum / $denom)) avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_64B_sum / $denom)) max: MAX((TCC_EA0_WRREQ_64B_sum / $denom)) + - Write and Atomic (Uncached): + avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) + max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - Panel Config: id: 1800 title: L2 Cache (per Channel) @@ -967,22 +1097,22 @@ Modification: title: Aggregate Stats (All channels) metrics: - L2 Cache Hit Rate: - min: | - MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + 
TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) avg: | AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) - max: | - MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) + min: | + MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + 
(TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) std dev: | STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) + max: | + MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + 
(TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) - metric_table: id: 1805 title: L2-Fabric Requests (per normUnit) metrics: - ::_1: write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom)) - read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom)) atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom)) + read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom)) - metric_table: id: 1806 title: L2-Fabric Read Latency (Cycles) @@ -1018,5 +1148,5 @@ Modification: metrics: - ::_1: ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) / $denom)) - ea write stall - if: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1]) / $denom)) ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) + ea write stall - if: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1]) / $denom)) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml index 5f76eb89372..f1c48cef170 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml @@ -70,7 +70,7 @@ Panel Config: peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - Active CUs: + Active CUs (deprecated): value: $numActiveCUs unit: CUs peak: $cu_per_gpu @@ -226,6 +226,11 @@ Panel Config: peak: None pop: None coll_level: SQ_IFETCH_LEVEL + CU Utilization: + value: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + peak: 100 + pop: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) metrics_description: VALU FLOPs: >- The total floating-point operations executed per second on the VALU. @@ -267,8 +272,8 @@ Panel Config: Note: this does not include any 8-bit integer operations from VALU instructions. This is also presented as a percent of the peak theoretical INT8 MFMA operations achievable on the specific accelerator. - Active CUs: Total number of active compute units (CUs) on the accelerator during - the kernel execution. + Active CUs (deprecated): Total number of active compute units (CUs) on the accelerator during + the kernel execution. (Deprecated - See CU Utilization instead) SALU Utilization: Indicates what percent of the kernel's duration the SALU was busy executing instructions. Computed as the ratio of the total number of cycles spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles. @@ -353,3 +358,7 @@ Panel Config: of all L1I requests. L1I Fetch Latency: The average number of cycles spent to fetch instructions to a CU. + CU Utilization: The percent of total SIMD cycles in the kernel + where any SIMD on a CU was actively doing any work, summed + over all CUs. Low values (less than 100%) indicate that the accelerator was + not fully saturated by the kernel, or a potential load-imbalance issue. 
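The substantive change in this file is the swap from the deprecated Active CUs count to a time-weighted CU Utilization percentage. A minimal sketch of the new expression, AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)), follows; the CU count and counter values are hypothetical, chosen only to show how under-saturation surfaces as a value below 100%.

cu_per_gpu = 104                       # hypothetical CU count
dispatches = [
    # (SQ_BUSY_CU_CYCLES, GRBM_GUI_ACTIVE_PER_XCD) -- invented values
    (3_900_000, 50_000),               # 75% of CU-cycles busy
    (1_040_000, 50_000),               # 20% -- unsaturated or imbalanced
]

def cu_utilization(busy_cu_cycles, gui_active, n_cu):
    # Busy CU-cycles over (kernel-active cycles * CU count); 100% means every
    # CU had work for the entire kernel duration.
    return 100 * busy_cu_cycles / (gui_active * n_cu)

samples = [cu_utilization(b, g, cu_per_gpu) for b, g in dispatches]
print(f"avg CU Utilization = {sum(samples) / len(samples):.1f}%")

Unlike Active CUs, which reports the same value whether a CU ran a single wave or stayed busy for the whole kernel, this ratio degrades smoothly as CUs idle, which is why the new description calls out load imbalance.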
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml index 81ce3c2e684..23464a9d469 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml @@ -33,7 +33,7 @@ Panel Config: value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) BR: value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) - Active CUs: + Active CUs (deprecated): value: $numActiveCUs Num CUs: value: $cu_per_gpu @@ -159,8 +159,8 @@ Panel Config: GWS: Total number of GDS (global data sync) instructions issued per normalization unit. BR: Total number of BRANCH instructions issued per normalization unit. - Active CUs: Total number of active compute units (CUs) on the accelerator during - the kernel execution. + Active CUs (deprecated): Total number of active compute units (CUs) on the accelerator during + the kernel execution. (Deprecated - See CU Utilization instead) Num CUs: Total number of compute units (CUs) on the accelerator. VGPR: >- The number of architected vector general-purpose registers allocated diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/config_delta/gfx950_diff.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/config_delta/gfx950_diff.yaml index d4c0cb307a2..43191e179d7 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/config_delta/gfx950_diff.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/config_delta/gfx950_diff.yaml @@ -22,12 +22,23 @@ Addition: id: 301 title: Memory Chart metrics: - - L2 Wr Lat: - value: | - ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0) - L2 Rd Lat: value: | ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else None)), 0) + - L2 Wr Lat: + value: | + ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0) + metric_descriptions: + L2 Rd Lat: + plain: | + Calculated as the average number of cycles that the vL1D cache took to issue and receive read requests from the L2 Cache. This number also includes requests for atomics with return values. + rst: | + Calculated as the average number of cycles that the vL1D cache took to issue and receive read requests from the L2 Cache. This number also includes requests for atomics with return values. + L2 Wr Lat: + plain: | + Calculated as the average number of cycles that the vL1D cache took to issue and receive acknowledgement of a write request to the L2 Cache. This number also includes requests for atomics without return values. + rst: | + Calculated as the average number of cycles that the vL1D cache took to issue and receive acknowledgement of a write request to the L2 Cache. This number also includes requests for atomics without return values. 
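Both latency entries above reduce to one guarded-division pattern: total request latency in cycles over the request count (reads plus atomics-with-return for L2 Rd Lat, writes plus atomics-without-return for L2 Wr Lat), with an else-None branch so dispatches with no L2 traffic do not divide by zero. A small sketch of the same arithmetic with invented counter values; that AVG skips None samples is an assumption about the evaluator, not something this diff states.

def l2_rd_lat(latency_cycles_sum, read_req_sum, atomic_with_ret_sum):
    requests = read_req_sum + atomic_with_ret_sum
    if requests == 0:
        return None            # mirrors the "... else None" guard in the YAML
    return latency_cycles_sum / requests

samples = [
    l2_rd_lat(920_000, 3_500, 500),    # 230 cycles per request
    l2_rd_lat(0, 0, 0),                # dispatch with no vL1D-to-L2 reads
]
valid = [s for s in samples if s is not None]
print(round(sum(valid) / len(valid), 0))   # the ROUND(AVG(...), 0) wrapper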
- Panel Config: id: 400 title: Roofline @@ -41,6 +52,12 @@ Addition: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) unit: GFLOP/s peak: $MFMA_FLOPs_F6F4_empirical_peak + metric_descriptions: + MFMA FLOPs (F6F4): + plain: | + The total number of 4-bit and 6-bit floating point MFMA operations executed per second. Note: this does not include any floating point operations from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. It is supported on AMD Instinct MI350 series (gfx950) and later only. + rst: | + The total number of 4-bit and 6-bit floating point MFMA operations executed per second. Note: this does not include any floating point operations from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. It is supported on AMD Instinct MI350 series (gfx950) and later only. - Panel Config: id: 500 title: Command Processor (CPC/CPF) @@ -49,6 +66,16 @@ Addition: id: 502 title: Command processor packet processor (CPC) metrics: + - CPC ADC Utilization: + avg: AVG((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None) + min: MIN((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None) + max: MAX((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None) + unit: pct + - CPC CANE Stall Rate: + avg: AVG((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None) + min: MIN((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None) + max: MAX((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None) + unit: pct - CPC SYNC FIFO Full Rate: avg: | AVG((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None) @@ -57,16 +84,6 @@ Addition: max: | MAX((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None) unit: pct - - CPC CANE Stall Rate: - avg: AVG((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None) - min: MIN((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None) - max: MAX((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None) - unit: pct - - CPC ADC Utilization: - avg: AVG((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None) - min: MIN((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None) - max: MAX((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None) - unit: pct - Panel Config: id: 600 title: Workgroup Manager (SPI) @@ -75,14 +92,6 @@ Addition: id: 601 title: Workgroup manager utilizations metrics: - - Scheduler-Pipe Wave Utilization: - avg: | - AVG(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - min: | - MIN(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - max: | - MAX(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - unit: Pct - Schedule-Pipe Wave Occupancy: avg: | AVG(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + SPI_CSQ_P3_OCCUPANCY) @@ -91,6 +100,14 @@ Addition: max: | MAX(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + SPI_CSQ_P3_OCCUPANCY) unit: Wave + - Scheduler-Pipe Wave Utilization: + avg: | + AVG(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) + min: | + MIN(100 * 
(SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) + max: | + MAX(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) + unit: Pct - metric_table: id: 602 title: Workgroup Manager - Resource Allocation @@ -165,73 +182,79 @@ Addition: id: 1202 title: LDS Statistics metrics: - - LDS STORE: - avg: AVG((SQ_INSTS_LDS_STORE / $denom)) - min: MIN((SQ_INSTS_LDS_STORE / $denom)) - max: MAX((SQ_INSTS_LDS_STORE / $denom)) - unit: (instr + $normUnit) - - LDS LOAD Bandwidth: - avg: AVG(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - min: MIN(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - max: MAX(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - units: Gbps - LDS ATOMIC: avg: AVG((SQ_INSTS_LDS_ATOMIC / $denom)) min: MIN((SQ_INSTS_LDS_ATOMIC / $denom)) max: MAX((SQ_INSTS_LDS_ATOMIC / $denom)) unit: (instr + $normUnit) - - LDS STORE Bandwidth: - avg: AVG(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - min: MIN(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - max: MAX(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + - LDS ATOMIC Bandwidth: + avg: AVG(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + min: MIN(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + max: MAX(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) units: Gbps - LDS Command FIFO Full Rate: avg: AVG((SQ_LDS_CMD_FIFO_FULL / $denom)) min: MIN((SQ_LDS_CMD_FIFO_FULL / $denom)) max: MAX((SQ_LDS_CMD_FIFO_FULL / $denom)) unit: (Cycles + $normUnit) + - LDS Data FIFO Full Rate: + avg: AVG((SQ_LDS_DATA_FIFO_FULL / $denom)) + min: MIN((SQ_LDS_DATA_FIFO_FULL / $denom)) + max: MAX((SQ_LDS_DATA_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) - LDS LOAD: avg: AVG((SQ_INSTS_LDS_LOAD / $denom)) min: MIN((SQ_INSTS_LDS_LOAD / $denom)) max: MAX((SQ_INSTS_LDS_LOAD / $denom)) unit: (instr + $normUnit) - - LDS ATOMIC Bandwidth: - avg: AVG(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - min: MIN(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - max: MAX(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + - LDS LOAD Bandwidth: + avg: AVG(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + min: MIN(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + max: MAX(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + units: Gbps + - LDS STORE: + avg: AVG((SQ_INSTS_LDS_STORE / $denom)) + min: MIN((SQ_INSTS_LDS_STORE / $denom)) + max: MAX((SQ_INSTS_LDS_STORE / $denom)) + unit: (instr + $normUnit) + - LDS STORE Bandwidth: + avg: AVG(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + min: MIN(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + max: MAX(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) units: Gbps - - LDS Data FIFO Full Rate: - avg: AVG((SQ_LDS_DATA_FIFO_FULL / $denom)) - min: MIN((SQ_LDS_DATA_FIFO_FULL / $denom)) - max: MAX((SQ_LDS_DATA_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) - Panel Config: id: 1500 title: Address Processing Unit and Data Return Path (TA/TD) metric_tables: - - metric_table: - id: 1504 - title: Vector L1 data-return path or Texture Data (TD) - metrics: - - Write Ack Instructions: - avg: AVG((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) - min: 
MIN((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) - max: MAX((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - metric_table: id: 1502 title: Instruction counts metrics: + - Global/Generic Read Instructions for LDS: + avg: AVG((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) - Spill/Stack Read Instructions for LDS: avg: AVG((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - - Global/Generic Read Instructions for LDS: - avg: AVG((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) + - metric_table: + id: 1504 + title: Vector L1 data-return path or Texture Data (TD) + metrics: + - Write Ack Instructions: + avg: AVG((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) + min: MIN((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) + max: MAX((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) unit: (Instructions + $normUnit) + metric_descriptions: + Write Ack Instructions: + plain: | + The total number of write acknowledgements submitted by data-return unit to SQ, summed over all compute units on the accelerator, per normalization unit. + rst: | + The total number of write acknowledgements submitted by data-return unit to SQ, summed over all compute units on the accelerator, per normalization unit. - Panel Config: id: 1600 title: Vector L1 Data Cache @@ -243,110 +266,126 @@ Addition: - Stalled on Address: expr: | (((100 * TCP_TCP_TA_ADDR_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None) - - Stalled on Read Return: - expr: | - (((100 * TCP_TCR_RDRET_STALL_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None) - - Stalled on Request FIFO: - expr: | - (((100 * TCP_RFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None) - Stalled on Data: expr: | (((100 * TCP_TCP_TA_DATA_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None) - Stalled on Latency FIFO: expr: | (((100 * TCP_LFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None) + - Stalled on Read Return: + expr: | + (((100 * TCP_TCR_RDRET_STALL_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None) + - Stalled on Request FIFO: + expr: | + (((100 * TCP_RFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None) - metric_table: id: 1603 title: vL1D cache access metrics metrics: - - Tag RAM 3 Req: - avg: AVG((TCP_TAGRAM3_REQ_sum / $denom)) - min: MIN((TCP_TAGRAM3_REQ_sum / $denom)) - max: MAX((TCP_TAGRAM3_REQ_sum / $denom)) - unit: (Req + $normUnit) + - L1 Access Latency: + avg: AVG((TCP_TCP_LATENCY_sum / $denom)) + min: MIN((TCP_TCP_LATENCY_sum / $denom)) + max: MAX((TCP_TCP_LATENCY_sum / $denom)) + unit: (Cycles + $normUnit) - L1-L2 Read Latency: avg: AVG((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) min: MIN((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) max: MAX((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) unit: (Cycles + $normUnit) - - Tag RAM 2 Req: - avg: AVG((TCP_TAGRAM2_REQ_sum / $denom)) - min: MIN((TCP_TAGRAM2_REQ_sum / $denom)) - max: MAX((TCP_TAGRAM2_REQ_sum / $denom)) - unit: (Req + $normUnit) - - Tag RAM 0 Req: - avg: AVG((TCP_TAGRAM0_REQ_sum / $denom)) - min: MIN((TCP_TAGRAM0_REQ_sum / $denom)) - max: MAX((TCP_TAGRAM0_REQ_sum / $denom)) - unit: (Req + 
$normUnit) - L1-L2 Write Latency: avg: AVG((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) min: MIN((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) max: MAX((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) unit: (Cycles + $normUnit) - - L1 Access Latency: - avg: AVG((TCP_TCP_LATENCY_sum / $denom)) - min: MIN((TCP_TCP_LATENCY_sum / $denom)) - max: MAX((TCP_TCP_LATENCY_sum / $denom)) - unit: (Cycles + $normUnit) + - Tag RAM 0 Req: + avg: AVG((TCP_TAGRAM0_REQ_sum / $denom)) + min: MIN((TCP_TAGRAM0_REQ_sum / $denom)) + max: MAX((TCP_TAGRAM0_REQ_sum / $denom)) + unit: (Req + $normUnit) - Tag RAM 1 Req: avg: AVG((TCP_TAGRAM1_REQ_sum / $denom)) min: MIN((TCP_TAGRAM1_REQ_sum / $denom)) max: MAX((TCP_TAGRAM1_REQ_sum / $denom)) unit: (Req + $normUnit) + - Tag RAM 2 Req: + avg: AVG((TCP_TAGRAM2_REQ_sum / $denom)) + min: MIN((TCP_TAGRAM2_REQ_sum / $denom)) + max: MAX((TCP_TAGRAM2_REQ_sum / $denom)) + unit: (Req + $normUnit) + - Tag RAM 3 Req: + avg: AVG((TCP_TAGRAM3_REQ_sum / $denom)) + min: MIN((TCP_TAGRAM3_REQ_sum / $denom)) + max: MAX((TCP_TAGRAM3_REQ_sum / $denom)) + unit: (Req + $normUnit) - metric_table: id: 1605 title: L1 Unified Translation Cache (UTCL1) metrics: - - Misses under Translation Miss: - avg: AVG((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) - units: (Req + $normUnit) - Inflight Req: avg: AVG((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom)) min: MIN((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom)) max: MAX((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom)) units: (Req + $normUnit) + - Misses under Translation Miss: + avg: AVG((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) + units: (Req + $normUnit) - metric_table: id: 1606 title: L1D Addr Translation Stalls metrics: - - Latency FIFO Stall: - avg: AVG((TCP_UTCL1_LFIFO_FULL_sum / $denom)) - min: MIN((TCP_UTCL1_LFIFO_FULL_sum / $denom)) - max: MAX((TCP_UTCL1_LFIFO_FULL_sum / $denom)) - units: (Cycles + $normUnit) - - Serialization Stall: - avg: AVG((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) - min: MIN((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) - max: MAX((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) - units: (Cycles + $normUnit) - Cache Full Stall: avg: AVG((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) min: MIN((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) max: MAX((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) units: (Cycles + $normUnit) - - UTCL2 Stall: - avg: AVG((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) - min: MIN((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) - max: MAX((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) - units: (Cycles + $normUnit) - Cache Miss Stall: avg: AVG((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) min: MIN((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) max: MAX((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) units: (Cycles + $normUnit) + - Latency FIFO Stall: + avg: AVG((TCP_UTCL1_LFIFO_FULL_sum / $denom)) + min: MIN((TCP_UTCL1_LFIFO_FULL_sum / $denom)) + max: MAX((TCP_UTCL1_LFIFO_FULL_sum / $denom)) + units: (Cycles + $normUnit) - Resident Page Full Stall: avg: AVG((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) min: MIN((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) max: MAX((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) units: (Cycles + $normUnit) + - Serialization Stall: + avg: AVG((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) + min: 
MIN((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) + max: MAX((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) + units: (Cycles + $normUnit) - Thrashing Stall: avg: AVG((TCP_UTCL1_THRASHING_STALL_sum / $denom)) min: MIN((TCP_UTCL1_THRASHING_STALL_sum / $denom)) max: MAX((TCP_UTCL1_THRASHING_STALL_sum / $denom)) units: (Cycles + $normUnit) + - UTCL2 Stall: + avg: AVG((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) + min: MIN((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) + max: MAX((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) + units: (Cycles + $normUnit) + metric_descriptions: + L1 Access Latency: + plain: | + Calculated as the average number of cycles that a vL1D cache line request spent in the vL1D cache pipeline. + rst: | + Calculated as the average number of cycles that a vL1D cache line request spent in the vL1D cache pipeline. + L1-L2 Read Latency: + plain: | + Calculated as the average number of cycles that the vL1D cache took to issue and receive read requests from the L2 Cache. This number also includes requests for atomics with return values. + rst: | + Calculated as the average number of cycles that the vL1D cache took to issue and receive read requests from the L2 Cache. This number also includes requests for atomics with return values. + L1-L2 Write Latency: + plain: | + Calculated as the average number of cycles that the vL1D cache took to issue and receive acknowledgement of a write request to the L2 Cache. This number also includes requests for atomics without return values. + rst: | + Calculated as the average number of cycles that the vL1D cache took to issue and receive acknowledgement of a write request to the L2 Cache. This number also includes requests for atomics without return values. - Panel Config: id: 1700 title: L2 Cache @@ -355,14 +394,6 @@ Addition: id: 1702 title: L2-Fabric interface metrics metrics: - - Write Stall: - avg: | - AVG(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None)) - min: | - MIN(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None)) - max: | - MAX(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None)) - unit: pct - Read Stall: avg: | AVG((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None)) @@ -371,35 +402,43 @@ Addition: max: | MAX((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None)) unit: pct + - Write Stall: + avg: | + AVG(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None)) + min: | + MIN(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None)) + max: | + MAX(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None)) + unit: pct - metric_table: id: 1703 title: L2 Cache Accesses metrics: - - Input Buffer Req: - avg: AVG((TCC_IB_REQ_sum / $denom)) - min: MIN((TCC_IB_REQ_sum / $denom)) - max: MAX((TCC_IB_REQ_sum / $denom)) - unit: (Req + $normUnit) + - Atomic Bandwidth: + avg: AVG(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) + min: MIN(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) + max: MAX(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) + unit: Gbps - Bypasss Req: avg: AVG((TCC_BYPASS_REQ_sum / $denom)) min: MIN((TCC_BYPASS_REQ_sum / $denom)) max: 
MAX((TCC_BYPASS_REQ_sum / $denom)) unit: (Req + $normUnit) - - Atomic Bandwidth: - avg: AVG(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) + - Input Buffer Req: + avg: AVG((TCC_IB_REQ_sum / $denom)) + min: MIN((TCC_IB_REQ_sum / $denom)) + max: MAX((TCC_IB_REQ_sum / $denom)) + unit: (Req + $normUnit) + - Read Bandwidth: + avg: AVG(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) + min: MIN(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) + max: MAX(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) unit: Gbps - Write Bandwidth: avg: AVG(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) min: MIN(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) max: MAX(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) unit: Gbps - - Read Bandwidth: - avg: AVG(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - unit: Gbps - metric_table: id: 1704 title: L2 Cache Stalls @@ -423,16 +462,6 @@ Addition: id: 1705 title: L2 - Fabric Interface stalls metrics: - - Write - HBM Stall: - type: HBM Stall - transaction: Write - avg: | - AVG(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - min: | - MIN(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - max: | - MAX(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - unit: pct - Read - HBM Stall: type: HBM Stall transaction: Read @@ -443,15 +472,35 @@ Addition: max: | MAX(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) unit: pct - - Write - PCIe Stall: + - Read - Infinity Fabric Stall: + type: Infinity Fabric™ Stall + transaction: Read + avg: | + AVG(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: | + MIN(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: | + MAX(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + - Read - PCIe Stall: type: PCIe Stall + transaction: Read + avg: | + AVG(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: | + MIN(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: | + MAX(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct + - Write - HBM Stall: + type: HBM Stall transaction: Write avg: | - AVG(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + AVG(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) min: | - MIN(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + MIN(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) max: | - MAX(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + MAX(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) unit: pct - Write - Infinity Fabric Stall: 
type: Infinity Fabric™ Stall @@ -463,65 +512,35 @@ Addition: max: | MAX(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) unit: pct - - Read - Infinity Fabric Stall: - type: Infinity Fabric™ Stall - transaction: Read - avg: | - AVG(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - min: | - MIN(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - max: | - MAX(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - unit: pct - - Read - PCIe Stall: + - Write - PCIe Stall: type: PCIe Stall - transaction: Read + transaction: Write avg: | - AVG(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + AVG(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) min: | - MIN(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + MIN(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) max: | - MAX(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + MAX(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) unit: pct - metric_table: id: 1706 title: L2 - Fabric interface detailed metrics metrics: - - Read Bandwidth - PCIe: - avg: AVG(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - unit: Gbps - - Write Bandwidth - Infinity Fabric™: - avg: AVG(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - unit: Gbps - - Atomic Bandwidth - HBM: - avg: AVG(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - unit: Gbps - Atomic - HBM: avg: AVG((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom)) min: MIN((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom)) max: MAX((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom)) unit: (Req + $normUnit) - - Read Bandwidth - HBM: - avg: AVG(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + - Atomic Bandwidth - HBM: + avg: AVG(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + min: MIN(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + max: MAX(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) unit: Gbps - Atomic Bandwidth - Infinity Fabric™: avg: AVG(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) min: MIN(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) max: MAX(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) unit: Gbps - - Write Bandwidth - HBM: - avg: AVG(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) 
- max: MAX(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - unit: Gbps - Atomic Bandwidth - PCIe: avg: AVG(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) min: MIN(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) @@ -532,16 +551,137 @@ Addition: min: MIN((TCC_EA0_RDREQ_128B_sum / $denom)) max: MAX((TCC_EA0_RDREQ_128B_sum / $denom)) unit: (Req + $normUnit) + - Read Bandwidth - HBM: + avg: AVG(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + min: MIN(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + max: MAX(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + unit: Gbps - Read Bandwidth - Infinity Fabric™: avg: AVG(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) min: MIN(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) max: MAX(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) unit: Gbps + - Read Bandwidth - PCIe: + avg: AVG(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + min: MIN(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + max: MAX(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + unit: Gbps + - Write Bandwidth - HBM: + avg: AVG(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + min: MIN(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + max: MAX(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + unit: Gbps + - Write Bandwidth - Infinity Fabric™: + avg: AVG(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + min: MIN(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + max: MAX(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + unit: Gbps - Write Bandwidth - PCIe: avg: AVG(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) min: MIN(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) max: MAX(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) unit: Gbps + metric_descriptions: + Atomic Bandwidth: + plain: | + Total number of bytes looked up in the L2 cache for atomic requests, divided by total duration. + rst: | + Total number of bytes looked up in the L2 cache for atomic requests, divided by total duration. + Atomic Bandwidth - HBM: + plain: | + Total number of bytes due to L2 atomic requests due to HBM traffic, divided by total duration. + rst: | + Total number of bytes due to L2 atomic requests due to HBM traffic, divided by total duration. + Atomic Bandwidth - Infinity Fabric™: + plain: | + Total number of bytes due to L2 atomic requests due to Infinity Fabric traffic, divided by total duration. + rst: | + Total number of bytes due to L2 atomic requests due to Infinity Fabric traffic, divided by total duration. + Atomic Bandwidth - PCIe: + plain: | + Total number of bytes due to L2 atomic requests due to PCIe traffic, divided by total duration. + rst: | + Total number of bytes due to L2 atomic requests due to PCIe traffic, divided by total duration. + Read - HBM Stall: + plain: | + The number of cycles the L2-Fabric interface was stalled on read requests to the accelerator's local HBM as a percent of the total active L2 cycles. + rst: | + The number of cycles the L2-Fabric interface was stalled on read requests to the accelerator's local HBM as a percent of the total active L2 cycles. 
+ Read - Infinity Fabric Stall: + plain: | + The number of cycles the L2-Fabric interface was stalled on read requests to remote Infinity Fabric connected accelerators or CPUs as a percent of the total active L2 cycles. + rst: | + The number of cycles the L2-Fabric interface was stalled on read requests to remote Infinity Fabric connected accelerators or CPUs as a percent of the total active L2 cycles. + Read - PCIe Stall: + plain: | + The number of cycles the L2-Fabric interface was stalled on read requests to remote PCIe connected accelerators or CPUs as a percent of the total active L2 cycles. + rst: | + The number of cycles the L2-Fabric interface was stalled on read requests to remote PCIe connected accelerators or CPUs as a percent of the total active L2 cycles. + Read Bandwidth: + plain: | + Total number of bytes looked up in the L2 cache for read requests, divided by total duration. + rst: | + Total number of bytes looked up in the L2 cache for read requests, divided by total duration. + Read Bandwidth - HBM: + plain: | + Total number of bytes due to L2 read requests due to HBM traffic, divided by total duration. + rst: | + Total number of bytes due to L2 read requests due to HBM traffic, divided by total duration. + Read Bandwidth - Infinity Fabric™: + plain: | + Total number of bytes due to L2 read requests due to Infinity Fabric traffic, divided by total duration. + rst: | + Total number of bytes due to L2 read requests due to Infinity Fabric traffic, divided by total duration. + Read Bandwidth - PCIe: + plain: | + Total number of bytes due to L2 read requests due to PCIe traffic, divided by total duration. + rst: | + Total number of bytes due to L2 read requests due to PCIe traffic, divided by total duration. + Read Stall: + plain: | + The ratio of the total number of cycles the L2-Fabric interface was stalled on a read request to any destination (local HBM, remote PCIe® connected accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU) over the total active L2 cycles. + rst: | + The ratio of the total number of cycles the L2-Fabric interface was stalled on a read request to any destination (local HBM, remote PCIe® connected accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU) over the total active L2 cycles. + Write - HBM Stall: + plain: | + The number of cycles the L2-Fabric interface was stalled on write or atomic requests to accelerator's local HBM as a percent of the total active L2 cycles. + rst: | + The number of cycles the L2-Fabric interface was stalled on write or atomic requests to accelerator's local HBM as a percent of the total active L2 cycles. + Write - Infinity Fabric Stall: + plain: | + The number of cycles the L2-Fabric interface was stalled on write or atomic requests to remote Infinity Fabric connected accelerators or CPUs as a percent of the total active L2 cycles. + rst: | + The number of cycles the L2-Fabric interface was stalled on write or atomic requests to remote Infinity Fabric connected accelerators or CPUs as a percent of the total active L2 cycles. + Write - PCIe Stall: + plain: | + The number of cycles the L2-Fabric interface was stalled on write or atomic requests to remote PCIe connected accelerators or CPUs as a percent of the total active L2 cycles. + rst: | + The number of cycles the L2-Fabric interface was stalled on write or atomic requests to remote PCIe connected accelerators or CPUs as a percent of the total active L2 cycles. 
+ Write Bandwidth: + plain: | + Total number of bytes looked up in the L2 cache for write requests, divided by total duration. + rst: | + Total number of bytes looked up in the L2 cache for write requests, divided by total duration. + Write Bandwidth - HBM: + plain: | + Total number of bytes due to L2 write requests due to HBM traffic, divided by total duration. + rst: | + Total number of bytes due to L2 write requests due to HBM traffic, divided by total duration. + Write Bandwidth - Infinity Fabric™: + plain: | + Total number of bytes due to L2 write requests due to Infinity Fabric traffic, divided by total duration. + rst: | + Total number of bytes due to L2 write requests due to Infinity Fabric traffic, divided by total duration. + Write Bandwidth - PCIe: + plain: | + Total number of bytes due to L2 write requests due to PCIe traffic, divided by total duration. + rst: | + Total number of bytes due to L2 write requests due to PCIe traffic, divided by total duration. + Write Stall: + plain: | + The ratio of the total number of cycles the L2-Fabric interface was stalled on a write or atomic request to any destination (local HBM, remote accelerator or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU) over the total active L2 cycles. + rst: | + The ratio of the total number of cycles the L2-Fabric interface was stalled on a write or atomic request to any destination (local HBM, remote accelerator or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU) over the total active L2 cycles. Deletion: [] @@ -555,26 +695,26 @@ Modification: id: 201 title: System Speed-of-Light metrics: - - MFMA FLOPs (F8): + - MFMA FLOPs (BF16): pop: | - ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + - MFMA FLOPs (F16): + pop: | + ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - MFMA FLOPs (F64): pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) + - MFMA FLOPs (F8): + pop: | + ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - MFMA IOPs (Int8): pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - - MFMA FLOPs (F16): - pop: | - ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - - MFMA FLOPs (BF16): - pop: | - ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - Panel Config: id: 300 title: Memory Chart @@ -596,18 +736,18 @@ Modification: id: 402 title: Roofline Plot Points metrics: - - 
Performance (GFLOPs): - value: | - ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / (SUM(End_Timestamp - Start_Timestamp) / 1e9) ) / 1e9 - - AI L2: + - AI HBM: value: | - ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64 ) ) + ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) ) - AI L1: value: | ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) ) - - AI HBM: + - AI L2: value: | - ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - 
TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) ) + ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64 ) ) + - Performance (GFLOPs): + value: | + ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / (SUM(End_Timestamp - Start_Timestamp) / 1e9) ) / 1e9 - Panel Config: id: 600 title: Workgroup Manager (SPI) @@ -616,38 +756,38 @@ Modification: id: 601 title: Workgroup manager utilizations metrics: - - SGPR Writes: - max: | - MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - min: | - MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - avg: | - AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - Dispatched Wavefronts: max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - Dispatched Workgroups: max: | MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) + avg: | + AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) min: | MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) + - SGPR Writes: + max: | + MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) avg: | - AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) + AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) + min: | + MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) 
else None)) - Scheduler-Pipe Utilization: max: | MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - min: | - MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) avg: | AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) + min: | + MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - VGPR Writes: max: | MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - min: | - MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) avg: | AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) + min: | + MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - Panel Config: id: 700 title: Wavefront @@ -658,8 +798,8 @@ Modification: metrics: - Total Wavefronts: max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - Panel Config: id: 1100 title: Compute Units - Compute Pipeline @@ -668,6 +808,10 @@ Modification: id: 1101 title: Compute Speed-of-Light metrics: + - MFMA FLOPs (BF16): + pop: | + ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - MFMA FLOPs (F16): pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) @@ -676,18 +820,14 @@ Modification: pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) - - MFMA IOPs (INT8): - pop: | - ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - - MFMA FLOPs (BF16): - pop: | - ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - MFMA FLOPs (F8): pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + - MFMA IOPs (INT8): + pop: | + ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - metric_table: id: 1103 title: Arithmetic Operations @@ -695,10 +835,10 @@ 
Modification: - FLOPs (Total): max: | MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) - min: | - MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) avg: | AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) + min: | + MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) - Panel Config: id: 1700 title: L2 Cache @@ -717,22 +857,22 @@ Modification: - Read BW: max: | MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) - min: | - MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) avg: | AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) + min: | + MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) - metric_table: id: 1706 title: L2 - Fabric interface detailed metrics metrics: - - Read (64B): - max: MAX((TCC_EA0_RDREQ_64B_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_64B_sum / $denom)) - avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom)) - HBM Write and Atomic: max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) - min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) avg: 
AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) + min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) + - Read (64B): + max: MAX((TCC_EA0_RDREQ_64B_sum / $denom)) + avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_64B_sum / $denom)) - Panel Config: id: 1800 title: L2 Cache (per Channel) @@ -750,6 +890,6 @@ Modification: title: L2-Fabric Write and Atomic Stall (Cycles per normUnit) metrics: - ::_1: - ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) - ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) / $denom)) ea write stall - if: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1]) / $denom)) + ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) / $denom)) + ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml index d0efd3e600b..f2d00cecf19 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml @@ -70,7 +70,7 @@ Panel Config: peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - Active CUs: + Active CUs (deprecated): value: $numActiveCUs unit: CUs peak: $cu_per_gpu @@ -226,6 +226,11 @@ Panel Config: peak: None pop: None coll_level: SQ_IFETCH_LEVEL + CU Utilization: + value: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + peak: 100 + pop: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) metrics_description: VALU FLOPs: >- The total floating-point operations executed per second on the VALU. @@ -267,8 +272,8 @@ Panel Config: Note: this does not include any 8-bit integer operations from VALU instructions. This is also presented as a percent of the peak theoretical INT8 MFMA operations achievable on the specific accelerator. - Active CUs: Total number of active compute units (CUs) on the accelerator during - the kernel execution. + Active CUs (deprecated): Total number of active compute units (CUs) on the accelerator during + the kernel execution. (Deprecated - See CU Utilization instead) SALU Utilization: Indicates what percent of the kernel's duration the SALU was busy executing instructions. Computed as the ratio of the total number of cycles spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles. @@ -353,3 +358,7 @@ Panel Config: of all L1I requests. L1I Fetch Latency: The average number of cycles spent to fetch instructions to a CU. + CU Utilization: The percent of total SIMD cycles in the kernel + where any SIMD on a CU was actively doing any work, summed + over all CUs. Low values (less than 100%) indicate that the accelerator was + not fully saturated by the kernel, or a potential load-imbalance issue. 
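For readers of this hunk, the new CU Utilization formula, AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)), is easy to sanity-check by hand. Here is a small worked example in Python with made-up counter values; the numbers are illustrative only, not measurements.

# Worked example of the CU Utilization expression added above:
# AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
# All values below are hypothetical.
SQ_BUSY_CU_CYCLES = 7_500_000      # cycles any SIMD on a CU was busy, summed over CUs
GRBM_GUI_ACTIVE_PER_XCD = 100_000  # kernel-active cycles per XCD
cu_per_gpu = 100                   # the $cu_per_gpu SoC parameter

cu_utilization = 100 * SQ_BUSY_CU_CYCLES / (GRBM_GUI_ACTIVE_PER_XCD * cu_per_gpu)
print(f"CU Utilization: {cu_utilization:.1f}%")  # -> 75.0%

A value like 75% would point at exactly the under-saturation or load-imbalance cases called out in the metric description above.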
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml index 81ce3c2e684..23464a9d469 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml @@ -33,7 +33,7 @@ Panel Config: value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) BR: value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) - Active CUs: + Active CUs (deprecated): value: $numActiveCUs Num CUs: value: $cu_per_gpu @@ -159,8 +159,8 @@ Panel Config: GWS: Total number of GDS (global data sync) instructions issued per normalization unit. BR: Total number of BRANCH instructions issued per normalization unit. - Active CUs: Total number of active compute units (CUs) on the accelerator during - the kernel execution. + Active CUs (deprecated): Total number of active compute units (CUs) on the accelerator during + the kernel execution. (Deprecated - See CU Utilization instead) Num CUs: Total number of compute units (CUs) on the accelerator. VGPR: >- The number of architected vector general-purpose registers allocated diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/config_delta/gfx950_diff.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/config_delta/gfx950_diff.yaml index 5d64c7a5e07..ed60c4b48f7 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/config_delta/gfx950_diff.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/config_delta/gfx950_diff.yaml @@ -28,6 +28,17 @@ Addition: - L2 Wr Lat: value: | ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0) + metric_descriptions: + L2 Rd Lat: + plain: | + Calculated as the average number of cycles that the vL1D cache took to issue and receive read requests from the L2 Cache. This number also includes requests for atomics with return values. + rst: | + Calculated as the average number of cycles that the vL1D cache took to issue and receive read requests from the L2 Cache. This number also includes requests for atomics with return values. + L2 Wr Lat: + plain: | + Calculated as the average number of cycles that the vL1D cache took to issue and receive acknowledgement of a write request to the L2 Cache. This number also includes requests for atomics without return values. + rst: | + Calculated as the average number of cycles that the vL1D cache took to issue and receive acknowledgement of a write request to the L2 Cache. This number also includes requests for atomics without return values. - Panel Config: id: 400 title: Roofline @@ -41,6 +52,12 @@ Addition: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) unit: GFLOP/s peak: $MFMA_FLOPs_F6F4_empirical_peak + metric_descriptions: + MFMA FLOPs (F6F4): + plain: | + The total number of 4-bit and 6-bit floating point MFMA operations executed per second. Note: this does not include any floating point operations from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. 
It is supported on AMD Instinct MI350 series (gfx950) and later only. + rst: | + The total number of 4-bit and 6-bit floating point MFMA operations executed per second. Note: this does not include any floating point operations from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. It is supported on AMD Instinct MI350 series (gfx950) and later only. - Panel Config: id: 500 title: Command Processor (CPC/CPF) @@ -49,14 +66,6 @@ Addition: id: 502 title: Command processor packet processor (CPC) metrics: - - CPC SYNC FIFO Full Rate: - avg: | - AVG((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None) - min: | - MIN((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None) - max: | - MAX((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None) - unit: pct - CPC ADC Utilization: avg: AVG((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None) min: MIN((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None) @@ -67,6 +76,14 @@ Addition: min: MIN((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None) max: MAX((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None) unit: pct + - CPC SYNC FIFO Full Rate: + avg: | + AVG((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None) + min: | + MIN((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None) + max: | + MAX((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None) + unit: pct - Panel Config: id: 600 title: Workgroup Manager (SPI) @@ -165,16 +182,36 @@ Addition: id: 1202 title: LDS Statistics metrics: + - LDS ATOMIC: + avg: AVG((SQ_INSTS_LDS_ATOMIC / $denom)) + min: MIN((SQ_INSTS_LDS_ATOMIC / $denom)) + max: MAX((SQ_INSTS_LDS_ATOMIC / $denom)) + unit: (instr + $normUnit) - LDS ATOMIC Bandwidth: avg: AVG(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) min: MIN(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) max: MAX(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) units: Gbps + - LDS Command FIFO Full Rate: + avg: AVG((SQ_LDS_CMD_FIFO_FULL / $denom)) + min: MIN((SQ_LDS_CMD_FIFO_FULL / $denom)) + max: MAX((SQ_LDS_CMD_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + - LDS Data FIFO Full Rate: + avg: AVG((SQ_LDS_DATA_FIFO_FULL / $denom)) + min: MIN((SQ_LDS_DATA_FIFO_FULL / $denom)) + max: MAX((SQ_LDS_DATA_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) - LDS LOAD: avg: AVG((SQ_INSTS_LDS_LOAD / $denom)) min: MIN((SQ_INSTS_LDS_LOAD / $denom)) max: MAX((SQ_INSTS_LDS_LOAD / $denom)) unit: (instr + $normUnit) + - LDS LOAD Bandwidth: + avg: AVG(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + min: MIN(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + max: MAX(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + units: Gbps - LDS STORE: avg: AVG((SQ_INSTS_LDS_STORE / $denom)) min: MIN((SQ_INSTS_LDS_STORE / $denom)) @@ -185,39 +222,10 @@ Addition: min: MIN(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) max: MAX(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) units: Gbps - - LDS LOAD Bandwidth: - avg: AVG(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - 
Start_Timestamp)) - min: MIN(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - max: MAX(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - units: Gbps - - LDS Command FIFO Full Rate: - avg: AVG((SQ_LDS_CMD_FIFO_FULL / $denom)) - min: MIN((SQ_LDS_CMD_FIFO_FULL / $denom)) - max: MAX((SQ_LDS_CMD_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) - - LDS ATOMIC: - avg: AVG((SQ_INSTS_LDS_ATOMIC / $denom)) - min: MIN((SQ_INSTS_LDS_ATOMIC / $denom)) - max: MAX((SQ_INSTS_LDS_ATOMIC / $denom)) - unit: (instr + $normUnit) - - LDS Data FIFO Full Rate: - avg: AVG((SQ_LDS_DATA_FIFO_FULL / $denom)) - min: MIN((SQ_LDS_DATA_FIFO_FULL / $denom)) - max: MAX((SQ_LDS_DATA_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) - Panel Config: id: 1500 title: Address Processing Unit and Data Return Path (TA/TD) metric_tables: - - metric_table: - id: 1504 - title: Vector L1 data-return path or Texture Data (TD) - metrics: - - Write Ack Instructions: - avg: AVG((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) - min: MIN((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) - max: MAX((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - metric_table: id: 1502 title: Instruction counts @@ -232,6 +240,21 @@ Addition: min: MIN((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) + - metric_table: + id: 1504 + title: Vector L1 data-return path or Texture Data (TD) + metrics: + - Write Ack Instructions: + avg: AVG((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) + min: MIN((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) + max: MAX((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) + unit: (Instructions + $normUnit) + metric_descriptions: + Write Ack Instructions: + plain: | + The total number of write acknowledgements submitted by data-return unit to SQ, summed over all compute units on the accelerator, per normalization unit. + rst: | + The total number of write acknowledgements submitted by data-return unit to SQ, summed over all compute units on the accelerator, per normalization unit. 
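The LDS LOAD/STORE/ATOMIC Bandwidth expressions in this hunk all share one shape: the counter is scaled by 64 bytes and divided by the kernel duration. Assuming the timestamps are in nanoseconds (so bytes per nanosecond equals GB/s; the YAML's "Gbps" label is taken as-is from the config), a worked example with hypothetical values:

# Hedged sketch of the LDS bandwidth formula used above; values are made up,
# and the ns-timestamp assumption is the editor's, not stated in the config.
SQ_INSTS_LDS_LOAD_BANDWIDTH = 2_000_000  # counter, 64 bytes per count
Start_Timestamp = 0                      # ns, hypothetical
End_Timestamp = 1_000_000                # ns -> a 1 ms kernel

bw = 64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)
print(f"LDS load bandwidth: {bw:.1f} GB/s")  # -> 128.0 GB/s with these numbers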
- Panel Config: id: 1600 title: Vector L1 Data Cache @@ -240,45 +263,25 @@ Addition: id: 1602 title: vL1D cache stall metrics metrics: - - Stalled on Request FIFO: + - Stalled on Address: expr: | - (((100 * TCP_RFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None) + (((100 * TCP_TCP_TA_ADDR_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None) + - Stalled on Data: + expr: | + (((100 * TCP_TCP_TA_DATA_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None) - Stalled on Latency FIFO: expr: | (((100 * TCP_LFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None) - - Stalled on Address: - expr: | - (((100 * TCP_TCP_TA_ADDR_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None) - Stalled on Read Return: expr: | (((100 * TCP_TCR_RDRET_STALL_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None) - - Stalled on Data: + - Stalled on Request FIFO: expr: | - (((100 * TCP_TCP_TA_DATA_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None) + (((100 * TCP_RFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None) - metric_table: id: 1603 title: vL1D cache access metrics metrics: - - Tag RAM 2 Req: - avg: AVG((TCP_TAGRAM2_REQ_sum / $denom)) - min: MIN((TCP_TAGRAM2_REQ_sum / $denom)) - max: MAX((TCP_TAGRAM2_REQ_sum / $denom)) - unit: (Req + $normUnit) - - Tag RAM 0 Req: - avg: AVG((TCP_TAGRAM0_REQ_sum / $denom)) - min: MIN((TCP_TAGRAM0_REQ_sum / $denom)) - max: MAX((TCP_TAGRAM0_REQ_sum / $denom)) - unit: (Req + $normUnit) - - Tag RAM 3 Req: - avg: AVG((TCP_TAGRAM3_REQ_sum / $denom)) - min: MIN((TCP_TAGRAM3_REQ_sum / $denom)) - max: MAX((TCP_TAGRAM3_REQ_sum / $denom)) - unit: (Req + $normUnit) - - Tag RAM 1 Req: - avg: AVG((TCP_TAGRAM1_REQ_sum / $denom)) - min: MIN((TCP_TAGRAM1_REQ_sum / $denom)) - max: MAX((TCP_TAGRAM1_REQ_sum / $denom)) - unit: (Req + $normUnit) - L1 Access Latency: avg: AVG((TCP_TCP_LATENCY_sum / $denom)) min: MIN((TCP_TCP_LATENCY_sum / $denom)) @@ -294,59 +297,95 @@ Addition: min: MIN((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) max: MAX((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) unit: (Cycles + $normUnit) + - Tag RAM 0 Req: + avg: AVG((TCP_TAGRAM0_REQ_sum / $denom)) + min: MIN((TCP_TAGRAM0_REQ_sum / $denom)) + max: MAX((TCP_TAGRAM0_REQ_sum / $denom)) + unit: (Req + $normUnit) + - Tag RAM 1 Req: + avg: AVG((TCP_TAGRAM1_REQ_sum / $denom)) + min: MIN((TCP_TAGRAM1_REQ_sum / $denom)) + max: MAX((TCP_TAGRAM1_REQ_sum / $denom)) + unit: (Req + $normUnit) + - Tag RAM 2 Req: + avg: AVG((TCP_TAGRAM2_REQ_sum / $denom)) + min: MIN((TCP_TAGRAM2_REQ_sum / $denom)) + max: MAX((TCP_TAGRAM2_REQ_sum / $denom)) + unit: (Req + $normUnit) + - Tag RAM 3 Req: + avg: AVG((TCP_TAGRAM3_REQ_sum / $denom)) + min: MIN((TCP_TAGRAM3_REQ_sum / $denom)) + max: MAX((TCP_TAGRAM3_REQ_sum / $denom)) + unit: (Req + $normUnit) - metric_table: id: 1605 title: L1 Unified Translation Cache (UTCL1) metrics: - - Misses under Translation Miss: - avg: AVG((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) - units: (Req + $normUnit) - Inflight Req: avg: AVG((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom)) min: MIN((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom)) max: MAX((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom)) units: (Req + $normUnit) + - Misses under Translation Miss: + avg: AVG((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) + min: 
MIN((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) + units: (Req + $normUnit) - metric_table: id: 1606 title: L1D Addr Translation Stalls metrics: - - Serialization Stall: - avg: AVG((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) - min: MIN((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) - max: MAX((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) - units: (Cycles + $normUnit) - Cache Full Stall: avg: AVG((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) min: MIN((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) max: MAX((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) units: (Cycles + $normUnit) - - Resident Page Full Stall: - avg: AVG((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) - min: MIN((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) - max: MAX((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) - units: (Cycles + $normUnit) - - UTCL2 Stall: - avg: AVG((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) - min: MIN((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) - max: MAX((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) + - Cache Miss Stall: + avg: AVG((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) + min: MIN((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) + max: MAX((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) units: (Cycles + $normUnit) - Latency FIFO Stall: avg: AVG((TCP_UTCL1_LFIFO_FULL_sum / $denom)) min: MIN((TCP_UTCL1_LFIFO_FULL_sum / $denom)) max: MAX((TCP_UTCL1_LFIFO_FULL_sum / $denom)) units: (Cycles + $normUnit) + - Resident Page Full Stall: + avg: AVG((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) + min: MIN((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) + max: MAX((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) + units: (Cycles + $normUnit) + - Serialization Stall: + avg: AVG((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) + min: MIN((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) + max: MAX((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) + units: (Cycles + $normUnit) - Thrashing Stall: avg: AVG((TCP_UTCL1_THRASHING_STALL_sum / $denom)) min: MIN((TCP_UTCL1_THRASHING_STALL_sum / $denom)) max: MAX((TCP_UTCL1_THRASHING_STALL_sum / $denom)) units: (Cycles + $normUnit) - - Cache Miss Stall: - avg: AVG((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) + - UTCL2 Stall: + avg: AVG((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) + min: MIN((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) + max: MAX((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) units: (Cycles + $normUnit) + metric_descriptions: + L1 Access Latency: + plain: | + Calculated as the average number of cycles that a vL1D cache line request spent in the vL1D cache pipeline. + rst: | + Calculated as the average number of cycles that a vL1D cache line request spent in the vL1D cache pipeline. + L1-L2 Read Latency: + plain: | + Calculated as the average number of cycles that the vL1D cache took to issue and receive read requests from the L2 Cache. This number also includes requests for atomics with return values. + rst: | + Calculated as the average number of cycles that the vL1D cache took to issue and receive read requests from the L2 Cache. This number also includes requests for atomics with return values. + L1-L2 Write Latency: + plain: | + Calculated as the average number of cycles that the vL1D cache took to issue and receive acknowledgement of a write request to the L2 Cache. This number also includes requests for atomics without return values. 
+ rst: | + Calculated as the average number of cycles that the vL1D cache took to issue and receive acknowledgement of a write request to the L2 Cache. This number also includes requests for atomics without return values. - Panel Config: id: 1700 title: L2 Cache @@ -380,26 +419,26 @@ Addition: min: MIN(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) max: MAX(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) unit: Gbps + - Bypasss Req: + avg: AVG((TCC_BYPASS_REQ_sum / $denom)) + min: MIN((TCC_BYPASS_REQ_sum / $denom)) + max: MAX((TCC_BYPASS_REQ_sum / $denom)) + unit: (Req + $normUnit) - Input Buffer Req: avg: AVG((TCC_IB_REQ_sum / $denom)) min: MIN((TCC_IB_REQ_sum / $denom)) max: MAX((TCC_IB_REQ_sum / $denom)) unit: (Req + $normUnit) - - Write Bandwidth: - avg: AVG(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - unit: Gbps - Read Bandwidth: avg: AVG(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) min: MIN(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) max: MAX(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) unit: Gbps - - Bypasss Req: - avg: AVG((TCC_BYPASS_REQ_sum / $denom)) - min: MIN((TCC_BYPASS_REQ_sum / $denom)) - max: MAX((TCC_BYPASS_REQ_sum / $denom)) - unit: (Req + $normUnit) + - Write Bandwidth: + avg: AVG(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) + min: MIN(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) + max: MAX(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) + unit: Gbps - metric_table: id: 1704 title: L2 Cache Stalls @@ -443,16 +482,6 @@ Addition: max: | MAX(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) unit: pct - - Write - PCIe Stall: - type: PCIe Stall - transaction: Write - avg: | - AVG(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - min: | - MIN(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - max: | - MAX(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) - unit: pct - Read - PCIe Stall: type: PCIe Stall transaction: Read @@ -463,6 +492,16 @@ Addition: max: | MAX(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) unit: pct + - Write - HBM Stall: + type: HBM Stall + transaction: Write + avg: | + AVG(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + min: | + MIN(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + max: | + MAX(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + unit: pct - Write - Infinity Fabric Stall: type: Infinity Fabric™ Stall transaction: Write @@ -473,75 +512,176 @@ Addition: max: | MAX(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) unit: pct - - Write - HBM Stall: - type: HBM Stall + - Write - PCIe Stall: + type: PCIe Stall transaction: Write avg: | - AVG(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + AVG(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) min: | - MIN(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) 
else None)) + MIN(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) max: | - MAX(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) + MAX(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None)) unit: pct - metric_table: id: 1706 title: L2 - Fabric interface detailed metrics metrics: - - Write Bandwidth - HBM: - avg: AVG(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - unit: Gbps - - Read (128B): - avg: AVG((TCC_EA0_RDREQ_128B_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_128B_sum / $denom)) - max: MAX((TCC_EA0_RDREQ_128B_sum / $denom)) - unit: (Req + $normUnit) - Atomic - HBM: avg: AVG((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom)) min: MIN((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom)) max: MAX((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom)) unit: (Req + $normUnit) - - Read Bandwidth - PCIe: - avg: AVG(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - unit: Gbps - Atomic Bandwidth - HBM: avg: AVG(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) min: MIN(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) max: MAX(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) unit: Gbps + - Atomic Bandwidth - Infinity Fabric™: + avg: AVG(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + min: MIN(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + max: MAX(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + unit: Gbps + - Atomic Bandwidth - PCIe: + avg: AVG(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + min: MIN(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + max: MAX(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + unit: Gbps + - Read (128B): + avg: AVG((TCC_EA0_RDREQ_128B_sum / $denom)) + min: MIN((TCC_EA0_RDREQ_128B_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_128B_sum / $denom)) + unit: (Req + $normUnit) + - Read Bandwidth - HBM: + avg: AVG(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + min: MIN(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + max: MAX(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + unit: Gbps - Read Bandwidth - Infinity Fabric™: avg: AVG(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) min: MIN(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) max: MAX(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) unit: Gbps - - Write Bandwidth - PCIe: - avg: AVG(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + - Read Bandwidth - PCIe: + avg: AVG(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + min: MIN(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + max: MAX(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) unit: Gbps - - Atomic Bandwidth - PCIe: 
- avg: AVG(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + - Write Bandwidth - HBM: + avg: AVG(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + min: MIN(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + max: MAX(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) unit: Gbps - Write Bandwidth - Infinity Fabric™: avg: AVG(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) min: MIN(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) max: MAX(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) unit: Gbps - - Atomic Bandwidth - Infinity Fabric™: - avg: AVG(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - unit: Gbps - - Read Bandwidth - HBM: - avg: AVG(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + - Write Bandwidth - PCIe: + avg: AVG(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + min: MIN(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) + max: MAX(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) unit: Gbps + metric_descriptions: + Atomic Bandwidth: + plain: | + Total number of bytes looked up in the L2 cache for atomic requests, divided by total duration. + rst: | + Total number of bytes looked up in the L2 cache for atomic requests, divided by total duration. + Atomic Bandwidth - HBM: + plain: | + Total number of bytes due to L2 atomic requests due to HBM traffic, divided by total duration. + rst: | + Total number of bytes due to L2 atomic requests due to HBM traffic, divided by total duration. + Atomic Bandwidth - Infinity Fabric™: + plain: | + Total number of bytes due to L2 atomic requests due to Infinity Fabric traffic, divided by total duration. + rst: | + Total number of bytes due to L2 atomic requests due to Infinity Fabric traffic, divided by total duration. + Atomic Bandwidth - PCIe: + plain: | + Total number of bytes due to L2 atomic requests due to PCIe traffic, divided by total duration. + rst: | + Total number of bytes due to L2 atomic requests due to PCIe traffic, divided by total duration. + Read - HBM Stall: + plain: | + The number of cycles the L2-Fabric interface was stalled on read requests to the accelerator's local HBM as a percent of the total active L2 cycles. + rst: | + The number of cycles the L2-Fabric interface was stalled on read requests to the accelerator's local HBM as a percent of the total active L2 cycles. + Read - Infinity Fabric Stall: + plain: | + The number of cycles the L2-Fabric interface was stalled on read requests to remote Infinity Fabric connected accelerators or CPUs as a percent of the total active L2 cycles. + rst: | + The number of cycles the L2-Fabric interface was stalled on read requests to remote Infinity Fabric connected accelerators or CPUs as a percent of the total active L2 cycles. 
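
Every stall expression above follows the same guarded-division idiom: scale by 100, divide by TCC_BUSY_sum, and fall back to None when the interface recorded no busy cycles, so empty runs are not reported as 0 percent. A small sketch of that pattern; the helper name is hypothetical.

from typing import Optional

def stall_pct(stall_cycles: int, busy_cycles: int) -> Optional[float]:
    # Mirrors "(100 * STALL_sum / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None"
    if busy_cycles == 0:
        return None   # no active L2 cycles: metric is undefined, not zero
    return 100.0 * stall_cycles / busy_cycles

print(stall_pct(1_200, 48_000))  # 2.5 (percent of active L2 cycles)
print(stall_pct(0, 0))           # None
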
+ Read - PCIe Stall: + plain: | + The number of cycles the L2-Fabric interface was stalled on read requests to remote PCIe connected accelerators or CPUs as a percent of the total active L2 cycles. + rst: | + The number of cycles the L2-Fabric interface was stalled on read requests to remote PCIe connected accelerators or CPUs as a percent of the total active L2 cycles. + Read Bandwidth: + plain: | + Total number of bytes looked up in the L2 cache for read requests, divided by total duration. + rst: | + Total number of bytes looked up in the L2 cache for read requests, divided by total duration. + Read Bandwidth - HBM: + plain: | + Total number of bytes due to L2 read requests due to HBM traffic, divided by total duration. + rst: | + Total number of bytes due to L2 read requests due to HBM traffic, divided by total duration. + Read Bandwidth - Infinity Fabric™: + plain: | + Total number of bytes due to L2 read requests due to Infinity Fabric traffic, divided by total duration. + rst: | + Total number of bytes due to L2 read requests due to Infinity Fabric traffic, divided by total duration. + Read Bandwidth - PCIe: + plain: | + Total number of bytes due to L2 read requests due to PCIe traffic, divided by total duration. + rst: | + Total number of bytes due to L2 read requests due to PCIe traffic, divided by total duration. + Read Stall: + plain: | + The ratio of the total number of cycles the L2-Fabric interface was stalled on a read request to any destination (local HBM, remote PCIe® connected accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU) over the total active L2 cycles. + rst: | + The ratio of the total number of cycles the L2-Fabric interface was stalled on a read request to any destination (local HBM, remote PCIe® connected accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU) over the total active L2 cycles. + Write - HBM Stall: + plain: | + The number of cycles the L2-Fabric interface was stalled on write or atomic requests to accelerator's local HBM as a percent of the total active L2 cycles. + rst: | + The number of cycles the L2-Fabric interface was stalled on write or atomic requests to accelerator's local HBM as a percent of the total active L2 cycles. + Write - Infinity Fabric Stall: + plain: | + The number of cycles the L2-Fabric interface was stalled on write or atomic requests to remote Infinity Fabric connected accelerators or CPUs as a percent of the total active L2 cycles. + rst: | + The number of cycles the L2-Fabric interface was stalled on write or atomic requests to remote Infinity Fabric connected accelerators or CPUs as a percent of the total active L2 cycles. + Write - PCIe Stall: + plain: | + The number of cycles the L2-Fabric interface was stalled on write or atomic requests to remote PCIe connected accelerators or CPUs as a percent of the total active L2 cycles. + rst: | + The number of cycles the L2-Fabric interface was stalled on write or atomic requests to remote PCIe connected accelerators or CPUs as a percent of the total active L2 cycles. + Write Bandwidth: + plain: | + Total number of bytes looked up in the L2 cache for write requests, divided by total duration. + rst: | + Total number of bytes looked up in the L2 cache for write requests, divided by total duration. + Write Bandwidth - HBM: + plain: | + Total number of bytes due to L2 write requests due to HBM traffic, divided by total duration. + rst: | + Total number of bytes due to L2 write requests due to HBM traffic, divided by total duration.
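
The per-interface bandwidth rows above count 32-byte transactions, so bytes are recovered as counter * 32 before dividing by the kernel duration. A hedged sketch of that breakdown across HBM, Infinity Fabric, and PCIe follows; the dictionary keys and helper are assumptions for illustration.

def fabric_bw_breakdown(counts_32b: dict, duration_ns: int) -> dict:
    # counter * 32 bytes per transaction, divided by duration in ns
    return {iface: 32 * n / duration_ns for iface, n in counts_32b.items()}

# e.g. TCC_EA0_WRREQ_WRITE_DRAM_32B_sum, *_GMI_32B_sum, *_IO_32B_sum
writes = {"hbm": 4_000_000, "infinity_fabric": 150_000, "pcie": 2_500}
print(fabric_bw_breakdown(writes, duration_ns=1_500_000))
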
+ Write Bandwidth - Infinity Fabric™: + plain: | + Total number of bytes due to L2 write requests due to Infinity Fabric traffic, divided by total duration. + rst: | + Total number of bytes due to L2 write requests due to Infinity Fabric traffic, divided by total duration. + Write Bandwidth - PCIe: + plain: | + Total number of bytes due to L2 write requests due to PCIe traffic, divided by total duration. + rst: | + Total number of bytes due to L2 write requests due to PCIe traffic, divided by total duration. + Write Stall: + plain: | + The ratio of the total number of cycles the L2-Fabric interface was stalled on a write or atomic request to any destination (local HBM, remote accelerator or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU) over the total active L2 cycles. + rst: | + The ratio of the total number of cycles the L2-Fabric interface was stalled on a write or atomic request to any destination (local HBM, remote accelerator or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU) over the total active L2 cycles. Deletion: [] @@ -555,27 +695,27 @@ Modification: id: 201 title: System Speed-of-Light metrics: - - MFMA IOPs (Int8): - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + - MFMA FLOPs (BF16): + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: | - ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - MFMA FLOPs (F16): peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - - MFMA FLOPs (F8): - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - unit: GFLOP/s - pop: | - ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - MFMA FLOPs (F64): peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) - - MFMA FLOPs (BF16): - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + - MFMA FLOPs (F8): + unit: GFLOP/s + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) pop: | - ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + - MFMA IOPs (Int8): + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + pop: | + ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - Panel Config: id: 300 title: Memory Chart @@ -584,11 +724,11 @@ Modification: id: 301 title: Memory Chart metrics: + - Wavefronts: + value: ROUND(AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE), 0) - Workgroups: value: | ROUND(AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS), 0) - - Wavefronts: - value: ROUND(AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE), 0) - Panel Config: id: 400 title: Roofline @@ -597,15 +737,15 @@ Modification: id: 402 title: Roofline Plot Points metrics: - - 
AI L2: - value: | - ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64 ) ) - AI HBM: value: | ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) ) - AI L1: value: | ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) ) + - AI L2: + value: | + ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64 ) ) - Performance (GFLOPs): value: | ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / (SUM(End_Timestamp - Start_Timestamp) / 
1e9) ) / 1e9 @@ -617,38 +757,38 @@ Modification: id: 601 title: Workgroup manager utilizations metrics: + - Dispatched Wavefronts: + min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - Dispatched Workgroups: - max: | - MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) - avg: | - AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) min: | MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) - - VGPR Writes: - max: | - MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) avg: | - AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - min: | - MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - - Scheduler-Pipe Utilization: + AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) max: | - MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - avg: | - AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - min: | - MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) + MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS) - SGPR Writes: + min: | + MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) + avg: | + AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) max: | MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) + - Scheduler-Pipe Utilization: + min: | + MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) avg: | - AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) + AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) + max: | + MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) + - VGPR Writes: min: | - MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - - Dispatched Wavefronts: - max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - 
avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) + avg: | + AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) + max: | + MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - Panel Config: id: 700 title: Wavefront @@ -658,9 +798,9 @@ Modification: title: Wavefront Launch Stats metrics: - Total Wavefronts: - max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) + max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - Panel Config: id: 1100 title: Compute Units - Compute Pipeline @@ -669,37 +809,37 @@ Modification: id: 1101 title: Compute Speed-of-Light metrics: - - MFMA FLOPs (F8): - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + - MFMA FLOPs (BF16): + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: | - ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + - MFMA FLOPs (F16): + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + pop: | + ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - MFMA FLOPs (F64): peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) - - MFMA FLOPs (BF16): - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + - MFMA FLOPs (F8): + peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) pop: | - ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - MFMA IOPs (INT8): peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) pop: | ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - - MFMA FLOPs (F16): - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: | - ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - metric_table: id: 1103 title: Arithmetic Operations metrics: - FLOPs (Total): - max: | - MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + 
(SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) - avg: | - AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) min: | MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) + avg: | + AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) + max: | + MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom)) - Panel Config: id: 1700 title: L2 Cache @@ -716,31 +856,31 @@ Modification: title: L2-Fabric interface metrics metrics: - Read BW: - max: | - MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) - avg: | - AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) min: | MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) - - Remote Read Traffic: - max: | - MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) avg: | - AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / 
TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) + max: | + MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) + - Remote Read Traffic: min: | MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + avg: | + AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) + max: | + MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - metric_table: id: 1706 title: L2 - Fabric interface detailed metrics metrics: - HBM Write and Atomic: - max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) - avg: AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) + avg: AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) + max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) - Read (64B): - max: MAX((TCC_EA0_RDREQ_64B_sum / $denom)) - avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom)) min: MIN((TCC_EA0_RDREQ_64B_sum / $denom)) + avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom)) + max: MAX((TCC_EA0_RDREQ_64B_sum / $denom)) - Panel Config: id: 1800 title: L2 Cache (per Channel) @@ -750,14 +890,14 @@ Modification: title: L2-Fabric Read Stall (Cycles per normUnit) metrics: - ::_1: - ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom)) ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1]) / $denom)) + ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom)) - metric_table: id: 1810 title: L2-Fabric Write and Atomic Stall (Cycles per normUnit) metrics: - ::_1: - ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) / $denom)) ea write stall - if: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1]) / $denom)) ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1]) / $denom)) + ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) / $denom)) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml index 5f76eb89372..f1c48cef170 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml @@ -70,7 +70,7 @@ Panel Config: peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - Active CUs: + Active CUs (deprecated): value: $numActiveCUs unit: CUs peak: $cu_per_gpu @@ -226,6 +226,11 @@ Panel Config: peak: None pop: None coll_level: SQ_IFETCH_LEVEL + CU Utilization: + value: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) + unit: Pct + peak: 100 + pop: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) metrics_description: VALU FLOPs: >- The total floating-point operations executed per second on the VALU. 
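
The new CU Utilization row above divides busy CU cycles by the GPU active cycles per XCD scaled by the CU count, which is what replaces the deprecated Active CUs figure. A rough sketch, assuming the counters carry the meanings given in the description; the function itself is illustrative only.

def cu_utilization_pct(sq_busy_cu_cycles: int,
                       grbm_gui_active_per_xcd: int,
                       cu_per_gpu: int) -> float:
    # AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
    return 100.0 * sq_busy_cu_cycles / (grbm_gui_active_per_xcd * cu_per_gpu)

# Values well under 100 suggest an unsaturated accelerator or a load imbalance.
print(cu_utilization_pct(9_120_000, grbm_gui_active_per_xcd=50_000, cu_per_gpu=304))  # 60.0
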
@@ -267,8 +272,8 @@ Panel Config: Note: this does not include any 8-bit integer operations from VALU instructions. This is also presented as a percent of the peak theoretical INT8 MFMA operations achievable on the specific accelerator. - Active CUs: Total number of active compute units (CUs) on the accelerator during - the kernel execution. + Active CUs (deprecated): Total number of active compute units (CUs) on the accelerator during + the kernel execution. (Deprecated - See CU Utilization instead) SALU Utilization: Indicates what percent of the kernel's duration the SALU was busy executing instructions. Computed as the ratio of the total number of cycles spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles. @@ -353,3 +358,7 @@ Panel Config: of all L1I requests. L1I Fetch Latency: The average number of cycles spent to fetch instructions to a CU. + CU Utilization: The percent of total SIMD cycles in the kernel + where any SIMD on a CU was actively doing any work, summed + over all CUs. Low values (less than 100%) indicate that the accelerator was + not fully saturated by the kernel, or a potential load-imbalance issue. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml index 81ce3c2e684..23464a9d469 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml @@ -33,7 +33,7 @@ Panel Config: value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) BR: value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) - Active CUs: + Active CUs (deprecated): value: $numActiveCUs Num CUs: value: $cu_per_gpu @@ -159,8 +159,8 @@ Panel Config: GWS: Total number of GDS (global data sync) instructions issued per normalization unit. BR: Total number of BRANCH instructions issued per normalization unit. - Active CUs: Total number of active compute units (CUs) on the accelerator during - the kernel execution. + Active CUs (deprecated): Total number of active compute units (CUs) on the accelerator during + the kernel execution. (Deprecated - See CU Utilization instead) Num CUs: Total number of compute units (CUs) on the accelerator. 
VGPR: >- The number of architected vector general-purpose registers allocated diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/config_delta/gfx950_diff.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/config_delta/gfx950_diff.yaml index 72a6aed7fe6..b1786d09832 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/config_delta/gfx950_diff.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/config_delta/gfx950_diff.yaml @@ -22,12 +22,23 @@ Addition: id: 301 title: Memory Chart metrics: - - L2 Wr Lat: - value: | - ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0) - L2 Rd Lat: value: | ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else None)), 0) + - L2 Wr Lat: + value: | + ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0) + metric_descriptions: + L2 Rd Lat: + plain: | + Calculated as the average number of cycles that the vL1D cache took to issue and receive read requests from the L2 Cache. This number also includes requests for atomics with return values. + rst: | + Calculated as the average number of cycles that the vL1D cache took to issue and receive read requests from the L2 Cache. This number also includes requests for atomics with return values. + L2 Wr Lat: + plain: | + Calculated as the average number of cycles that the vL1D cache took to issue and receive acknowledgement of a write request to the L2 Cache. This number also includes requests for atomics without return values. + rst: | + Calculated as the average number of cycles that the vL1D cache took to issue and receive acknowledgement of a write request to the L2 Cache. This number also includes requests for atomics without return values. - Panel Config: id: 400 title: Roofline @@ -41,6 +52,12 @@ Addition: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) unit: GFLOP/s peak: $MFMA_FLOPs_F6F4_empirical_peak + metric_descriptions: + MFMA FLOPs (F6F4): + plain: | + The total number of 4-bit and 6-bit floating point MFMA operations executed per second. Note: this does not include any floating point operations from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. It is supported on AMD Instinct MI350 series (gfx950) and later only. + rst: | + The total number of 4-bit and 6-bit floating point MFMA operations executed per second. Note: this does not include any floating point operations from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable on the specific accelerator is displayed alongside for comparison. It is supported on AMD Instinct MI350 series (gfx950) and later only. 
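
The F6F4 roofline point above converts the MOPS counter to a rate: 512 operations per counted MFMA MOPS event, over the duration in seconds, reported in GFLOP/s against an empirically measured peak. A minimal sketch of that arithmetic, assuming nanosecond timestamps; the names below are illustrative.

def mfma_f6f4_gflops(mops_f6f4: int, start_ns: int, end_ns: int) -> float:
    flops = mops_f6f4 * 512                 # 512 FLOPs per MFMA MOPS event
    seconds = (end_ns - start_ns) / 1e9     # timestamps assumed to be ns
    return flops / seconds / 1e9            # operations/s -> GFLOP/s

# A kernel issuing 5e6 F6F4 MOPS over 0.8 ms yields 3200.0 GFLOP/s:
print(mfma_f6f4_gflops(5_000_000, 0, 800_000))
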
- Panel Config: id: 500 title: Command Processor (CPC/CPF) @@ -49,16 +66,16 @@ Addition: id: 502 title: Command processor packet processor (CPC) metrics: - - CPC CANE Stall Rate: - avg: AVG((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None) - min: MIN((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None) - max: MAX((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None) - unit: pct - CPC ADC Utilization: avg: AVG((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None) min: MIN((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None) max: MAX((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None) unit: pct + - CPC CANE Stall Rate: + avg: AVG((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None) + min: MIN((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None) + max: MAX((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None) + unit: pct - CPC SYNC FIFO Full Rate: avg: | AVG((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None) @@ -75,14 +92,6 @@ Addition: id: 601 title: Workgroup manager utilizations metrics: - - Scheduler-Pipe Wave Utilization: - avg: | - AVG(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - min: | - MIN(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - max: | - MAX(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - unit: Pct - Schedule-Pipe Wave Occupancy: avg: | AVG(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + SPI_CSQ_P3_OCCUPANCY) @@ -91,6 +100,14 @@ Addition: max: | MAX(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + SPI_CSQ_P3_OCCUPANCY) unit: Wave + - Scheduler-Pipe Wave Utilization: + avg: | + AVG(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) + min: | + MIN(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) + max: | + MAX(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) + unit: Pct - metric_table: id: 602 title: Workgroup Manager - Resource Allocation @@ -165,73 +182,79 @@ Addition: id: 1202 title: LDS Statistics metrics: - - LDS STORE Bandwidth: - avg: AVG(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - min: MIN(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - max: MAX(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - units: Gbps - - LDS STORE: - avg: AVG((SQ_INSTS_LDS_STORE / $denom)) - min: MIN((SQ_INSTS_LDS_STORE / $denom)) - max: MAX((SQ_INSTS_LDS_STORE / $denom)) - unit: (instr + $normUnit) - - LDS Data FIFO Full Rate: - avg: AVG((SQ_LDS_DATA_FIFO_FULL / $denom)) - min: MIN((SQ_LDS_DATA_FIFO_FULL / $denom)) - max: MAX((SQ_LDS_DATA_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) - LDS ATOMIC: avg: AVG((SQ_INSTS_LDS_ATOMIC / $denom)) min: MIN((SQ_INSTS_LDS_ATOMIC / $denom)) max: MAX((SQ_INSTS_LDS_ATOMIC / $denom)) unit: (instr + $normUnit) - - LDS Command FIFO Full Rate: - avg: AVG((SQ_LDS_CMD_FIFO_FULL / $denom)) - min: MIN((SQ_LDS_CMD_FIFO_FULL / $denom)) - max: MAX((SQ_LDS_CMD_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) - LDS ATOMIC Bandwidth: avg: AVG(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) min: MIN(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - 
Start_Timestamp)) max: MAX(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) units: Gbps + - LDS Command FIFO Full Rate: + avg: AVG((SQ_LDS_CMD_FIFO_FULL / $denom)) + min: MIN((SQ_LDS_CMD_FIFO_FULL / $denom)) + max: MAX((SQ_LDS_CMD_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + - LDS Data FIFO Full Rate: + avg: AVG((SQ_LDS_DATA_FIFO_FULL / $denom)) + min: MIN((SQ_LDS_DATA_FIFO_FULL / $denom)) + max: MAX((SQ_LDS_DATA_FIFO_FULL / $denom)) + unit: (Cycles + $normUnit) + - LDS LOAD: + avg: AVG((SQ_INSTS_LDS_LOAD / $denom)) + min: MIN((SQ_INSTS_LDS_LOAD / $denom)) + max: MAX((SQ_INSTS_LDS_LOAD / $denom)) + unit: (instr + $normUnit) - LDS LOAD Bandwidth: avg: AVG(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) min: MIN(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) max: MAX(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) units: Gbps - - LDS LOAD: - avg: AVG((SQ_INSTS_LDS_LOAD / $denom)) - min: MIN((SQ_INSTS_LDS_LOAD / $denom)) - max: MAX((SQ_INSTS_LDS_LOAD / $denom)) + - LDS STORE: + avg: AVG((SQ_INSTS_LDS_STORE / $denom)) + min: MIN((SQ_INSTS_LDS_STORE / $denom)) + max: MAX((SQ_INSTS_LDS_STORE / $denom)) unit: (instr + $normUnit) + - LDS STORE Bandwidth: + avg: AVG(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + min: MIN(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + max: MAX(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) + units: Gbps - Panel Config: id: 1500 title: Address Processing Unit and Data Return Path (TA/TD) metric_tables: - - metric_table: - id: 1504 - title: Vector L1 data-return path or Texture Data (TD) - metrics: - - Write Ack Instructions: - avg: AVG((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) - min: MIN((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) - max: MAX((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - metric_table: id: 1502 title: Instruction counts metrics: + - Global/Generic Read Instructions for LDS: + avg: AVG((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) + min: MIN((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) + max: MAX((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) + unit: (Instructions + $normUnit) - Spill/Stack Read Instructions for LDS: avg: AVG((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom)) min: MIN((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom)) max: MAX((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom)) unit: (Instructions + $normUnit) - - Global/Generic Read Instructions for LDS: - avg: AVG((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) + - metric_table: + id: 1504 + title: Vector L1 data-return path or Texture Data (TD) + metrics: + - Write Ack Instructions: + avg: AVG((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) + min: MIN((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) + max: MAX((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) unit: (Instructions + $normUnit) + metric_descriptions: + Write Ack Instructions: + plain: | + The total number of write acknowledgements submitted by data-return unit to SQ, summed over all compute units on the accelerator, per normalization unit. + rst: | + The total number of write acknowledgements submitted by data-return unit to SQ, summed over all compute units on the accelerator, per normalization unit. 
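
The LDS and instruction-count rows above are all expressed as counter / $denom with a matching (instr + $normUnit) label, so one raw counter yields a per-wave, per-cycle, or per-kernel figure depending on the selected normalization. A hedged sketch of that mechanism; the normalization names and denominators below are assumptions for illustration, not the tool's exact options.

def normalize(counter_sum: float, norm_unit: str, denoms: dict) -> tuple:
    # value = counter / $denom; the unit string is extended with $normUnit
    return counter_sum / denoms[norm_unit], f"instr {norm_unit}"

denoms = {"per_wave": 4_096, "per_cycle": 1_000_000, "per_kernel": 1}  # assumed
lds_loads = 819_200   # e.g. SQ_INSTS_LDS_LOAD summed over the dispatch
for unit in denoms:
    value, label = normalize(lds_loads, unit, denoms)
    print(label, value)
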
 - Panel Config:
     id: 1600
     title: Vector L1 Data Cache
@@ -240,12 +263,6 @@ Addition:
         id: 1602
         title: vL1D cache stall metrics
         metrics:
-        - Stalled on Request FIFO:
-          expr: |
-            (((100 * TCP_RFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
-        - Stalled on Read Return:
-          expr: |
-            (((100 * TCP_TCR_RDRET_STALL_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
         - Stalled on Address:
           expr: |
             (((100 * TCP_TCP_TA_ADDR_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
@@ -255,6 +272,12 @@ Addition:
         - Stalled on Latency FIFO:
           expr: |
             (((100 * TCP_LFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+        - Stalled on Read Return:
+          expr: |
+            (((100 * TCP_TCR_RDRET_STALL_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+        - Stalled on Request FIFO:
+          expr: |
+            (((100 * TCP_RFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
     - metric_table:
         id: 1603
         title: vL1D cache access metrics
@@ -264,6 +287,16 @@ Addition:
           min: MIN((TCP_TCP_LATENCY_sum / $denom))
           max: MAX((TCP_TCP_LATENCY_sum / $denom))
           unit: (Cycles + $normUnit)
+        - L1-L2 Read Latency:
+          avg: AVG((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
+          min: MIN((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
+          max: MAX((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
+          unit: (Cycles + $normUnit)
+        - L1-L2 Write Latency:
+          avg: AVG((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
+          min: MIN((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
+          max: MAX((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
+          unit: (Cycles + $normUnit)
         - Tag RAM 0 Req:
           avg: AVG((TCP_TAGRAM0_REQ_sum / $denom))
           min: MIN((TCP_TAGRAM0_REQ_sum / $denom))
           max: MAX((TCP_TAGRAM0_REQ_sum / $denom))
           unit: (Req + $normUnit)
@@ -274,21 +307,11 @@ Addition:
          min: MIN((TCP_TAGRAM1_REQ_sum / $denom))
          max: MAX((TCP_TAGRAM1_REQ_sum / $denom))
          unit: (Req + $normUnit)
-        - L1-L2 Write Latency:
-          avg: AVG((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
-          min: MIN((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
-          max: MAX((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
-          unit: (Cycles + $normUnit)
         - Tag RAM 2 Req:
           avg: AVG((TCP_TAGRAM2_REQ_sum / $denom))
           min: MIN((TCP_TAGRAM2_REQ_sum / $denom))
           max: MAX((TCP_TAGRAM2_REQ_sum / $denom))
           unit: (Req + $normUnit)
-        - L1-L2 Read Latency:
-          avg: AVG((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
-          min: MIN((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
-          max: MAX((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
-          unit: (Cycles + $normUnit)
         - Tag RAM 3 Req:
           avg: AVG((TCP_TAGRAM3_REQ_sum / $denom))
           min: MIN((TCP_TAGRAM3_REQ_sum / $denom))
           max: MAX((TCP_TAGRAM3_REQ_sum / $denom))
@@ -298,34 +321,34 @@ Addition:
         id: 1605
         title: L1 Unified Translation Cache (UTCL1)
         metrics:
-        - Misses under Translation Miss:
-          avg: AVG((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
-          min: MIN((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
-          max: MAX((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
-          units: (Req + $normUnit)
         - Inflight Req:
           avg: AVG((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
           min: MIN((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
           max: MAX((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
           units: (Req + $normUnit)
+        - Misses under Translation Miss:
+          avg: AVG((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
+          min: MIN((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
+          max: MAX((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
+          units: (Req + $normUnit)
     - metric_table:
         id: 1606
         title: L1D Addr Translation Stalls
         metrics:
+        - Cache Full Stall:
+          avg: AVG((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
+          min: MIN((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
+          max: MAX((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
+          units: (Cycles + $normUnit)
         - Cache Miss Stall:
           avg: AVG((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
           min: MIN((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
           max: MAX((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
           units: (Cycles + $normUnit)
-        - Thrashing Stall:
-          avg: AVG((TCP_UTCL1_THRASHING_STALL_sum / $denom))
-          min: MIN((TCP_UTCL1_THRASHING_STALL_sum / $denom))
-          max: MAX((TCP_UTCL1_THRASHING_STALL_sum / $denom))
-          units: (Cycles + $normUnit)
-        - Cache Full Stall:
-          avg: AVG((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
-          min: MIN((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
-          max: MAX((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
+        - Latency FIFO Stall:
+          avg: AVG((TCP_UTCL1_LFIFO_FULL_sum / $denom))
+          min: MIN((TCP_UTCL1_LFIFO_FULL_sum / $denom))
+          max: MAX((TCP_UTCL1_LFIFO_FULL_sum / $denom))
           units: (Cycles + $normUnit)
         - Resident Page Full Stall:
           avg: AVG((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom))
@@ -337,16 +360,32 @@ Addition:
           min: MIN((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom))
           max: MAX((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom))
           units: (Cycles + $normUnit)
-        - Latency FIFO Stall:
-          avg: AVG((TCP_UTCL1_LFIFO_FULL_sum / $denom))
-          min: MIN((TCP_UTCL1_LFIFO_FULL_sum / $denom))
-          max: MAX((TCP_UTCL1_LFIFO_FULL_sum / $denom))
+        - Thrashing Stall:
+          avg: AVG((TCP_UTCL1_THRASHING_STALL_sum / $denom))
+          min: MIN((TCP_UTCL1_THRASHING_STALL_sum / $denom))
+          max: MAX((TCP_UTCL1_THRASHING_STALL_sum / $denom))
           units: (Cycles + $normUnit)
         - UTCL2 Stall:
           avg: AVG((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
           min: MIN((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
           max: MAX((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
           units: (Cycles + $normUnit)
+    metric_descriptions:
+      L1 Access Latency:
+        plain: |
+          Calculated as the average number of cycles that a vL1D cache line request spent in the vL1D cache pipeline.
+        rst: |
+          Calculated as the average number of cycles that a vL1D cache line request spent in the vL1D cache pipeline.
+      L1-L2 Read Latency:
+        plain: |
+          Calculated as the average number of cycles that the vL1D cache took to issue and receive read requests from the L2 Cache. This number also includes requests for atomics with return values.
+        rst: |
+          Calculated as the average number of cycles that the vL1D cache took to issue and receive read requests from the L2 Cache. This number also includes requests for atomics with return values.
+      L1-L2 Write Latency:
+        plain: |
+          Calculated as the average number of cycles that the vL1D cache took to issue and receive acknowledgement of a write request to the L2 Cache. This number also includes requests for atomics without return values.
+        rst: |
+          Calculated as the average number of cycles that the vL1D cache took to issue and receive acknowledgement of a write request to the L2 Cache. This number also includes requests for atomics without return values.
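Editor's note on the expression shape used throughout these stall tables: every percentage metric guards its denominator so that a kernel which never exercised the block reports no value instead of failing the analysis. A minimal Python sketch of that behavior (not the tool's actual expression evaluator; the counter values are hypothetical):

    # Sketch only: mirrors '(((100 * X) / Y) if (Y != 0) else None)' from the YAML above.
    def pct_stalled(stall_cycles_sum, gate_en1_sum):
        # None (rather than 0 or NaN) marks the metric as not computable
        # when the gating counter collected no cycles.
        return (100 * stall_cycles_sum) / gate_en1_sum if gate_en1_sum != 0 else None

    print(pct_stalled(2_500, 10_000))  # 25.0 -> 25% of gated cycles were stalled
    print(pct_stalled(2_500, 0))       # None -> divide-by-zero avoided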
 - Panel Config:
     id: 1700
     title: L2 Cache
@@ -355,14 +394,6 @@ Addition:
         id: 1702
         title: L2-Fabric interface metrics
         metrics:
-        - Write Stall:
-          avg: |
-            AVG(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
-          min: |
-            MIN(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
-          max: |
-            MAX(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
-          unit: pct
         - Read Stall:
           avg: |
             AVG((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
@@ -371,44 +402,47 @@ Addition:
           min: |
             MIN((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
           max: |
             MAX((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
           unit: pct
+        - Write Stall:
+          avg: |
+            AVG(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+          min: |
+            MIN(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+          max: |
+            MAX(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+          unit: pct
     - metric_table:
         id: 1703
         title: L2 Cache Accesses
         metrics:
+        - Atomic Bandwidth:
+          avg: AVG(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+          min: MIN(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+          max: MAX(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+          unit: Gbps
         - Bypass Req:
           avg: AVG((TCC_BYPASS_REQ_sum / $denom))
           min: MIN((TCC_BYPASS_REQ_sum / $denom))
           max: MAX((TCC_BYPASS_REQ_sum / $denom))
           unit: (Req + $normUnit)
-        - Write Bandwidth:
-          avg: AVG(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
-          min: MIN(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
-          max: MAX(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
-          unit: Gbps
         - Input Buffer Req:
           avg: AVG((TCC_IB_REQ_sum / $denom))
           min: MIN((TCC_IB_REQ_sum / $denom))
           max: MAX((TCC_IB_REQ_sum / $denom))
           unit: (Req + $normUnit)
-        - Atomic Bandwidth:
-          avg: AVG(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
-          min: MIN(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
-          max: MAX(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
-          unit: Gbps
         - Read Bandwidth:
           avg: AVG(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
           min: MIN(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
           max: MAX(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
           unit: Gbps
+        - Write Bandwidth:
+          avg: AVG(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+          min: MIN(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+          max: MAX(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+          unit: Gbps
     - metric_table:
         id: 1704
         title: L2 Cache Stalls
         metrics:
-        - Stalled on Write Data FIFO:
-          avg: AVG(TCC_SRC_FIFO_FULL_sum / $denom)
-          min: MIN(TCC_SRC_FIFO_FULL_sum / $denom)
-          max: MAX(TCC_SRC_FIFO_FULL_sum / $denom)
-          unit: (Cycles + $normUnit)
         - Input Buffer Stalled on L2:
           avg: AVG(TCC_IB_STALL_sum / $denom)
           min: MIN(TCC_IB_STALL_sum / $denom)
@@ -419,6 +453,11 @@ Addition:
           min: MIN(TCC_LATENCY_FIFO_FULL_sum / $denom)
           max: MAX(TCC_LATENCY_FIFO_FULL_sum / $denom)
           unit: (Cycles + $normUnit)
+        - Stalled on Write Data FIFO:
+          avg: AVG(TCC_SRC_FIFO_FULL_sum / $denom)
+          min: MIN(TCC_SRC_FIFO_FULL_sum / $denom)
+          max: MAX(TCC_SRC_FIFO_FULL_sum / $denom)
+          unit: (Cycles + $normUnit)
     - metric_table:
         id: 1705
         title: L2 - Fabric Interface stalls
@@ -433,16 +472,6 @@ Addition:
           max: |
             MAX(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
           unit: pct
-        - Read - PCIe Stall:
-          type: PCIe Stall
-          transaction: Read
-          avg: |
-            AVG(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
-          min: |
-            MIN(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
-          max: |
-            MAX(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
-          unit: pct
         - Read - Infinity Fabric Stall:
           type: Infinity Fabric™ Stall
           transaction: Read
@@ -453,15 +482,25 @@ Addition:
           avg: |
             AVG(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
           min: |
             MIN(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
           max: |
             MAX(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
           unit: pct
-        - Write - PCIe Stall:
+        - Read - PCIe Stall:
           type: PCIe Stall
+          transaction: Read
+          avg: |
+            AVG(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+          min: |
+            MIN(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+          max: |
+            MAX(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+          unit: pct
+        - Write - HBM Stall:
+          type: HBM Stall
           transaction: Write
           avg: |
-            AVG(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+            AVG(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
           min: |
-            MIN(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+            MIN(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
           max: |
-            MAX(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+            MAX(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
           unit: pct
         - Write - Infinity Fabric Stall:
           type: Infinity Fabric™ Stall
@@ -473,70 +512,171 @@ Addition:
           max: |
             MAX(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
           unit: pct
-        - Write - HBM Stall:
-          type: HBM Stall
+        - Write - PCIe Stall:
+          type: PCIe Stall
           transaction: Write
           avg: |
-            AVG(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+            AVG(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
           min: |
-            MIN(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+            MIN(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
           max: |
-            MAX(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+            MAX(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
           unit: pct
     - metric_table:
         id: 1706
         title: L2 - Fabric interface detailed metrics
         metrics:
-        - Write Bandwidth - HBM:
-          avg: AVG(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-          min: MIN(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-          max: MAX(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-          unit: Gbps
-        - Read Bandwidth - PCIe:
-          avg: AVG(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-          min: MIN(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-          max: MAX(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-          unit: Gbps
-        - Read Bandwidth - HBM:
-          avg: AVG(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-          min: MIN(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-          max: MAX(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-          unit: Gbps
-        - Write Bandwidth - Infinity Fabric™:
-          avg: AVG(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-          min: MIN(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-          max: MAX(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-          unit: Gbps
         - Atomic - HBM:
           avg: AVG((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
           min: MIN((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
           max: MAX((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
           unit: (Req + $normUnit)
-        - Write Bandwidth - PCIe:
-          avg: AVG(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-          min: MIN(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-          max: MAX(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-          unit: Gbps
         - Atomic Bandwidth - HBM:
          avg: AVG(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
          min: MIN(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
          max: MAX(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
          unit: Gbps
+        - Atomic Bandwidth - Infinity Fabric™:
+          avg: AVG(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          min: MIN(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          max: MAX(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          unit: Gbps
         - Atomic Bandwidth - PCIe:
           avg: AVG(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
           min: MIN(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
           max: MAX(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
           unit: Gbps
+        - Read Bandwidth - HBM:
+          avg: AVG(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          min: MIN(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          max: MAX(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          unit: Gbps
         - Read Bandwidth - Infinity Fabric™:
           avg: AVG(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
           min: MIN(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
           max: MAX(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
           unit: Gbps
-        - Atomic Bandwidth - Infinity Fabric™:
-          avg: AVG(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-          min: MIN(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-          max: MAX(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+        - Read Bandwidth - PCIe:
+          avg: AVG(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          min: MIN(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          max: MAX(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          unit: Gbps
+        - Write Bandwidth - HBM:
+          avg: AVG(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          min: MIN(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          max: MAX(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          unit: Gbps
+        - Write Bandwidth - Infinity Fabric™:
+          avg: AVG(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          min: MIN(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          max: MAX(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
           unit: Gbps
+        - Write Bandwidth - PCIe:
+          avg: AVG(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          min: MIN(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          max: MAX(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          unit: Gbps
+    metric_descriptions:
+      Atomic Bandwidth:
+        plain: |
+          Total number of bytes looked up in the L2 cache for atomic requests, divided by total duration.
+        rst: |
+          Total number of bytes looked up in the L2 cache for atomic requests, divided by total duration.
+      Atomic Bandwidth - HBM:
+        plain: |
+          Total number of bytes of L2 atomic requests attributable to HBM traffic, divided by total duration.
+        rst: |
+          Total number of bytes of L2 atomic requests attributable to HBM traffic, divided by total duration.
+      Atomic Bandwidth - Infinity Fabric™:
+        plain: |
+          Total number of bytes of L2 atomic requests attributable to Infinity Fabric traffic, divided by total duration.
+        rst: |
+          Total number of bytes of L2 atomic requests attributable to Infinity Fabric traffic, divided by total duration.
+      Atomic Bandwidth - PCIe:
+        plain: |
+          Total number of bytes of L2 atomic requests attributable to PCIe traffic, divided by total duration.
+        rst: |
+          Total number of bytes of L2 atomic requests attributable to PCIe traffic, divided by total duration.
+      Read - HBM Stall:
+        plain: |
+          The number of cycles the L2-Fabric interface was stalled on read requests to the accelerator's local HBM as a percent of the total active L2 cycles.
+        rst: |
+          The number of cycles the L2-Fabric interface was stalled on read requests to the accelerator's local HBM as a percent of the total active L2 cycles.
+      Read - Infinity Fabric Stall:
+        plain: |
+          The number of cycles the L2-Fabric interface was stalled on read requests to remote Infinity Fabric connected accelerators or CPUs as a percent of the total active L2 cycles.
+        rst: |
+          The number of cycles the L2-Fabric interface was stalled on read requests to remote Infinity Fabric connected accelerators or CPUs as a percent of the total active L2 cycles.
+      Read - PCIe Stall:
+        plain: |
+          The number of cycles the L2-Fabric interface was stalled on read requests to remote PCIe connected accelerators or CPUs as a percent of the total active L2 cycles.
+        rst: |
+          The number of cycles the L2-Fabric interface was stalled on read requests to remote PCIe connected accelerators or CPUs as a percent of the total active L2 cycles.
+      Read Bandwidth:
+        plain: |
+          Total number of bytes looked up in the L2 cache for read requests, divided by total duration.
+        rst: |
+          Total number of bytes looked up in the L2 cache for read requests, divided by total duration.
+      Read Bandwidth - HBM:
+        plain: |
+          Total number of bytes of L2 read requests attributable to HBM traffic, divided by total duration.
+        rst: |
+          Total number of bytes of L2 read requests attributable to HBM traffic, divided by total duration.
+      Read Bandwidth - Infinity Fabric™:
+        plain: |
+          Total number of bytes of L2 read requests attributable to Infinity Fabric traffic, divided by total duration.
+        rst: |
+          Total number of bytes of L2 read requests attributable to Infinity Fabric traffic, divided by total duration.
+      Read Bandwidth - PCIe:
+        plain: |
+          Total number of bytes of L2 read requests attributable to PCIe traffic, divided by total duration.
+        rst: |
+          Total number of bytes of L2 read requests attributable to PCIe traffic, divided by total duration.
+      Read Stall:
+        plain: |
+          The ratio of the total number of cycles the L2-Fabric interface was stalled on a read request to any destination (local HBM, remote PCIe® connected accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU) over the total active L2 cycles.
+        rst: |
+          The ratio of the total number of cycles the L2-Fabric interface was stalled on a read request to any destination (local HBM, remote PCIe® connected accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU) over the total active L2 cycles.
+      Write - HBM Stall:
+        plain: |
+          The number of cycles the L2-Fabric interface was stalled on write or atomic requests to the accelerator's local HBM as a percent of the total active L2 cycles.
+        rst: |
+          The number of cycles the L2-Fabric interface was stalled on write or atomic requests to the accelerator's local HBM as a percent of the total active L2 cycles.
+      Write - Infinity Fabric Stall:
+        plain: |
+          The number of cycles the L2-Fabric interface was stalled on write or atomic requests to remote Infinity Fabric connected accelerators or CPUs as a percent of the total active L2 cycles.
+        rst: |
+          The number of cycles the L2-Fabric interface was stalled on write or atomic requests to remote Infinity Fabric connected accelerators or CPUs as a percent of the total active L2 cycles.
+      Write - PCIe Stall:
+        plain: |
+          The number of cycles the L2-Fabric interface was stalled on write or atomic requests to remote PCIe connected accelerators or CPUs as a percent of the total active L2 cycles.
+        rst: |
+          The number of cycles the L2-Fabric interface was stalled on write or atomic requests to remote PCIe connected accelerators or CPUs as a percent of the total active L2 cycles.
+      Write Bandwidth:
+        plain: |
+          Total number of bytes looked up in the L2 cache for write requests, divided by total duration.
+        rst: |
+          Total number of bytes looked up in the L2 cache for write requests, divided by total duration.
+      Write Bandwidth - HBM:
+        plain: |
+          Total number of bytes of L2 write requests attributable to HBM traffic, divided by total duration.
+        rst: |
+          Total number of bytes of L2 write requests attributable to HBM traffic, divided by total duration.
+      Write Bandwidth - Infinity Fabric™:
+        plain: |
+          Total number of bytes of L2 write requests attributable to Infinity Fabric traffic, divided by total duration.
+        rst: |
+          Total number of bytes of L2 write requests attributable to Infinity Fabric traffic, divided by total duration.
+      Write Bandwidth - PCIe:
+        plain: |
+          Total number of bytes of L2 write requests attributable to PCIe traffic, divided by total duration.
+        rst: |
+          Total number of bytes of L2 write requests attributable to PCIe traffic, divided by total duration.
+      Write Stall:
+        plain: |
+          The ratio of the total number of cycles the L2-Fabric interface was stalled on a write or atomic request to any destination (local HBM, remote PCIe connected accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU) over the total active L2 cycles.
+        rst: |
+          The ratio of the total number of cycles the L2-Fabric interface was stalled on a write or atomic request to any destination (local HBM, remote PCIe connected accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU) over the total active L2 cycles.
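Editor's note on the bandwidth expressions above: each TCC sector or 32B beat counts 32 bytes, and End_Timestamp and Start_Timestamp are taken to be in nanoseconds, so sectors * 32 / duration comes out in bytes per nanosecond, which is numerically GB/s. A minimal sketch under those assumptions, with hypothetical counter values:

    # Sketch only: mirrors AVG(TCC_READ_SECTORS_sum * 32 / (End_Timestamp - Start_Timestamp)).
    # Assumes 32-byte L2 sectors and nanosecond timestamps; values are hypothetical.
    def l2_bandwidth(sectors_sum: int, start_ns: int, end_ns: int) -> float:
        bytes_moved = sectors_sum * 32        # 32 bytes per L2 sector
        duration_ns = end_ns - start_ns       # kernel duration in nanoseconds
        return bytes_moved / duration_ns      # bytes/ns, numerically GB/s

    # 40M read sectors over a 1 ms kernel -> 1.28e9 B / 1e6 ns = 1280 GB/s
    print(l2_bandwidth(40_000_000, 0, 1_000_000))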
 Deletion: []
@@ -550,14 +690,14 @@ Modification:
         id: 201
         title: System Speed-of-Light
         metrics:
-        - MFMA IOPs (Int8):
-          peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
-          pop: |
-            ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
         - MFMA FLOPs (BF16):
           peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
           pop: |
             ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+        - MFMA FLOPs (F16):
+          peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+          pop: |
+            ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
         - MFMA FLOPs (F64):
           peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
           pop: |
@@ -566,10 +706,10 @@ Modification:
           peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
           pop: |
             ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
-        - MFMA FLOPs (F16):
-          peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+        - MFMA IOPs (Int8):
+          peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
           pop: |
-            ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+            ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
 - Panel Config:
     id: 300
     title: Memory Chart
@@ -578,11 +718,11 @@ Modification:
         id: 301
         title: Memory Chart
         metrics:
+        - Wavefronts:
+          value: ROUND(AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE), 0)
         - Workgroups:
           value: |
             ROUND(AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS), 0)
-        - Wavefronts:
-          value: ROUND(AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE), 0)
 - Panel Config:
     id: 400
     title: Roofline
@@ -591,12 +731,12 @@ Modification:
         id: 402
         title: Roofline Plot Points
         metrics:
-        - AI L1:
-          value: |
-            ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) )
         - AI HBM:
           value: |
             ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
+        - AI L1:
+          value: |
+            ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) )
         - AI L2:
           value: |
             ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64 ) )
@@ -612,37 +752,37 @@ Modification:
         title: Workgroup manager utilizations
         metrics:
         - Dispatched Wavefronts:
-          avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
-          max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
           min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+          max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+          avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
         - Dispatched Workgroups:
-          avg: |
-            AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
-          max: |
-            MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
           min: |
             MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
-        - SGPR Writes:
-          avg: |
-            AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
           max: |
-            MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+            MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
+          avg: |
+            AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
+        - SGPR Writes:
           min: |
             MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
-        - Scheduler-Pipe Utilization:
-          avg: |
-            AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
           max: |
-            MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+            MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+          avg: |
+            AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+        - Scheduler-Pipe Utilization:
           min: |
             MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
-        - VGPR Writes:
-          avg: |
-            AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
           max: |
-            MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+            MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+          avg: |
+            AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+        - VGPR Writes:
           min: |
             MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+          max: |
+            MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+          avg: |
+            AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
 - Panel Config:
     id: 700
     title: Wavefront
@@ -652,9 +792,9 @@ Modification:
         title: Wavefront Launch Stats
         metrics:
         - Total Wavefronts:
-          avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
-          max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
           min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+          max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+          avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
 - Panel Config:
     id: 1100
     title: Compute Units - Compute Pipeline
@@ -667,10 +807,10 @@ Modification:
           peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
           pop: |
             ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
-        - MFMA IOPs (INT8):
-          peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
+        - MFMA FLOPs (F16):
+          peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
           pop: |
-            ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
+            ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
         - MFMA FLOPs (F64):
           peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
           pop: |
@@ -679,21 +819,21 @@ Modification:
           peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
           pop: |
             ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
-        - MFMA FLOPs (F16):
-          peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+        - MFMA IOPs (INT8):
+          peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
           pop: |
-            ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+            ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
     - metric_table:
         id: 1103
         title: Arithmetic Operations
         metrics:
        - FLOPs (Total):
-          avg: |
-            AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
-          max: |
-            MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
           min: |
             MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
+          max: |
+            MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
+          avg: |
+            AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
 - Panel Config:
     id: 1700
     title: L2 Cache
@@ -709,36 +849,36 @@ Modification:
         id: 1702
         title: L2-Fabric interface metrics
         metrics:
-        - Remote Read Traffic:
-          avg: |
-            AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
-          max: |
-            MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
-          min: |
-            MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
         - Read BW:
-          avg: |
-            AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
+          min: |
+            MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
           max: |
             MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
+          avg: |
+            AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
+        - Remote Read Traffic:
           min: |
-            MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
+            MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
+          max: |
+            MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
+          avg: |
+            AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
     - metric_table:
         id: 1706
         title: L2 - Fabric interface detailed metrics
         metrics:
-        - Read (64B):
-          avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom))
-          max: MAX((TCC_EA0_RDREQ_64B_sum / $denom))
-          min: MIN((TCC_EA0_RDREQ_64B_sum / $denom))
-        - Read (128B):
-          avg: AVG((TCC_EA0_RDREQ_128B_sum / $denom))
-          max: MAX((TCC_EA0_RDREQ_128B_sum / $denom))
-          min: MIN((TCC_EA0_RDREQ_128B_sum / $denom))
         - HBM Write and Atomic:
-          avg: AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
-          max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
           min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
+          max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
+          avg: AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
+        - Read (128B):
+          min: MIN((TCC_EA0_RDREQ_128B_sum / $denom))
+          max: MAX((TCC_EA0_RDREQ_128B_sum / $denom))
+          avg: AVG((TCC_EA0_RDREQ_128B_sum / $denom))
+        - Read (64B):
+          min: MIN((TCC_EA0_RDREQ_64B_sum / $denom))
+          max: MAX((TCC_EA0_RDREQ_64B_sum / $denom))
+          avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom))
 - Panel Config:
     id: 1800
     title: L2 Cache (per Channel)
@@ -748,14 +888,14 @@ Modification:
         title: L2-Fabric Read Stall (Cycles per normUnit)
         metrics:
        - ::_1:
-          ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom))
-          ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1]) / $denom))
           ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) / $denom))
+          ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1]) / $denom))
+          ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom))
     - metric_table:
        id: 1810
        title: L2-Fabric Write and Atomic Stall (Cycles per normUnit)
        metrics:
        - ::_1:
-          ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1]) / $denom))
           ea write stall - if: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1]) / $denom))
           ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) / $denom))
+          ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1]) / $denom))
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml
index 225c5cf1494..2a5c4b21831 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml
@@ -76,7 +76,7 @@ Panel Config:
       peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
       pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
-    Active CUs:
+    Active CUs (deprecated):
       value: $numActiveCUs
       unit: CUs
       peak: $cu_per_gpu
@@ -232,6 +232,11 @@ Panel Config:
       peak: None
       pop: None
       coll_level: SQ_IFETCH_LEVEL
+    CU Utilization:
+      value: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
+      unit: Pct
+      peak: 100
+      pop: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
   metrics_description:
     VALU FLOPs: >-
       The total floating-point operations executed per second on the VALU.
@@ -273,8 +278,8 @@ Panel Config:
       Note: this does not include any 8-bit integer operations from VALU instructions.
      This is also presented as a percent of the peak theoretical INT8 MFMA
      operations achievable on the specific accelerator.
-    Active CUs: Total number of active compute units (CUs) on the accelerator during
-      the kernel execution.
+    Active CUs (deprecated): Total number of active compute units (CUs) on the accelerator during
+      the kernel execution. (Deprecated - See CU Utilization instead)
     SALU Utilization: Indicates what percent of the kernel's duration the SALU was
      busy executing instructions. Computed as the ratio of the total number of cycles
      spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles.
@@ -359,3 +364,7 @@ Panel Config:
      of all L1I requests.
     L1I Fetch Latency: The average number of cycles spent to fetch instructions to
      a CU.
+    CU Utilization: The percent of total SIMD cycles in the kernel
+      where any SIMD on a CU was actively doing any work, summed
+      over all CUs. Low values (less than 100%) indicate that the kernel did not
+      fully saturate the accelerator, or that there is a potential load-imbalance issue.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml
index 9d3e26ebb20..152a0bd27a5 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml
@@ -33,7 +33,7 @@ Panel Config:
       value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0)
     BR:
       value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0)
-    Active CUs:
+    Active CUs (deprecated):
       value: $numActiveCUs
     Num CUs:
       value: $cu_per_gpu
@@ -169,8 +169,8 @@ Panel Config:
     GWS: Total number of GDS (global data sync) instructions issued per normalization
      unit.
     BR: Total number of BRANCH instructions issued per normalization unit.
-    Active CUs: Total number of active compute units (CUs) on the accelerator during
-      the kernel execution.
+    Active CUs (deprecated): Total number of active compute units (CUs) on the accelerator during
+      the kernel execution. (Deprecated - See CU Utilization instead)
     Num CUs: Total number of compute units (CUs) on the accelerator.
     VGPR: >-
       The number of architected vector general-purpose registers allocated
diff --git a/projects/rocprofiler-compute/tools/config_management/.config_hashes.json b/projects/rocprofiler-compute/tools/config_management/.config_hashes.json
index 64e23763482..fe569bc7bf1 100644
--- a/projects/rocprofiler-compute/tools/config_management/.config_hashes.json
+++ b/projects/rocprofiler-compute/tools/config_management/.config_hashes.json
@@ -1,12 +1,12 @@
 {
   "archs": {
     "gfx908": {
-      "delta_hash": "a2d9bef7e5d8b056605f9b1fa6569678",
+      "delta_hash": "ea37a8ffe846ecab3bd5833be174b1d1",
       "files": {
         "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
         "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
-        "0200_system_speed_of_light.yaml": "c4878ac57b7b7b4b5711672cb2f6dffc",
-        "0300_memory_chart.yaml": "221c6d2bb50a4f4177585b9988f88c7b",
+        "0200_system_speed_of_light.yaml": "9ddb50865f89c836306ac651605f702c",
+        "0300_memory_chart.yaml": "51bf953860670ffb6de4b598228fb101",
         "0400_roofline.yaml": "bad8d851694ff9a140e29a148a35fa50",
         "0500_command_processor_cpc_cpf.yaml": "d8f424ec3fcfa4b2fcee2ad5e6456531",
         "0600_workgroup_manager_spi.yaml": "8b6a89de516bed5821a9849627ad634a",
@@ -24,12 +24,12 @@
       }
     },
     "gfx90a": {
-      "delta_hash": "55e28dda19e9ae640ba436be1a42fe97",
+      "delta_hash": "0c232e10c260a381e44b5c074463387b",
       "files": {
         "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
         "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
-        "0200_system_speed_of_light.yaml": "dc6a6e1a8513e2d32aecc055a958c639",
-        "0300_memory_chart.yaml": "a61f219fe063c4c4b0b9cbaf96389a8b",
+        "0200_system_speed_of_light.yaml": "11ff9f634400a71b95a7a23b6e426077",
+        "0300_memory_chart.yaml": "d4eca43aa3bc8f07a3c2d9fbbc87839f",
         "0400_roofline.yaml": "da1d514ed19ca2466c167e983bdb4f13",
         "0500_command_processor_cpc_cpf.yaml": "d8f424ec3fcfa4b2fcee2ad5e6456531",
         "0600_workgroup_manager_spi.yaml": "8b6a89de516bed5821a9849627ad634a",
@@ -47,12 +47,12 @@
       }
     },
     "gfx940": {
-      "delta_hash": "531bb865bffcb2fc5658c2e613b341d2",
+      "delta_hash": "d7d4c9ae9917d68def0e868e925477a6",
       "files": {
         "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
         "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
-        "0200_system_speed_of_light.yaml": "8b413c47f06f2e94b3faa723daac8edd",
-        "0300_memory_chart.yaml": "3d6c88ab2704dc4bd72a63e99fd68cf7",
+        "0200_system_speed_of_light.yaml": "d5df0a2b701972fa08ba0a44e72aa752",
+        "0300_memory_chart.yaml": "40dc04c73c3cec3d0a93e26d2db8c6f3",
         "0400_roofline.yaml": "d4650e008f2e3a7d28871e8518153575",
         "0500_command_processor_cpc_cpf.yaml": "d8f424ec3fcfa4b2fcee2ad5e6456531",
         "0600_workgroup_manager_spi.yaml": "8b6a89de516bed5821a9849627ad634a",
@@ -70,12 +70,12 @@
       }
     },
     "gfx941": {
-      "delta_hash": "9b30264f36ff99f54941346a18af016a",
+      "delta_hash": "723ad8f0a57153314eac933ddb184ee3",
       "files": {
         "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
         "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
-        "0200_system_speed_of_light.yaml": "0ddeaefd245291c7f88674431efd74f6",
-        "0300_memory_chart.yaml": "3d6c88ab2704dc4bd72a63e99fd68cf7",
+        "0200_system_speed_of_light.yaml": "0ccc1a63ebe11079832741c6d86ec3aa",
+        "0300_memory_chart.yaml": "40dc04c73c3cec3d0a93e26d2db8c6f3",
         "0400_roofline.yaml": "c066a19bc0e00e692c34998e44c62387",
         "0500_command_processor_cpc_cpf.yaml": "d8f424ec3fcfa4b2fcee2ad5e6456531",
         "0600_workgroup_manager_spi.yaml": "8b6a89de516bed5821a9849627ad634a",
@@ -93,12 +93,12 @@
       }
     },
     "gfx942": {
-      "delta_hash": "66cf66455fafa2b6b5936d31fecf3e85",
+      "delta_hash": "69acdfb29af82ce78f1b7051d57ae5b1",
       "files": {
         "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
         "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
-        "0200_system_speed_of_light.yaml": "8b413c47f06f2e94b3faa723daac8edd",
-        "0300_memory_chart.yaml": "3d6c88ab2704dc4bd72a63e99fd68cf7",
+        "0200_system_speed_of_light.yaml": "d5df0a2b701972fa08ba0a44e72aa752",
+        "0300_memory_chart.yaml": "40dc04c73c3cec3d0a93e26d2db8c6f3",
         "0400_roofline.yaml": "318c3e774d41a639628a7f72c2462375",
         "0500_command_processor_cpc_cpf.yaml": "d8f424ec3fcfa4b2fcee2ad5e6456531",
         "0600_workgroup_manager_spi.yaml": "8b6a89de516bed5821a9849627ad634a",
@@ -120,8 +120,8 @@
       "files": {
         "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
         "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
-        "0200_system_speed_of_light.yaml": "a5ee49ce96bfab87128c856c827db870",
-        "0300_memory_chart.yaml": "e2401641a8f280fda308f87e5ad243df",
+        "0200_system_speed_of_light.yaml": "bf2ca00d4b255dbbe191a7641b81dc4c",
+        "0300_memory_chart.yaml": "2c82fa6f81a0dda679706d36b99e7913",
         "0400_roofline.yaml": "2bd3b630b72d6d165c0d30cf481136a9",
         "0500_command_processor_cpc_cpf.yaml": "3f7dab1663ad7a6fae3801aec2b1e8d0",
         "0600_workgroup_manager_spi.yaml": "e6546a92d283fed5a5dc6df203efb670",
@@ -139,4 +139,4 @@
       }
     }
   }
-}
+}
\ No newline at end of file