1 change: 1 addition & 0 deletions README.md
@@ -86,6 +86,7 @@ To assist in evaluating the health of target systems, the `report` command can r
| frequency | runs [avx-turbo](https://github.com/travisdowns/avx-turbo) to measure scalar and AVX frequencies across processor's cores. **Note:** Runtime increases with core count. |
| memory | runs [Intel(r) Memory Latency Checker](https://www.intel.com/content/www/us/en/download/736633/intel-memory-latency-checker-intel-mlc.html) (MLC) to measure memory bandwidth and latency across a load range. **Note: MLC is not included with PerfSpect.** It can be downloaded from [here](https://www.intel.com/content/www/us/en/download/736633/intel-memory-latency-checker-intel-mlc.html). Once downloaded, extract the Linux executable and place it in the perfspect/tools/x86_64 directory. |
| numa | runs Intel(r) Memory Latency Checker(MLC) to measure bandwidth between NUMA nodes. See Note above about downloading MLC. |
| stream | runs the [stream](https://www.cs.virginia.edu/stream/) benchmark to measure sustainable memory bandwidth and the corresponding computation rate for simple vector kernels. **Note:** The array size is set to 4X the last level cache size or 1M elements, whichever is larger; see the sizing sketch below the table. |
| storage | runs [fio](https://github.com/axboe/fio) for 2 minutes in read/write mode with a single worker to measure single-thread read and write bandwidth. Use the --storage-dir flag to override the default location. Minimum 5GB disk space required to run test. |
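For illustration only, the array-sizing rule from the stream row above can be expressed as the minimal Go sketch below; the 320 MiB LLC value is a hypothetical input, and 8-byte (double-precision) elements are assumed.

```go
package main

import "fmt"

// streamArrayElements applies the sizing rule noted above: each array holds
// at least 4x the total last-level cache, with a floor of 1M elements,
// assuming 8-byte (double-precision) elements.
func streamArrayElements(llcBytes int64) int64 {
	elements := 4 * llcBytes / 8
	if elements < 1_000_000 {
		return 1_000_000
	}
	return elements
}

func main() {
	llc := int64(320) * 1024 * 1024       // hypothetical 320 MiB total LLC
	fmt.Println(streamArrayElements(llc)) // 167772160 elements (~1280 MiB per array)
}
```

With a very small cache, the 1M-element floor takes over instead of the 4x rule.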

#### Telemetry Command
36 changes: 36 additions & 0 deletions THIRD_PARTY_PROGRAMS
@@ -596,5 +596,41 @@ turbostat
Copyright (c) 2023 Intel Corporation.
* Len Brown <[email protected]>
-------------------------------------------------------------
STREAM
*-----------------------------------------------------------------------
* Copyright 1991-2003: John D. McCalpin
*-----------------------------------------------------------------------
* License:
* 1. You are free to use this program and/or to redistribute
* this program.
* 2. You are free to modify this program for your own use,
* including commercial use, subject to the publication
* restrictions in item 3.
* 3. You are free to publish results obtained from running this
* program, or from works that you derive from this program,
* with the following limitations:
* 3a. In order to be referred to as "STREAM benchmark results",
* published results must be in conformance to the STREAM
* Run Rules, (briefly reviewed below) published at
* http://www.cs.virginia.edu/stream/ref.html
* and incorporated herein by reference.
* As the copyright holder, John McCalpin retains the
* right to determine conformity with the Run Rules.
* 3b. Results based on modified source code or on runs not in
* accordance with the STREAM Run Rules must be clearly
* labelled whenever they are published. Examples of
* proper labelling include:
* "tuned STREAM benchmark results"
* "based on a variant of the STREAM benchmark code"
* Other comparable, clear and reasonable labelling is
* acceptable.
* 3c. Submission of results to the STREAM benchmark web site
* is encouraged, but not required.
* 4. Use of this program or creation of derived works based on this
* program constitutes acceptance of these licensing restrictions.
* 5. Absolutely no warranty is expressed or implied.
*-----------------------------------------------------------------------

-------------------------------------------------------------

Other names and brands may be claimed as the property of others.
2 changes: 2 additions & 0 deletions cmd/report/report.go
@@ -135,6 +135,7 @@ var benchmarkOptions = []string{
"frequency",
"memory",
"numa",
"stream",
"storage",
}

@@ -147,6 +148,7 @@ var benchmarkTableNames = map[string][]string{
"frequency": {report.FrequencyBenchmarkTableName},
"memory": {report.MemoryBenchmarkTableName},
"numa": {report.NUMABenchmarkTableName},
"stream": {report.StreamBenchmarkTableName},
"storage": {report.StorageBenchmarkTableName},
}

19 changes: 19 additions & 0 deletions internal/report/table_defs.go
@@ -114,6 +114,7 @@ const (
FrequencyBenchmarkTableName = "Frequency Benchmark"
MemoryBenchmarkTableName = "Memory Benchmark"
NUMABenchmarkTableName = "NUMA Benchmark"
StreamBenchmarkTableName = "STREAM-like Benchmark"
StorageBenchmarkTableName = "Storage Benchmark"
// telemetry table names
CPUUtilizationTelemetryTableName = "CPU Utilization Telemetry"
@@ -652,6 +653,14 @@ var tableDefinitions = map[string]TableDefinition{
},
NoDataFound: "No NUMA benchmark data found. Please see the GitHub repository README for instructions on how to install Intel Memory Latency Checker (mlc).",
FieldsFunc: numaBenchmarkTableValues},
StreamBenchmarkTableName: {
Name: StreamBenchmarkTableName,
MenuLabel: StreamBenchmarkTableName,
HasRows: false,
ScriptNames: []string{
script.StreamBenchmarkScriptName,
},
FieldsFunc: streamBenchmarkTableValues},
StorageBenchmarkTableName: {
Name: StorageBenchmarkTableName,
MenuLabel: StorageBenchmarkTableName,
@@ -2303,6 +2312,16 @@ func numaBenchmarkTableValues(outputs map[string]script.ScriptOutput) []Field {
return fields
}

func streamBenchmarkTableValues(outputs map[string]script.ScriptOutput) []Field {
copy, scale, add, triad := streamFromOutput(outputs)
return []Field{
{Name: "Copy (MB/s)", Values: []string{copy}},
{Name: "Scale (MB/s)", Values: []string{scale}},
{Name: "Add (MB/s)", Values: []string{add}},
{Name: "Triad (MB/s)", Values: []string{triad}},
}
}

func storageBenchmarkTableValues(outputs map[string]script.ScriptOutput) []Field {
readBW, writeBW := storagePerfFromOutput(outputs)
if readBW == "" && writeBW == "" {
89 changes: 89 additions & 0 deletions internal/report/table_helpers.go
@@ -1924,3 +1924,92 @@ func maxRenderDepthFromOutput(outputs map[string]script.ScriptOutput) string {
}
return ""
}

// streamFromOutput parses the output from the STREAM memory bandwidth benchmark.
// It extracts the "Best Rate MB/s" values for the Copy, Scale, Add, and Triad operations.
// Returns empty strings if the benchmark output is missing or invalid.
func streamFromOutput(outputs map[string]script.ScriptOutput) (copy, scale, add, triad string) {
// example stream output:
// -------------------------------------------------------------
// STREAM version $Revision: 5.10 $
// -------------------------------------------------------------
// This system uses 8 bytes per array element.
// -------------------------------------------------------------
// Array size = 100000000 (elements), Offset = 56 (elements)
// Memory per array = 762.9 MiB (= 0.7 GiB).
// Total memory required = 2288.8 MiB (= 2.2 GiB).
// Each kernel will be executed 10 times.
// The *best* time for each kernel (excluding the first iteration)
// will be used to compute the reported bandwidth.
// -------------------------------------------------------------
// Number of Threads requested = 4
// Number of Threads counted = 4
// -------------------------------------------------------------
// Your clock granularity/precision appears to be 1 microseconds.
// Each test below will take on the order of 49865 microseconds.
// (= 49865 clock ticks)
// Increase the size of the arrays if this shows that
// you are not getting at least 20 clock ticks per test.
// -------------------------------------------------------------
// WARNING -- The above is only a rough guideline.
// For best results, please be sure you know the
// precision of your system timer.
// -------------------------------------------------------------
// Function Best Rate MB/s Avg time Min time Max time
// Copy: 30121.6 0.058569 0.053118 0.061632
// Scale: 31178.2 0.060676 0.051318 0.072578
// Add: 33726.8 0.076236 0.071160 0.081630
// Triad: 32308.4 0.078381 0.074284 0.089322
// -------------------------------------------------------------
// Solution Validates: avg error less than 1.000000e-13 on all three arrays
// -------------------------------------------------------------

// parse Best Rate for copy, scale, add and triad from stream output
streamOutput := outputs[script.StreamBenchmarkScriptName]

if streamOutput.Stdout == "" || streamOutput.Exitcode != 0 {
slog.Warn("Stream benchmark output is empty or exited with error",
"exitcode", streamOutput.Exitcode,
"stdout", streamOutput.Stdout,
"stderr", streamOutput.Stderr)
return "", "", "", ""
}

// Regular expressions to extract Best Rate MB/s values for each function
// The pattern matches the line format from STREAM benchmark output:
// Function Best Rate MB/s Avg time Min time Max time
// Copy: 30121.6 0.058569 0.053118 0.061632
copyRe := regexp.MustCompile(`Copy:\s+(\d+\.\d+)`)
scaleRe := regexp.MustCompile(`Scale:\s+(\d+\.\d+)`)
addRe := regexp.MustCompile(`Add:\s+(\d+\.\d+)`)
triadRe := regexp.MustCompile(`Triad:\s+(\d+\.\d+)`)

// Extract the values
copyMatch := copyRe.FindStringSubmatch(streamOutput.Stdout)
scaleMatch := scaleRe.FindStringSubmatch(streamOutput.Stdout)
addMatch := addRe.FindStringSubmatch(streamOutput.Stdout)
triadMatch := triadRe.FindStringSubmatch(streamOutput.Stdout)

// Check matches and assign values
if len(copyMatch) > 1 {
copy = copyMatch[1]
}
if len(scaleMatch) > 1 {
scale = scaleMatch[1]
}
if len(addMatch) > 1 {
add = addMatch[1]
}
if len(triadMatch) > 1 {
triad = triadMatch[1]
}

// Log the results
slog.Debug("Stream benchmark results parsed",
"copy", copy,
"scale", scale,
"add", add,
"triad", triad)

return copy, scale, add, triad
}
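A minimal test sketch, not part of this change, could feed the sample output from the comment above into `streamFromOutput` to confirm the regexes capture the Best Rate column; the import path and test placement are illustrative assumptions, while the `Stdout`/`Exitcode` fields mirror the usage above.

```go
package report

import (
	"testing"

	"perfspect/internal/script" // illustrative import path
)

func TestStreamFromOutput(t *testing.T) {
	// sample lines taken from the STREAM output shown in the comment above
	sample := "Function    Best Rate MB/s  Avg time     Min time     Max time\n" +
		"Copy:           30121.6     0.058569     0.053118     0.061632\n" +
		"Scale:          31178.2     0.060676     0.051318     0.072578\n" +
		"Add:            33726.8     0.076236     0.071160     0.081630\n" +
		"Triad:          32308.4     0.078381     0.074284     0.089322\n"
	outputs := map[string]script.ScriptOutput{
		script.StreamBenchmarkScriptName: {Stdout: sample, Exitcode: 0},
	}
	copyBW, scaleBW, addBW, triadBW := streamFromOutput(outputs)
	if copyBW != "30121.6" || scaleBW != "31178.2" || addBW != "33726.8" || triadBW != "32308.4" {
		t.Errorf("unexpected STREAM rates: %s %s %s %s", copyBW, scaleBW, addBW, triadBW)
	}
}
```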
157 changes: 157 additions & 0 deletions internal/script/script_defs.go
@@ -102,6 +102,7 @@ const (
// benchmark scripts
MemoryBenchmarkScriptName = "memory benchmark"
NumaBenchmarkScriptName = "numa benchmark"
StreamBenchmarkScriptName = "stream benchmark"
SpeedBenchmarkScriptName = "speed benchmark"
FrequencyBenchmarkScriptName = "frequency benchmark"
PowerBenchmarkScriptName = "power benchmark"
@@ -996,6 +997,162 @@ echo $orig_num_huge_pages > /proc/sys/vm/nr_hugepages
Depends: []string{"mlc"},
Sequential: true,
},
StreamBenchmarkScriptName: {
Name: StreamBenchmarkScriptName,
ScriptTemplate: `
# The general rule for STREAM is that each array must be at least 4x the size of the sum of all
# the last-level caches used in the run, or 1 Million elements -- whichever is larger.
#
# Get the total LLC (Last Level Cache) size from lscpu
# Example: "L3 cache: 672 MiB (2 instances)"
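# A hypothetical walk-through of the arithmetic performed below for that example line:
#   672 MiB x 2 instances         = 1,409,286,144 bytes of LLC
#   x 4 (minimum array size rule) = 5,637,144,576 bytes per array
#   / 8 bytes per element         = 704,643,072 elements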
llc_info=$(lscpu | grep "L3 cache")
llc_detected=true

if [[ -z "$llc_info" ]]; then
echo "Error: L3 cache information not found. Cannot proceed with STREAM benchmark."
llc_detected=false
llc_size_bytes=0
else
# Extract cache size value and unit
llc_size=$(echo "$llc_info" | awk '{print $3}')
llc_unit=$(echo "$llc_info" | awk '{print $4}')

# Extract number of instances, default to 1 if not found
if [[ "$llc_info" =~ \(([0-9]+)[[:space:]]*instance ]]; then
instances=${BASH_REMATCH[1]}
else
instances=1
echo "Warning: Could not determine number of cache instances. Assuming 1 instance."
fi

# Convert to bytes based on unit
case "$llc_unit" in
"KiB")
llc_size_bytes=$((llc_size * 1024 * instances))
;;
"MiB")
llc_size_bytes=$((llc_size * 1024 * 1024 * instances))
;;
"GiB")
llc_size_bytes=$((llc_size * 1024 * 1024 * 1024 * instances))
;;
*)
echo "Warning: Unknown cache size unit: $llc_unit. Defaulting to KiB."
llc_size_bytes=$((llc_size * 1024 * instances))
;;
esac
fi

# Check if we have a valid LLC size
if [[ $llc_size_bytes -le 0 ]]; then
echo "Warning: Invalid L3 cache size detected: $llc_size_bytes bytes"
llc_detected=false
# Set a reasonable default - 1 million elements
min_array_elements=1000000
else
echo "Total L3 cache size: $((llc_size_bytes / 1024 / 1024)) MiB across $instances instance(s)"
min_array_size=$((4 * llc_size_bytes))
min_array_elements=$((min_array_size / 8)) # Assuming double precision (8 bytes per element)
fi
# Ensure we meet the minimum requirement of 1 million elements
if [[ $min_array_elements -lt 1000000 ]]; then
echo "Minimum array size (1 million elements) exceeds 4x L3 cache size"
min_array_elements=1000000
else
echo "Using array size of $min_array_elements elements (4x L3 cache size)"
echo "This equals $((min_array_elements * 8 / 1024 / 1024)) MiB per array, $((3 * min_array_elements * 8 / 1024 / 1024)) MiB total"
fi

# Log cache detection status for clarity
if [[ "$llc_detected" == "true" ]]; then
echo "LLC detection: Successful"
else
echo "LLC detection: Failed"
fi

# Set array size based on the detected LLC size
array_size=$min_array_elements

# Exit if LLC wasn't detected
if [[ "$llc_detected" == "false" ]]; then
echo "Cannot run STREAM benchmark without LLC information."
exit 1
fi
thp_enabled=$( cat /sys/kernel/mm/transparent_hugepage/enabled | grep -o '\[.*\]' | tr -d '[]' )
if [ "$thp_enabled" != "always" ]; then
echo "Transparent Hugepages (THP) are not enabled. Enabling THP for the benchmark."
echo always > /sys/kernel/mm/transparent_hugepage/enabled
fi
thp_defrag=$( cat /sys/kernel/mm/transparent_hugepage/defrag | grep -o '\[.*\]' | tr -d '[]' )
if [ "$thp_defrag" != "always" ]; then
echo "Transparent Hugepages (THP) defrag is not set to 'always'. Setting it to 'always' for the benchmark."
echo always > /sys/kernel/mm/transparent_hugepage/defrag
fi
# Clear caches and compact memory before running the benchmark
echo 3 > /proc/sys/vm/drop_caches
echo 1 > /proc/sys/vm/compact_memory

# Get the number of physical cores from lscpu
cores_per_socket=$(lscpu | grep "Core(s) per socket" | awk '{print $4}')
if [[ -z "$cores_per_socket" ]]; then
echo "Error: Failed to determine cores per socket from lscpu output."
exit 1
fi

sockets=$(lscpu | grep "Socket(s)" | awk '{print $2}')
if [[ -z "$sockets" ]]; then
echo "Error: Failed to determine number of sockets from lscpu output."
exit 1
fi

echo "Detected $cores_per_socket cores per socket across $sockets socket(s)"
num_physical_cores=$((cores_per_socket * sockets))

# Exit with error if we couldn't determine the core count
if [[ -z "$num_physical_cores" || "$num_physical_cores" -le 0 ]]; then
echo "Error: Could not determine the number of physical cores."
echo "STREAM benchmark requires knowing the number of physical cores for proper thread configuration."
exit 1
fi

echo "Using $num_physical_cores physical cores for STREAM benchmark"
export OMP_NUM_THREADS=$num_physical_cores

# Set KMP settings for optimal performance based on SMT enabled/disabled
# https://www.intel.com/content/www/us/en/developer/articles/technical/optimizing-memory-bandwidth-on-stream-triad.html
# If SMT is enabled, use 'granularity=fine,compact,1,0'
# If SMT is disabled, use 'compact'
smt_enabled=$(lscpu | grep "Thread(s) per core" | awk '{print $4}')
if [[ -z "$smt_enabled" ]]; then
echo "Error: Failed to determine if SMT is enabled from lscpu output."
exit 1
fi
if [[ "$smt_enabled" -gt 1 ]]; then
echo "SMT is enabled with $smt_enabled threads per core. Setting KMP_AFFINITY to 'granularity=fine,compact,1,0'"
export KMP_AFFINITY=granularity=fine,compact,1,0
else
echo "SMT is disabled. Setting KMP_AFFINITY to 'compact'"
export KMP_AFFINITY=compact
fi

echo "Running STREAM benchmark with array size of $array_size elements"
echo "($(($array_size * 8 / 1024 / 1024)) MiB per array, $((3 * $array_size * 8 / 1024 / 1024)) MiB total)"

# Run the STREAM benchmark with the calculated optimal array size
./stream -s $array_size -n 100

# Restore original THP settings
if [ "$thp_enabled" != "always" ]; then
echo "$thp_enabled" > /sys/kernel/mm/transparent_hugepage/enabled
fi
if [ "$thp_defrag" != "always" ]; then
echo "$thp_defrag" > /sys/kernel/mm/transparent_hugepage/defrag
fi
`,
Superuser: true,
Depends: []string{"stream"},
Sequential: true,
},
SpeedBenchmarkScriptName: {
Name: SpeedBenchmarkScriptName,
ScriptTemplate: `methods=$( stress-ng --cpu 1 --cpu-method x 2>&1 | cut -d":" -f2 | cut -c 6- )