1 change: 1 addition & 0 deletions README.md
@@ -86,6 +86,7 @@ To assist in evaluating the health of target systems, the `report` command can r
| frequency | runs [avx-turbo](https://github.com/travisdowns/avx-turbo) to measure scalar and AVX frequencies across processor's cores. **Note:** Runtime increases with core count. |
| memory | runs [Intel(r) Memory Latency Checker](https://www.intel.com/content/www/us/en/download/736633/intel-memory-latency-checker-intel-mlc.html) (MLC) to measure memory bandwidth and latency across a load range. **Note: MLC is not included with PerfSpect.** It can be downloaded from [here](https://www.intel.com/content/www/us/en/download/736633/intel-memory-latency-checker-intel-mlc.html). Once downloaded, extract the Linux executable and place it in the perfspect/tools/x86_64 directory. |
| numa | runs Intel(r) Memory Latency Checker(MLC) to measure bandwidth between NUMA nodes. See Note above about downloading MLC. |
| stream | runs the [stream](https://www.cs.virginia.edu/stream/) benchmark to measure sustainable memory bandwidth and the corresponding computation rate for simple vector kernels. **Note:** The array size is set to 4X the last level cache size or 1M elements, whichever is larger; see the sizing sketch below the table. |
| storage | runs [fio](https://github.com/axboe/fio) for 2 minutes in read/write mode with a single worker to measure single-thread read and write bandwidth. Use the --storage-dir flag to override the default location. Minimum 5GB disk space required to run test. |
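For illustration only, the array-sizing rule from the stream row above can be expressed as the minimal Go sketch below; the 320 MiB LLC value is a hypothetical input, and 8-byte (double-precision) elements are assumed.

```go
package main

import "fmt"

// streamArrayElements applies the sizing rule noted above: each array holds
// at least 4x the total last-level cache, with a floor of 1M elements,
// assuming 8-byte (double-precision) elements.
func streamArrayElements(llcBytes int64) int64 {
	elements := 4 * llcBytes / 8
	if elements < 1_000_000 {
		return 1_000_000
	}
	return elements
}

func main() {
	llc := int64(320) * 1024 * 1024       // hypothetical 320 MiB total LLC
	fmt.Println(streamArrayElements(llc)) // 167772160 elements (~1280 MiB per array)
}
```

With a very small cache, the 1M-element floor takes over instead of the 4x rule.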

#### Telemetry Command
36 changes: 36 additions & 0 deletions THIRD_PARTY_PROGRAMS
@@ -596,5 +596,41 @@ turbostat
Copyright (c) 2023 Intel Corporation.
* Len Brown <[email protected]>
-------------------------------------------------------------
STREAM
*-----------------------------------------------------------------------
* Copyright 1991-2003: John D. McCalpin
*-----------------------------------------------------------------------
* License:
* 1. You are free to use this program and/or to redistribute
* this program.
* 2. You are free to modify this program for your own use,
* including commercial use, subject to the publication
* restrictions in item 3.
* 3. You are free to publish results obtained from running this
* program, or from works that you derive from this program,
* with the following limitations:
* 3a. In order to be referred to as "STREAM benchmark results",
* published results must be in conformance to the STREAM
* Run Rules, (briefly reviewed below) published at
* http://www.cs.virginia.edu/stream/ref.html
* and incorporated herein by reference.
* As the copyright holder, John McCalpin retains the
* right to determine conformity with the Run Rules.
* 3b. Results based on modified source code or on runs not in
* accordance with the STREAM Run Rules must be clearly
* labelled whenever they are published. Examples of
* proper labelling include:
* "tuned STREAM benchmark results"
* "based on a variant of the STREAM benchmark code"
* Other comparable, clear and reasonable labelling is
* acceptable.
* 3c. Submission of results to the STREAM benchmark web site
* is encouraged, but not required.
* 4. Use of this program or creation of derived works based on this
* program constitutes acceptance of these licensing restrictions.
* 5. Absolutely no warranty is expressed or implied.
*-----------------------------------------------------------------------

-------------------------------------------------------------

Other names and brands may be claimed as the property of others.
2 changes: 2 additions & 0 deletions cmd/report/report.go
@@ -135,6 +135,7 @@ var benchmarkOptions = []string{
"frequency",
"memory",
"numa",
"stream",
"storage",
}

@@ -147,6 +148,7 @@ var benchmarkTableNames = map[string][]string{
"frequency": {report.FrequencyBenchmarkTableName},
"memory": {report.MemoryBenchmarkTableName},
"numa": {report.NUMABenchmarkTableName},
"stream": {report.StreamBenchmarkTableName},
"storage": {report.StorageBenchmarkTableName},
}

19 changes: 19 additions & 0 deletions internal/report/table_defs.go
@@ -114,6 +114,7 @@ const (
FrequencyBenchmarkTableName = "Frequency Benchmark"
MemoryBenchmarkTableName = "Memory Benchmark"
NUMABenchmarkTableName = "NUMA Benchmark"
StreamBenchmarkTableName = "STREAM-like Benchmark"
StorageBenchmarkTableName = "Storage Benchmark"
// telemetry table names
CPUUtilizationTelemetryTableName = "CPU Utilization Telemetry"
@@ -652,6 +653,14 @@ var tableDefinitions = map[string]TableDefinition{
},
NoDataFound: "No NUMA benchmark data found. Please see the GitHub repository README for instructions on how to install Intel Memory Latency Checker (mlc).",
FieldsFunc: numaBenchmarkTableValues},
StreamBenchmarkTableName: {
Name: StreamBenchmarkTableName,
MenuLabel: StreamBenchmarkTableName,
HasRows: false,
ScriptNames: []string{
script.StreamBenchmarkScriptName,
},
FieldsFunc: streamBenchmarkTableValues},
StorageBenchmarkTableName: {
Name: StorageBenchmarkTableName,
MenuLabel: StorageBenchmarkTableName,
@@ -2303,6 +2312,16 @@ func numaBenchmarkTableValues(outputs map[string]script.ScriptOutput) []Field {
return fields
}

func streamBenchmarkTableValues(outputs map[string]script.ScriptOutput) []Field {
copy, scale, add, triad := streamFromOutput(outputs)
return []Field{
{Name: "Copy (MB/s)", Values: []string{copy}},
{Name: "Scale (MB/s)", Values: []string{scale}},
{Name: "Add (MB/s)", Values: []string{add}},
{Name: "Triad (MB/s)", Values: []string{triad}},
}
}

func storageBenchmarkTableValues(outputs map[string]script.ScriptOutput) []Field {
readBW, writeBW := storagePerfFromOutput(outputs)
if readBW == "" && writeBW == "" {
89 changes: 89 additions & 0 deletions internal/report/table_helpers.go
@@ -1924,3 +1924,92 @@ func maxRenderDepthFromOutput(outputs map[string]script.ScriptOutput) string {
}
return ""
}

// streamFromOutput parses the output from the STREAM memory bandwidth benchmark.
// It extracts the "Best Rate MB/s" values for the Copy, Scale, Add, and Triad operations.
// Returns empty strings if the benchmark output is missing or invalid.
func streamFromOutput(outputs map[string]script.ScriptOutput) (copy, scale, add, triad string) {
// example stream output:
// -------------------------------------------------------------
// STREAM version $Revision: 5.10 $
// -------------------------------------------------------------
// This system uses 8 bytes per array element.
// -------------------------------------------------------------
// Array size = 100000000 (elements), Offset = 56 (elements)
// Memory per array = 762.9 MiB (= 0.7 GiB).
// Total memory required = 2288.8 MiB (= 2.2 GiB).
// Each kernel will be executed 10 times.
// The *best* time for each kernel (excluding the first iteration)
// will be used to compute the reported bandwidth.
// -------------------------------------------------------------
// Number of Threads requested = 4
// Number of Threads counted = 4
// -------------------------------------------------------------
// Your clock granularity/precision appears to be 1 microseconds.
// Each test below will take on the order of 49865 microseconds.
// (= 49865 clock ticks)
// Increase the size of the arrays if this shows that
// you are not getting at least 20 clock ticks per test.
// -------------------------------------------------------------
// WARNING -- The above is only a rough guideline.
// For best results, please be sure you know the
// precision of your system timer.
// -------------------------------------------------------------
// Function Best Rate MB/s Avg time Min time Max time
// Copy: 30121.6 0.058569 0.053118 0.061632
// Scale: 31178.2 0.060676 0.051318 0.072578
// Add: 33726.8 0.076236 0.071160 0.081630
// Triad: 32308.4 0.078381 0.074284 0.089322
// -------------------------------------------------------------
// Solution Validates: avg error less than 1.000000e-13 on all three arrays
// -------------------------------------------------------------

// parse Best Rate for copy, scale, add and triad from stream output
streamOutput := outputs[script.StreamBenchmarkScriptName]

if streamOutput.Stdout == "" || streamOutput.Exitcode != 0 {
slog.Warn("Stream benchmark output is empty or exited with error",
"exitcode", streamOutput.Exitcode,
"stdout", streamOutput.Stdout,
"stderr", streamOutput.Stderr)
return "", "", "", ""
}

// Regular expressions to extract Best Rate MB/s values for each function
// The pattern matches the line format from STREAM benchmark output:
// Function Best Rate MB/s Avg time Min time Max time
// Copy: 30121.6 0.058569 0.053118 0.061632
copyRe := regexp.MustCompile(`Copy:\s+(\d+\.\d+)`)
scaleRe := regexp.MustCompile(`Scale:\s+(\d+\.\d+)`)
addRe := regexp.MustCompile(`Add:\s+(\d+\.\d+)`)
triadRe := regexp.MustCompile(`Triad:\s+(\d+\.\d+)`)

// Extract the values
copyMatch := copyRe.FindStringSubmatch(streamOutput.Stdout)
scaleMatch := scaleRe.FindStringSubmatch(streamOutput.Stdout)
addMatch := addRe.FindStringSubmatch(streamOutput.Stdout)
triadMatch := triadRe.FindStringSubmatch(streamOutput.Stdout)

// Check matches and assign values
if len(copyMatch) > 1 {
copy = copyMatch[1]
}
if len(scaleMatch) > 1 {
scale = scaleMatch[1]
}
if len(addMatch) > 1 {
add = addMatch[1]
}
if len(triadMatch) > 1 {
triad = triadMatch[1]
}

// Log the results
slog.Debug("Stream benchmark results parsed",
"copy", copy,
"scale", scale,
"add", add,
"triad", triad)

return copy, scale, add, triad
}
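A minimal test sketch, not part of this change, could feed the sample output from the comment above into `streamFromOutput` to confirm the regexes capture the Best Rate column; the import path and test placement are illustrative assumptions, while the `Stdout`/`Exitcode` fields mirror the usage above.

```go
package report

import (
	"testing"

	"perfspect/internal/script" // illustrative import path
)

func TestStreamFromOutput(t *testing.T) {
	// sample lines taken from the STREAM output shown in the comment above
	sample := "Function    Best Rate MB/s  Avg time     Min time     Max time\n" +
		"Copy:           30121.6     0.058569     0.053118     0.061632\n" +
		"Scale:          31178.2     0.060676     0.051318     0.072578\n" +
		"Add:            33726.8     0.076236     0.071160     0.081630\n" +
		"Triad:          32308.4     0.078381     0.074284     0.089322\n"
	outputs := map[string]script.ScriptOutput{
		script.StreamBenchmarkScriptName: {Stdout: sample, Exitcode: 0},
	}
	copyBW, scaleBW, addBW, triadBW := streamFromOutput(outputs)
	if copyBW != "30121.6" || scaleBW != "31178.2" || addBW != "33726.8" || triadBW != "32308.4" {
		t.Errorf("unexpected STREAM rates: %s %s %s %s", copyBW, scaleBW, addBW, triadBW)
	}
}
```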
157 changes: 157 additions & 0 deletions internal/script/script_defs.go
@@ -102,6 +102,7 @@ const (
// benchmark scripts
MemoryBenchmarkScriptName = "memory benchmark"
NumaBenchmarkScriptName = "numa benchmark"
StreamBenchmarkScriptName = "stream benchmark"
SpeedBenchmarkScriptName = "speed benchmark"
FrequencyBenchmarkScriptName = "frequency benchmark"
PowerBenchmarkScriptName = "power benchmark"
@@ -996,6 +997,162 @@ echo $orig_num_huge_pages > /proc/sys/vm/nr_hugepages
Depends: []string{"mlc"},
Sequential: true,
},
StreamBenchmarkScriptName: {
Name: StreamBenchmarkScriptName,
ScriptTemplate: `
# The general rule for STREAM is that each array must be at least 4x the size of the sum of all
# the last-level caches used in the run, or 1 Million elements -- whichever is larger.
#
# Get the total LLC (Last Level Cache) size from lscpu
# Example: "L3 cache: 672 MiB (2 instances)"
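# A hypothetical walk-through of the arithmetic performed below for that example line:
#   672 MiB x 2 instances         = 1,409,286,144 bytes of LLC
#   x 4 (minimum array size rule) = 5,637,144,576 bytes per array
#   / 8 bytes per element         = 704,643,072 elements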
llc_info=$(lscpu | grep "L3 cache")
llc_detected=true

if [[ -z "$llc_info" ]]; then
echo "Error: L3 cache information not found. Cannot proceed with STREAM benchmark."
llc_detected=false
llc_size_bytes=0
else
# Extract cache size value and unit
llc_size=$(echo "$llc_info" | awk '{print $3}')
llc_unit=$(echo "$llc_info" | awk '{print $4}')

# Extract number of instances, default to 1 if not found
if [[ "$llc_info" =~ \(([0-9]+)[[:space:]]*instance ]]; then
instances=${BASH_REMATCH[1]}
else
instances=1
echo "Warning: Could not determine number of cache instances. Assuming 1 instance."
fi

# Convert to bytes based on unit
case "$llc_unit" in
"KiB")
llc_size_bytes=$((llc_size * 1024 * instances))
;;
"MiB")
llc_size_bytes=$((llc_size * 1024 * 1024 * instances))
;;
"GiB")
llc_size_bytes=$((llc_size * 1024 * 1024 * 1024 * instances))
;;
*)
echo "Warning: Unknown cache size unit: $llc_unit. Defaulting to KiB."
llc_size_bytes=$((llc_size * 1024 * instances))
;;
esac
fi

# Check if we have a valid LLC size
if [[ $llc_size_bytes -le 0 ]]; then
echo "Warning: Invalid L3 cache size detected: $llc_size_bytes bytes"
llc_detected=false
# Set a reasonable default - 1 million elements
min_array_elements=1000000
else
echo "Total L3 cache size: $((llc_size_bytes / 1024 / 1024)) MiB across $instances instance(s)"
min_array_size=$((4 * llc_size_bytes))
min_array_elements=$((min_array_size / 8)) # Assuming double precision (8 bytes per element)
fi
# Ensure we meet the minimum requirement of 1 million elements
if [[ $min_array_elements -lt 1000000 ]]; then
echo "Minimum array size (1 million elements) exceeds 4x L3 cache size"
min_array_elements=1000000
else
echo "Using array size of $min_array_elements elements (4x L3 cache size)"
echo "This equals $((min_array_elements * 8 / 1024 / 1024)) MiB per array, $((3 * min_array_elements * 8 / 1024 / 1024)) MiB total"
fi

# Log cache detection status for clarity
if [[ "$llc_detected" == "true" ]]; then
echo "LLC detection: Successful"
else
echo "LLC detection: Failed"
fi

# Set array size based on the detected LLC size
array_size=$min_array_elements

# Exit if LLC wasn't detected
if [[ "$llc_detected" == "false" ]]; then
echo "Cannot run STREAM benchmark without LLC information."
exit 1
fi
thp_enabled=$( cat /sys/kernel/mm/transparent_hugepage/enabled | grep -o '\[.*\]' | tr -d '[]' )
if [ "$thp_enabled" != "always" ]; then
echo "Transparent Hugepages (THP) are not enabled. Enabling THP for the benchmark."
echo always > /sys/kernel/mm/transparent_hugepage/enabled
fi
thp_defrag=$( cat /sys/kernel/mm/transparent_hugepage/defrag | grep -o '\[.*\]' | tr -d '[]' )
if [ "$thp_defrag" != "always" ]; then
echo "Transparent Hugepages (THP) defrag is not set to 'always'. Setting it to 'always' for the benchmark."
echo always > /sys/kernel/mm/transparent_hugepage/defrag
fi
# Clear caches and compact memory before running the benchmark
echo 3 > /proc/sys/vm/drop_caches
echo 1 > /proc/sys/vm/compact_memory

# Get the number of physical cores from lscpu
cores_per_socket=$(lscpu | grep "Core(s) per socket" | awk '{print $4}')
if [[ -z "$cores_per_socket" ]]; then
echo "Error: Failed to determine cores per socket from lscpu output."
exit 1
fi

sockets=$(lscpu | grep "Socket(s)" | awk '{print $2}')
if [[ -z "$sockets" ]]; then
echo "Error: Failed to determine number of sockets from lscpu output."
exit 1
fi

echo "Detected $cores_per_socket cores per socket across $sockets socket(s)"
num_physical_cores=$((cores_per_socket * sockets))

# Exit with error if we couldn't determine the core count
if [[ -z "$num_physical_cores" || "$num_physical_cores" -le 0 ]]; then
echo "Error: Could not determine the number of physical cores."
echo "STREAM benchmark requires knowing the number of physical cores for proper thread configuration."
exit 1
fi

echo "Using $num_physical_cores physical cores for STREAM benchmark"
export OMP_NUM_THREADS=$num_physical_cores

# Set KMP settings for optimal performance based on SMT enabled/disabled
# https://www.intel.com/content/www/us/en/developer/articles/technical/optimizing-memory-bandwidth-on-stream-triad.html
# If SMT is enabled, use 'granularity=fine,compact,1,0'
# If SMT is disabled, use 'compact'
smt_enabled=$(lscpu | grep "Thread(s) per core" | awk '{print $4}')
if [[ -z "$smt_enabled" ]]; then
echo "Error: Failed to determine if SMT is enabled from lscpu output."
exit 1
fi
if [[ "$smt_enabled" -gt 1 ]]; then
echo "SMT is enabled with $smt_enabled threads per core. Setting KMP_AFFINITY to 'granularity=fine,compact,1,0'"
export KMP_AFFINITY=granularity=fine,compact,1,0
else
echo "SMT is disabled. Setting KMP_AFFINITY to 'compact'"
export KMP_AFFINITY=compact
fi

echo "Running STREAM benchmark with array size of $array_size elements"
echo "($(($array_size * 8 / 1024 / 1024)) MiB per array, $((3 * $array_size * 8 / 1024 / 1024)) MiB total)"

# Run the STREAM benchmark with the calculated optimal array size
./stream -s $array_size -n 100

# Restore original THP settings
if [ "$thp_enabled" != "always" ]; then
echo "$thp_enabled" > /sys/kernel/mm/transparent_hugepage/enabled
fi
if [ "$thp_defrag" != "always" ]; then
echo "$thp_defrag" > /sys/kernel/mm/transparent_hugepage/defrag
fi
`,
Superuser: true,
Depends: []string{"stream"},
Sequential: true,
},
SpeedBenchmarkScriptName: {
Name: SpeedBenchmarkScriptName,
ScriptTemplate: `methods=$( stress-ng --cpu 1 --cpu-method x 2>&1 | cut -d":" -f2 | cut -c 6- )