from cvs.lib.linux_utils import detect_distro


def _run_install_cmds(phdl, cmds, pause=3):
    """Run shell commands on all nodes in order, failing on error patterns.

    Args:
        phdl: Parallel SSH handle; ``exec(cmd)`` returns ``{node: output}``.
        cmds: Ordered list of shell command strings to execute.
        pause: Seconds to wait between commands (lets the package manager /
            systemd settle before the next step).
    """
    for cmd in cmds:
        out_dict = phdl.exec(cmd)
        for node, output in out_dict.items():
            # Package managers report broken repos/deps with 'error'/'fail'.
            if re.search('error|fail', output, re.I):
                fail_test(f'Failed to execute "{cmd}" on node {node}, please check logs')
        time.sleep(pause)


def install_docker_on_rhel(phdl):
    """Install Docker CE on RHEL/CentOS/Fedora-family nodes.

    Docker CE is distributed from the Docker Inc. repo for RHEL/Alma/CentOS,
    so the repo must be added first and installed via dnf; installing from
    the default repos fails.
    """
    _run_install_cmds(phdl, [
        'sudo dnf -y install dnf-plugins-core',
        'sudo dnf config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo',
        'sudo dnf -y install docker-ce docker-ce-cli containerd.io docker-compose-plugin',
        'sudo systemctl start docker',
        'sudo systemctl enable docker',
    ])


def install_docker_on_suse(phdl):
    """Install Docker on SLES nodes via zypper (distro-shipped package)."""
    _run_install_cmds(phdl, [
        'sudo zypper refresh',
        'sudo zypper -n install docker',
        'sudo systemctl start docker',
        'sudo systemctl enable docker',
    ])


def install_docker(phdl):
    """Install Docker using the appropriate method for the detected distro."""
    distro = detect_distro(phdl)
    if distro == 'debian':
        install_docker_on_ubuntu(phdl)
    elif distro == 'rhel':
        install_docker_on_rhel(phdl)
    elif distro == 'suse':
        install_docker_on_suse(phdl)
    else:
        # detect_distro currently falls back to 'debian', so this branch is
        # defensive; it catches future distro values added without an installer.
        fail_test(f'No docker install method for distro "{distro}"')
# linux distro discovery and package management
def detect_distro(phdl):
    """Detect the Linux distro family of the cluster nodes.

    Reads /etc/os-release and classifies the first matching node's output
    into 'debian', 'rhel' or 'suse'. Assumes a homogeneous cluster — the
    result of the first node that matches is returned for all.

    Returns:
        str: 'debian', 'rhel' or 'suse' ('debian' as default fallback).
    """
    out_dict = phdl.exec('cat /etc/os-release')
    for node, output in out_dict.items():
        if re.search('ubuntu|debian', output, re.I):
            return 'debian'
        if re.search('rhel|centos|fedora|rocky|alma', output, re.I):
            return 'rhel'
        if re.search('sles|suse', output, re.I):
            return 'suse'
    return 'debian'  # Default fallback


def get_package_manager_cmd(distro, action='install'):
    """Return the package-manager command prefix for a distro and action.

    Args:
        distro: 'debian', 'rhel' or 'suse' (unknown values fall back to debian).
        action: 'update', 'install' or 'remove' (unknown actions yield '').
    """
    commands = {
        'debian': {
            'update': 'sudo apt-get update -y',
            'install': 'sudo apt-get install -y',
            'remove': 'sudo apt-get remove -y',
        },
        'rhel': {
            # dnf check-update exits non-zero when updates exist; '|| true'
            # keeps the command from being treated as a failure.
            'update': 'sudo dnf check-update || true',
            'install': 'sudo dnf install -y',
            'remove': 'sudo dnf remove -y',
        },
        'suse': {
            'update': 'sudo zypper refresh',
            'install': 'sudo zypper install -y',
            'remove': 'sudo zypper remove -y',
        },
    }
    return commands.get(distro, commands['debian']).get(action, '')


def translate_package_name(package, distro):
    """Translate a Debian package name to its equivalent on another distro.

    Some packages differ in naming across distros; map to the right one.

    Returns:
        str: the translated (or unchanged) package name, or
        list[str]: when one Debian package maps to several packages
        (e.g. build-essential), or
        None: when the package is not needed on the target distro.
    """
    if distro == 'debian':
        return package

    # Package name mappings for non-Debian distros
    package_map = {
        'rhel': {
            'libgtest-dev': 'gtest-devel',
            'libpci-dev': 'pciutils-devel',
            'libpci3': 'pciutils',
            'libyaml-cpp-dev': 'yaml-cpp-devel',
            'libibverbs-dev': 'libibverbs-devel',
            'librdmacm-dev': 'librdmacm-devel',
            'libibumad-dev': 'libibumad-devel',
            'openmpi-bin': 'openmpi',
            'openmpi-common': 'openmpi',
            'libopenmpi-dev': 'openmpi-devel',
            'hipblaslt-dev': 'hipblaslt-devel',
            'ibverbs-providers': 'rdma-core',
            'build-essential': ['gcc', 'gcc-c++', 'make'],  # expands to multiple packages
            'apt-transport-https': None,  # Not needed
            'software-properties-common': None,  # Not needed
        },
        'suse': {
            'libgtest-dev': 'gtest',
            'libpci-dev': 'pciutils-devel',
            'libpci3': 'pciutils',
            'libyaml-cpp-dev': 'libyaml-cpp-devel',
            'openmpi-bin': 'openmpi',
            'libopenmpi-dev': 'openmpi-devel',
            'build-essential': ['gcc', 'gcc-c++', 'make'],
        },
    }

    return package_map.get(distro, {}).get(package, package)


def map_packages(distro, packages):
    """Translate a list of Debian package names for the distro.

    Drops packages that are not needed on the target distro and flattens
    packages that expand to multiple names (e.g. build-essential).
    """
    result = []
    for pkg in packages:
        translated = translate_package_name(pkg, distro)
        if translated is None:
            continue  # not needed on this distro
        if isinstance(translated, list):
            result.extend(translated)
        else:
            result.append(translated)
    return result


def install_package(hdl, package, distro=None, timeout=200):
    """Install one package using the detected distro's package manager.

    Args:
        hdl: SSH handle; ``exec(cmd, timeout=...)`` returns ``{node: output}``.
        package: Debian-style package name (translated per distro).
        distro: Optional pre-detected distro; auto-detected when None.
        timeout: Seconds allowed for the install command.

    Returns:
        dict: ``{node: output}`` for error checking by the caller
        (empty dict when the package is skipped on this distro).
    """
    if distro is None:
        distro = detect_distro(hdl)

    translated_pkg = translate_package_name(package, distro)
    if translated_pkg is None:
        log.info(f'Package {package} not needed on {distro}, skipping')
        return {}
    if isinstance(translated_pkg, list):
        # Multi-package expansion (e.g. build-essential) — install as one command.
        translated_pkg = ' '.join(translated_pkg)

    install_cmd = get_package_manager_cmd(distro, 'install')
    return hdl.exec(f'{install_cmd} {translated_pkg}', timeout=timeout)


def update_package_cache(hdl, distro=None, timeout=600):
    """Update the package manager cache.

    Returns:
        dict: ``{node: output}`` for error checking by the caller.
    """
    if distro is None:
        distro = detect_distro(hdl)
    update_cmd = get_package_manager_cmd(distro, 'update')
    return hdl.exec(update_cmd, timeout=timeout)


def install_build_tools(hdl, distro, timeout=200):
    """Install the compiler/build toolchain appropriate for the distro.

    Returns:
        dict: merged ``{node: output}`` from the install command(s).
    """
    if distro == 'debian':
        return install_package(hdl, 'build-essential', distro, timeout)
    if distro in ('rhel', 'suse'):
        results = {}
        for pkg in ['gcc', 'gcc-c++', 'make']:
            results.update(install_package(hdl, pkg, distro, timeout))
        return results
    return {}  # unknown distro: nothing installed
- """ - return pytestconfig.getoption("cluster_file") - - -@pytest.fixture(scope="module") -def config_file(pytestconfig): - """ - Retrieve the --config_file CLI option provided to pytest. - - Args: - pytestconfig: Built-in pytest fixture exposing command-line options. - - Returns: - str: Path to the test configuration JSON file. - - Notes: - - Ensure your pytest invocation includes: --config_file=/path/to/config.json - - Module scope ensures this runs once per module to avoid repeated lookups. - """ - return pytestconfig.getoption("config_file") - - - - - -@pytest.fixture(scope="module") -def cluster_dict(cluster_file): - """ - Load and return the entire cluster configuration from JSON. - - Args: - cluster_file (str): Path to the cluster JSON file. - - Returns: - dict: Parsed cluster configuration (nodes, credentials, etc.). - """ - with open(cluster_file) as json_file: - cluster_dict = json.load(json_file) - - # Resolve path placeholders like {user-id} in cluster config - cluster_dict = resolve_cluster_config_placeholders(cluster_dict) - - log.info(cluster_dict) - return cluster_dict - - - - - -@pytest.fixture(scope="module") -def config_dict(config_file, cluster_dict): - """ - Load and return the 'babelstream' subsection from the test configuration JSON. - - Args: - config_file (str): Path to the test configuration JSON. - - Returns: - dict: The 'babelstream' configuration block, expected to include: - - path: location where hip-stream (BabelStream HIP binary) will live - - git_install_path: directory to clone and build BabelStream - - git_url: BabelStream repository URL - - results: expected performance thresholds for kernels (copy/add/mul/triad/dot) - """ - with open(config_file) as json_file: - config_dict_t = json.load(json_file) - config_dict = config_dict_t['babelstream'] - - # Resolve path placeholders like {user-id}, {home-mount-dir}, etc. 
- config_dict = resolve_test_config_placeholders(config_dict, cluster_dict) - - log.info(config_dict) - return config_dict - - - -def parse_babelstream_results( out_dict, exp_dict ): - """ - Parse BabelStream outputs per node and validate kernel bandwidths vs expected thresholds. - - Args: - out_dict (dict[str, str]): Mapping: node -> the full stdout/stderr of BabelStream runs. - exp_dict (dict): Expected thresholds like: - { - "copy": "", "add": "", "mul": "", - "triad": "", "dot": "" - } - - Behavior: - - Uses regex to extract measured GB/s for kernels (Copy, Add, Mul, Triad, Dot). - - For each occurrence (multiple ranks), compares actual vs expected; fails if actual < expected. - - Notes: - - Regex assumes the standard BabelStream output layout: - " " - - Values are interpreted as floats; ensure the configuration provides numeric-like strings. - - """ - for node in out_dict.keys(): - pattern = r"Copy\s+([0-9\.]+)\s+[0-9\.]+\s+[0-9\.]+\s+" - copy_list = re.findall( pattern, out_dict[node] ) - for copy_val in copy_list: - if float(copy_val) < float(exp_dict['copy']): - fail_test(f"Copy value {copy_val} less than expected {exp_dict['copy']} on node {node}") - pattern = r"Add\s+([0-9\.]+)\s+[0-9\.]+\s+[0-9\.]+\s+" - add_list = re.findall( pattern, out_dict[node] ) - for add_val in add_list: - if float(add_val) < float(exp_dict['add']): - fail_test(f"Add value {add_val} less than expected {exp_dict['add']} on node {node}") - pattern = r"Mul\s+([0-9\.]+)\s+[0-9\.]+\s+[0-9\.]+\s+" - mul_list = re.findall( pattern, out_dict[node] ) - for mul_val in mul_list: - if float(mul_val) < float(exp_dict['mul']): - fail_test(f"Mul value {mul_val} less than expected {exp_dict['mul']} on node {node}") - pattern = r"Triad\s+([0-9\.]+)\s+[0-9\.]+\s+[0-9\.]+\s+" - triad_list = re.findall( pattern, out_dict[node] ) - for triad_val in triad_list: - if float(triad_val) < float(exp_dict['triad']): - fail_test(f"Triad value {triad_val} less than expected {exp_dict['triad']} on node 
{node}") - pattern = r"Dot\s+([0-9\.]+)\s+[0-9\.]+\s+[0-9\.]+\s+" - dot_list = re.findall( pattern, out_dict[node] ) - for dot_val in dot_list: - if float(dot_val) < float(exp_dict['dot']): - fail_test(f"Triad value {dot_val} less than expected {exp_dict['dot']} on node {node}") - - - - -@pytest.fixture(scope="module") -def shdl(cluster_dict): - """ - Build and return a parallel SSH handle (Pssh) for the head node only. - - Args: - cluster_dict (dict): Cluster metadata fixture (see phdl docstring). - - Returns: - Pssh: Handle configured for the first node (head node) in node_dict. - - Notes: - - Useful when commands should be executed only from a designated head node. - - Module scope ensures a single connection context for the duration of the module. - - nhdl_dict is currently unused; it can be removed unless used elsewhere. - """ - nhdl_dict = {} - node_list = list(cluster_dict['node_dict'].keys()) - head_node = node_list[0] - shdl = Pssh( log, [head_node], user=cluster_dict['username'], pkey=cluster_dict['priv_key_file'] ) - return shdl - - - - -@pytest.fixture(scope="module") -def phdl(cluster_dict): - """ - Create a parallel SSH handle (Pssh) for executing commands across all cluster nodes. - - Args: - cluster_dict (dict): Cluster metadata containing at least: - - node_dict: mapping of node name/IP -> details - - username: SSH username for nodes - - priv_key_file: path to SSH private key - - Returns: - Pssh: A handle that runs commands in parallel and returns a dict of node -> output. - - """ - print(cluster_dict) - node_list = list(cluster_dict['node_dict'].keys()) - phdl = Pssh( log, node_list, user=cluster_dict['username'], pkey=cluster_dict['priv_key_file'] ) - return phdl - - - - -def test_create_wrapper_script( phdl, shdl, config_dict ): - """ - Create a wrapper script to run hip-stream with device bound to MPI rank. 
- - Script content: - #!/bin/bash - /hip-stream --device $OMPI_COMM_WORLD_RANK -n 50 -s 268435456 - - Steps: - 1) Create wrapper.sh at the configured path. - 2) Verify the file exists. - 3) chmod +x wrapper.sh - 4) Update test result. - - Args: - hdl: Single-node SSH handler used to write files and update permissions on the head node. - config_dict (dict): Contains 'path' for the wrapper location. - - Notes: - - -n and -s parameters can be tuned via config to adjust runtime and problem size. - """ - - globals.error_list = [] - log.info('Testcase create hip-stream wrapper-script') - path = config_dict['path'] - - if config_dict['nfs_install'] is True: - hdl = shdl - else: - hdl = phdl - - out_dict = hdl.exec(f'cd {path};ls -l') - - print(f"echo -e '#!/bin/bash\n{path}/hip-stream --device $OMPI_COMM_WORLD_RANK -n 50 -s 268435456' > {path}/wrapper.sh") - out_dict = hdl.exec(f"echo -e '#!/bin/bash\\n{path}/hip-stream --device $OMPI_COMM_WORLD_RANK -n 50 -s 268435456' > {path}/wrapper.sh" ) - for node in out_dict.keys(): - print(out_dict[node]) - - time.sleep(1) - - out_dict = hdl.exec(f'cat {path}/wrapper.sh') - out_dict = phdl.exec(f'ls -l {path}/wrapper.sh') - for node in out_dict.keys(): - if re.search('No such file', out_dict[node], re.I ): - fail_test(f'Creation of wrapper script failed, file not found or content missing on node {node}' ) - out_dict = hdl.exec(f'chmod 755 {path}/wrapper.sh') - update_test_result() - - - - -def test_run_babelstream(phdl, config_dict, ): - """ - Run BabelStream across 8 MPI ranks (GPUs) and validate output/error patterns and performance presence. - - Args: - phdl: Parallel SSH handle to execute commands on nodes. - config_dict (dict): BabelStream configuration with: - - 'path': Directory containing wrapper.sh (created earlier). - - 'results': Expected performance thresholds per kernel for post-parse validation. - - Behavior: - - Resets global error list and logs the start. 
- - Changes to the configured path and launches the wrapper over 8 ranks using mpiexec. - - Scans outputs per node for generic failure patterns (fail|error|fatal|core|crash). - - Ensures expected performance lines (e.g., 'Triad') are present to confirm proper run. - - Invokes parse_babelstream_results to compare measured bandwidths against thresholds. - - Finalizes test status with update_test_result. - - Assumptions: - - wrapper.sh exists, is executable, and binds device selection to MPI rank. - - parse_babelstream_results and update_test_result utilities are available. - - Timeout (120s) is sufficient for your platform/workload; adjust as needed. - """ - globals.error_list = [] - log.info('Testcase Run babelstream on all 8 GPUs') - path = config_dict['path'] - exp_dict = config_dict['results'] - out_dict = phdl.exec(f'cd {path};mpiexec --allow-run-as-root -n 8 ./wrapper.sh', timeout=(60*2)) - for node in out_dict.keys(): - if re.search( 'fail|error|fatal|core|crash', out_dict[node], re.I ): - fail_test(f'Failure error patterns seen in babelstream test on node {node}') - if not re.search( 'Triad', out_dict[node], re.I ): - fail_test(f'Expected performance number outputs not printed in babelstream out on node {node} - Test Failed') - parse_babelstream_results( out_dict, exp_dict ) - update_test_result() diff --git a/cvs/tests/health/install/install_babelstream.py b/cvs/tests/health/install/install_babelstream.py deleted file mode 100644 index 7ad92ce..0000000 --- a/cvs/tests/health/install/install_babelstream.py +++ /dev/null @@ -1,246 +0,0 @@ -''' -Copyright 2025 Advanced Micro Devices, Inc. -All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply publication or any waiver of confidentiality. -The year included in the foregoing notice is the year of creation of the work. -All code contained here is Property of Advanced Micro Devices, Inc. 
-''' - -import pytest - -import re -import sys -import os -import sys -import time -import json -import logging -import time - - -from cvs.lib.parallel_ssh_lib import * -from cvs.lib.utils_lib import * - -from cvs.lib import globals - -log = globals.log - - -# Importing additional cmd line args to script .. -@pytest.fixture(scope="module") -def cluster_file(pytestconfig): - """ - Retrieve the --cluster_file CLI option provided to pytest. - - Args: - pytestconfig: Built-in pytest fixture exposing command-line options. - - Returns: - str: Path to the cluster configuration JSON file. - """ - return pytestconfig.getoption("cluster_file") - - -@pytest.fixture(scope="module") -def config_file(pytestconfig): - """ - Retrieve the --config_file CLI option provided to pytest. - - Args: - pytestconfig: Built-in pytest fixture exposing command-line options. - - Returns: - str: Path to the test configuration JSON file. - - Notes: - - Ensure your pytest invocation includes: --config_file=/path/to/config.json - - Module scope ensures this runs once per module to avoid repeated lookups. - """ - return pytestconfig.getoption("config_file") - - -# Importing the cluster and cofig files to script to access node, switch, test config params -@pytest.fixture(scope="module") -def cluster_dict(cluster_file): - """ - Load and return the entire cluster configuration from JSON. - - Args: - cluster_file (str): Path to the cluster JSON file. - - Returns: - dict: Parsed cluster configuration (nodes, credentials, etc.). - """ - with open(cluster_file) as json_file: - cluster_dict = json.load(json_file) - - # Resolve path placeholders like {user-id} in cluster config - cluster_dict = resolve_cluster_config_placeholders(cluster_dict) - - log.info(cluster_dict) - return cluster_dict - -@pytest.fixture(scope="module") -def config_dict(config_file, cluster_dict): - """ - Load and return the 'babelstream' subsection from the test configuration JSON. 
- - Args: - config_file (str): Path to the test configuration JSON. - - Returns: - dict: The 'babelstream' configuration block, expected to include: - - path: location where hip-stream (BabelStream HIP binary) will live - - git_install_path: directory to clone and build BabelStream - - git_url: BabelStream repository URL - - results: expected performance thresholds for kernels (copy/add/mul/triad/dot) - """ - with open(config_file) as json_file: - config_dict_t = json.load(json_file) - config_dict = config_dict_t['babelstream'] - - # Resolve path placeholders like {user-id}, {home-mount-dir}, etc. - config_dict = resolve_test_config_placeholders(config_dict, cluster_dict) - - log.info(config_dict) - return config_dict - - - - - -@pytest.fixture(scope="module") -def shdl(cluster_dict): - """ - Build and return a parallel SSH handle (Pssh) for the head node only. - - Args: - cluster_dict (dict): Cluster metadata fixture (see phdl docstring). - - Returns: - Pssh: Handle configured for the first node (head node) in node_dict. - - Notes: - - Useful when commands should be executed only from a designated head node. - - Module scope ensures a single connection context for the duration of the module. - - nhdl_dict is currently unused; it can be removed unless used elsewhere. - """ - nhdl_dict = {} - node_list = list(cluster_dict['node_dict'].keys()) - head_node = node_list[0] - shdl = Pssh( log, [head_node], user=cluster_dict['username'], pkey=cluster_dict['priv_key_file'] ) - return shdl - - - -@pytest.fixture(scope="module") -def phdl(cluster_dict): - """ - Create a parallel SSH handle (Pssh) for executing commands across all cluster nodes. - - Args: - cluster_dict (dict): Cluster metadata containing at least: - - node_dict: mapping of node name/IP -> details - - username: SSH username for nodes - - priv_key_file: path to SSH private key - - Returns: - Pssh: A handle that runs commands in parallel and returns a dict of node -> output. 
- - """ - print(cluster_dict) - node_list = list(cluster_dict['node_dict'].keys()) - phdl = Pssh( log, node_list, user=cluster_dict['username'], pkey=cluster_dict['priv_key_file'] ) - return phdl - - - - - - -@pytest.mark.dependency(name="init") -def test_install_babelstream( phdl, shdl, config_dict ): - """ - Install BabelStream (HIP model) across nodes if not already present, and verify the hip-stream binary. - - Steps: - 1) Check if the designated path exists on the head node. - 2) If absent, on all nodes: - - git clone the repository, - - configure with cmake (MODEL=hip, compiler=hipcc), - - build the project. - 3) Verify 'hip-stream' is present on all nodes. - 4) Export the build path into PATH for the current shell context. - 5) Update the test result status. - - Args: - hdl: Single-node SSH handler for quick checks on head node (NFS/shared path). - phdl: Parallel SSH handler for running commands on all nodes. - config_dict (dict): Contains: - - path: where hip-stream will reside - - git_install_path: directory to clone/build BabelStream - - git_url: repo URL to clone from - - """ - globals.error_list = [] - log.info('Testcase install babelstream') - path = config_dict['path'] - git_install_path = config_dict['git_install_path'] - git_url = config_dict['git_url'] - print(git_install_path) - if config_dict['nfs_install'] is True: - hdl = shdl - else: - hdl = phdl - - out_dict = shdl.exec( f'ls -l {path}') - for node in out_dict.keys(): - output = out_dict[node] - if re.search( 'No such file', output, re.I ): - out_dict = hdl.exec(f'cd {git_install_path};git clone {git_url};cd') - out_dict = hdl.exec(f'cd {git_install_path}/BabelStream;cmake -Bbuild -H. 
-DMODEL=hip -DCMAKE_CXX_COMPILER=hipcc') - out_dict = hdl.exec(f'cd {git_install_path}/BabelStream;cmake --build build') - out_dict = phdl.exec(f'ls -l {path}') - for node in out_dict.keys(): - if not re.search('hip-stream', out_dict[node], re.I ): - fail_test(f'Installation of BabelStream failed, hip-stream file not found on node {node}' ) - phdl.exec(f'export PATH={git_install_path}/BabelStream/build:$PATH') - update_test_result() - - - - -def test_install_open_mpi(phdl, config_dict, ): - """ - Install Open MPI across all nodes and verify that mpiexec is available. - - Args: - phdl: Parallel SSH handle capable of executing commands on all nodes. Expected to - return a dict mapping node -> command output for each exec call. - config_dict (dict): Test configuration. Includes: - - 'path': Base path used elsewhere; not directly used here but kept for consistency. - - Behavior: - - Resets the global error list for a clean test run. - - Updates package indexes (apt) and installs Open MPI components. - - Verifies installation by checking for 'mpiexec' on each node. - - Records failures via fail_test and finalizes the test status via update_test_result. - - Assumptions: - - Target systems are Debian/Ubuntu-based (uses apt/apt-get). Adapt for RHEL/CentOS if needed. - - phdl.exec supports a 'timeout' parameter and returns a dict of node outputs. - - fail_test and update_test_result are available in the test environment. 
- """ - globals.error_list = [] - log.info('Testcase install openmpi') - if config_dict['nfs_install'] is True: - hdl = shdl - else: - hdl = phdl - path = config_dict['path'] - out_dict = phdl.exec(f'sudo apt update -y', timeout=200) - out_dict = phdl.exec(f'sudo apt-get install -y openmpi-bin openmpi-common libopenmpi-dev', timeout=200) - out_dict = phdl.exec('which mpiexec') - for node in out_dict.keys(): - if not re.search( 'mpiexec', out_dict[node] ): - fail_test(f'Open MPI installation failed on node {node}') - update_test_result() diff --git a/cvs/tests/health/install/install_rocblas.py b/cvs/tests/health/install/install_rocblas.py deleted file mode 100644 index ec7f3d0..0000000 --- a/cvs/tests/health/install/install_rocblas.py +++ /dev/null @@ -1,222 +0,0 @@ -''' -Copyright 2025 Advanced Micro Devices, Inc. -All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply publication or any waiver of confidentiality. -The year included in the foregoing notice is the year of creation of the work. -All code contained here is Property of Advanced Micro Devices, Inc. -''' - -import pytest - -import re -import sys -import os -import sys -import time -import json -import logging -import time - - -from cvs.lib.parallel_ssh_lib import * -from cvs.lib.utils_lib import * - -from cvs.lib import globals - -log = globals.log - - -# Importing additional cmd line args to script .. -@pytest.fixture(scope="module") -def cluster_file(pytestconfig): - """ - Retrieve the --cluster_file CLI option provided to pytest. - - Args: - pytestconfig: Built-in pytest fixture exposing command-line options. - - Returns: - str: Path to the cluster configuration JSON file. - """ - return pytestconfig.getoption("cluster_file") - - - - -@pytest.fixture(scope="module") -def config_file(pytestconfig): - """ - Retrieve the --config_file CLI option provided to pytest. - - Args: - pytestconfig: Built-in pytest fixture exposing command-line options. 
- - Returns: - str: Path to the test configuration JSON file. - """ - return pytestconfig.getoption("config_file") - - - - -@pytest.fixture(scope="module") -def cluster_dict(cluster_file): - """ - Load and return the entire cluster configuration. - - Args: - cluster_file (str): Path to the cluster configuration JSON (from cluster_file fixture). - - Returns: - dict: Parsed cluster configuration (nodes, credentials, etc). - - """ - with open(cluster_file) as json_file: - cluster_dict = json.load(json_file) - - # Resolve path placeholders like {user-id} in cluster config - cluster_dict = resolve_cluster_config_placeholders(cluster_dict) - - log.info(cluster_dict) - return cluster_dict - - - - -@pytest.fixture(scope="module") -def config_dict(config_file, cluster_dict): - """ - Load and return the rocBLAS test configuration subsection. - - Args: - config_file (str): Path to the test configuration JSON. - - Returns: - dict: The 'rocblas' configuration block, expected to include expected GFLOP thresholds - and other test settings specific to rocBLAS runs. - - """ - with open(config_file) as json_file: - config_dict_t = json.load(json_file) - config_dict = config_dict_t['rocblas'] - - # Resolve path placeholders like {user-id}, {home-mount-dir}, etc. - config_dict = resolve_test_config_placeholders(config_dict, cluster_dict) - - log.info(config_dict) - return config_dict - - - - -@pytest.fixture(scope="module") -def shdl(cluster_dict): - """ - Build and return a parallel SSH handle (Pssh) for the head node only. - - Args: - cluster_dict (dict): Cluster metadata fixture (see phdl docstring). - - Returns: - Pssh: Handle configured for the first node (head node) in node_dict. - - Notes: - - Useful when commands should be executed only from a designated head node. - - Module scope ensures a single connection context for the duration of the module. - - nhdl_dict is currently unused; it can be removed unless used elsewhere. 
- """ - nhdl_dict = {} - node_list = list(cluster_dict['node_dict'].keys()) - head_node = node_list[0] - shdl = Pssh( log, [head_node], user=cluster_dict['username'], pkey=cluster_dict['priv_key_file'] ) - return shdl - - - - - -@pytest.fixture(scope="module") -def phdl(cluster_dict): - """ - Build a parallel SSH handle (Pssh) for the entire cluster. - - Args: - cluster_dict (dict): Cluster config that must include: - - 'node_dict': mapping of node_identifier -> details - - 'username': SSH username - - 'priv_key_file': path to SSH private key - - Returns: - Pssh: A handle that executes commands across all nodes and returns dict[node] -> output. - - """ - nhdl_dict = {} - print(cluster_dict) - node_list = list(cluster_dict['node_dict'].keys()) - phdl = Pssh( log, node_list, user=cluster_dict['username'], pkey=cluster_dict['priv_key_file'] ) - return phdl - - - - - -@pytest.mark.dependency(name="init") -def test_rocblas_install( phdl, shdl, config_dict, ): - """ - Install rocBLAS (clients only) from source and verify rocblas-bench presence. - - Args: - hdl: Single-node SSH handler used for quick checks on the head node. - phdl: Parallel SSH handler to run commands across all nodes. - config_dict (dict): Must include: - - 'path': final location expected to contain rocblas-bench - - 'git_install_path': directory to clone/build rocBLAS - - 'rocm_version': repo tag/branch to checkout (e.g., '6.2') - - 'git_url': repository URL for rocBLAS - - Steps: - - Install build prerequisites via apt. - - Init git environment and clone rocBLAS into git_install_path. - - Checkout rocm-{rocm_version}. - - Run install.sh --clients-only --library-path /opt/rocm. - - Verify rocblas-bench exists under config_dict['path'] on all nodes. - - update_test_result() to finalize test status. 
- """ - - globals.error_list = [] - print('Testcase install rocblas') - - if config_dict['nfs_install'] is True: - hdl = shdl - else: - hdl = phdl - - - path = config_dict['path'] - git_install_path = config_dict['git_install_path'] - rocm_version = config_dict['rocm_version'] - hdl.exec(f'sudo rm -rf {git_install_path}/rocBLAS') - - git_url = config_dict['git_url'] - out_dict = hdl.exec('sudo apt update -y', timeout=200) - out_dict = hdl.exec('sudo apt install -y libgtest-dev', timeout=200) - out_dict = hdl.exec('sudo apt install -y cmake', timeout=200) - out_dict = hdl.exec('sudo apt install -y gfortran', timeout=200) - out_dict = hdl.exec('sudo apt install -y hipblaslt-dev', timeout=200) - - time.sleep(2) - #out_dict = phdl.exec('git init') - out_dict = hdl.exec(f'cd {git_install_path};git clone {git_url}', timeout=100 ) - time.sleep(2) - - out_dict = hdl.exec(f'cd {git_install_path}/rocBLAS;git checkout rocm-{rocm_version}', timeout=60) - - time.sleep(2) - #out_dict = hdl.exec(f'cd {git_install_path}/rocBLAS;./install.sh --clients-only --library-path /opt/rocm-{rocm_version}', timeout=700 ) - out_dict = phdl.exec(f'cd {git_install_path}/rocBLAS;sudo ./install.sh -dc --clients-only --library-path /opt/rocm-{rocm_version}', timeout=700 ) - out_dict = phdl.exec(f'ls -l {path}') - for node in out_dict.keys(): - if not re.search('rocblas-bench', out_dict[node], re.I ): - fail_test(f'rocblas installation failed, rocblas-bench not found on node {node}') - update_test_result() - - diff --git a/cvs/tests/health/install/install_rvs.py b/cvs/tests/health/install/install_rvs.py index 5ed0dc3..d001320 100644 --- a/cvs/tests/health/install/install_rvs.py +++ b/cvs/tests/health/install/install_rvs.py @@ -15,10 +15,17 @@ import json import logging + from cvs.lib.parallel_ssh_lib import * from cvs.lib.utils_lib import * from cvs.lib.verify_lib import * - +from cvs.lib.linux_utils import ( + detect_distro, + install_package, + update_package_cache, + translate_package_name, + 
map_packages +) from cvs.lib import globals log = globals.log @@ -202,13 +209,23 @@ def test_install_rvs(phdl, shdl, config_dict): # If RVS is not found or configs are missing, install it if not rvs_found or not config_found: log.info('RVS not found, attempting to install from artifactory repo first') - # First try to install from artifactory repo package_installed = False - out_dict = hdl.exec('sudo apt-get update -y', timeout=600) - out_dict = hdl.exec('sudo apt-get install -y libpci3 libpci-dev doxygen unzip cmake git libyaml-cpp-dev', timeout=600) - out_dict = hdl.exec('sudo apt-get install -y rocblas rocm-smi-lib', timeout=600) - out_dict = hdl.exec('sudo apt-get install -y rocm-validation-suite', timeout=600) + packages = ['libpci3', 'libpci-dev', 'doxygen', 'unzip', 'cmake', + 'git', 'libyaml-cpp-dev', 'rocblas', 'rocm-smi-lib', 'rocm-validation-suite'] + distro = detect_distro(hdl) + package_list = map_packages(distro, packages) + out_dict = update_package_cache(hdl, distro, timeout=200) + for node in out_dict.keys(): + if re.search('error|failed', out_dict[node], re.I): + log.warning(f'Package update warning on {node}') + + for package in package_list: + out_dict = install_package(hdl, package, distro, timeout=200) + for node in out_dict.keys(): + if re.search('error|failed|unable to locate', out_dict[node], re.I): + fail_test(f'Failed to install {package} on {node}') + for node in out_dict.keys(): if re.search('Unable to locate package|Package.*not found|E: Could not get lock|dpkg: error', out_dict[node], re.I): diff --git a/cvs/tests/health/rocblas_cvs.py b/cvs/tests/health/rocblas_cvs.py deleted file mode 100644 index ca4a03b..0000000 --- a/cvs/tests/health/rocblas_cvs.py +++ /dev/null @@ -1,200 +0,0 @@ -''' -Copyright 2025 Advanced Micro Devices, Inc. -All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply publication or any waiver of confidentiality. 
-The year included in the foregoing notice is the year of creation of the work. -All code contained here is Property of Advanced Micro Devices, Inc. -''' - -import pytest - -import re -import sys -import os -import sys -import time -import json -import logging -import time - -import netmiko -from netmiko import ConnectHandler -from netmiko import redispatch - - -from cvs.lib.parallel_ssh_lib import * -from cvs.lib.utils_lib import * - -from cvs.lib import globals - -log = globals.log - - -# Importing additional cmd line args to script .. -@pytest.fixture(scope="module") -def cluster_file(pytestconfig): - return pytestconfig.getoption("cluster_file") - - -@pytest.fixture(scope="module") -def config_file(pytestconfig): - return pytestconfig.getoption("config_file") - - -# Importing the cluster and cofig files to script to access node, switch, test config params -@pytest.fixture(scope="module") -def cluster_dict(cluster_file): - with open(cluster_file) as json_file: - cluster_dict = json.load(json_file) - - # Resolve path placeholders like {user-id} in cluster config - cluster_dict = resolve_cluster_config_placeholders(cluster_dict) - - log.info(cluster_dict) - return cluster_dict - -@pytest.fixture(scope="module") -def config_dict(config_file, cluster_dict): - with open(config_file) as json_file: - config_dict_t = json.load(json_file) - config_dict = config_dict_t['rocblas'] - - # Resolve path placeholders like {user-id}, {home-mount-dir}, etc. 
- config_dict = resolve_test_config_placeholders(config_dict, cluster_dict) - - log.info(config_dict) - return config_dict - - -def parse_rocblas_fp32( out_dict, exp_dict, ): - for node in out_dict.keys(): - match = re.search( r'N,T,4000,4000,4000,[0-9\.]+,[0-9\.]+,[0-9\.]+,[0-9\.]+,[0-9\.]+,\s+[0-9\.]+,\s+[0-9\.]+,\s+([0-9\.]+),\s+[0-9\.]+', out_dict[node] ) - fp32_gflops = float(match.group(1)) - if float(fp32_gflops) < float(exp_dict['fp32_gflops']): - fail_test(f"Node {node} Actual GFLOPs for rocblas with FP32 {fp32_gflops} is lower than the expected GFLOPs {exp_dict['fp32_gflops']}") - - - -def parse_rocblas_bf16( out_dict, exp_dict ): - for node in out_dict.keys(): - match = re.search( r'N,T,1024,2048,512,[0-9\.]+,[0-9\.]+,[0-9\.]+,[0-9\.]+,[0-9\.]+,[0-9\.]+,[0-9\.]+,[0-9\.]+,[0-9\.]+,[0-9\.]+,[0-9\.]+,\s+[0-9\.]+,\s+[0-9\.]+,\s+([0-9\.]+),\s+[0-9\.]+', out_dict[node] ) - bf16_gflops = float(match.group(1)) - if float(bf16_gflops) < float(exp_dict['bf16_gflops']): - fail_test(f"Node {node} Actual GFLOPs for rocblas with BF16 {bf16_gflops} is lower than the expected GFLOPs {exp_dict['bf16_gflops']}") - - -def parse_rocblas_int8( out_dict, exp_dict ): - for node in out_dict.keys(): - match = re.search( r'N,T,1024,2048,512,[0-9\.]+,[0-9\.]+,[0-9\.]+,[0-9\.]+,[0-9\.]+,[0-9\.]+,[0-9\.]+,[0-9\.]+,[0-9\.]+,[0-9\.]+,[0-9\.]+,\s+[0-9\.]+,\s+[0-9\.]+,\s+([0-9\.]+),\s+[0-9\.]+', out_dict[node] ) - int8_gflops = float(match.group(1)) - if float(int8_gflops) < float(exp_dict['int8_gflops']): - fail_test(f"Node {node} Actual GFLOPs for rocblas with INT8 {int8_gflops} is lower than the expected GFLOPs {exp_dict['int8_gflops']}") - - - - - -# Create connection to DUT, Switches and export for later use .. 
-@pytest.fixture(scope="module") -def phdl(cluster_dict): - nhdl_dict = {} - print(cluster_dict) - node_list = list(cluster_dict['node_dict'].keys()) - phdl = Pssh( log, node_list, user=cluster_dict['username'], pkey=cluster_dict['priv_key_file'] ) - return phdl - - -# Connect to first node to install packages in NFS mounted common directories -@pytest.fixture(scope="module") -def hdl(cluster_dict): - node_list = list(cluster_dict['node_dict'].keys()) - hdl = ConnectHandler( ip=node_list[0], device_type='linux', username=cluster_dict['username'], \ - use_keys=True, key_file=cluster_dict['priv_key_file'] ) - out = hdl.send_command('pwd') - log.info(out) - return hdl - - - -@pytest.mark.dependency(name="init") -def test_rocblas_install( hdl, phdl, config_dict, ): - globals.error_list = [] - log.info('Testcase install rocblas') - log.info(f'{config_dict}') - path = config_dict['path'] - package_path = config_dict['package_path'] - rocm_version = config_dict['rocm_version'] - phdl.exec('sudo rm -rf /home/venksrin/rocBLAS') - time.sleep(5) - git_url = config_dict['git_url'] - out_dict = phdl.exec('sudo apt update -y', timeout=200) - out_dict = phdl.exec('sudo apt install -y libgtest-dev', timeout=200) - out_dict = phdl.exec('sudo apt install -y cmake', timeout=200) - out_dict = phdl.exec('sudo apt install -y gfortran', timeout=200) - time.sleep(3) - log.info(out_dict) - log.info(f'Inputs - {package_path}, {path}, {git_url}') - print('%%%%%%%%%%%%%%%%%') - print('%%%%%%%%%%%%%%%%%') - #hdl.write_channel('git init') - #hdl.write_channel(f'cd {package_path};git clone {git_url};cd\r\r' ) - print(f"cmd = git init") - out_dict = phdl.exec('git init') - time.sleep(5) - print(f'cmd = cd {package_path};git clone {git_url};cd') - out_dict = phdl.exec(f'cd {package_path};git clone {git_url};cd', timeout=100 ) - time.sleep(10) - - #hdl.write_channel(f'cd {package_path}/rocBLAS;git checkout rocm-{rocm_version};cd\r\r') - print(f'cmd = cd {package_path}/rocBLAS;git checkout 
rocm-{rocm_version}') - out_dict = phdl.exec(f'cd {package_path}/rocBLAS;git checkout rocm-{rocm_version}', timeout=60) - time.sleep(10) - #time.sleep(30) - #hdl.write_channel(f'cd {package_path}/rocBLAS;./install.sh --clients-only --library-path /opt/rocm-{rocm_version}\r\r') - print(f'cmd = cd {package_path}/rocBLAS;./install.sh --clients-only --library-path /opt/rocm') - out_dict = phdl.exec(f'cd {package_path}/rocBLAS;./install.sh --clients-only --library-path /opt/rocm', timeout=700 ) - out_dict = phdl.exec(f'ls -l {path}') - for node in out_dict.keys(): - if not re.search('rocblas-bench', out_dict[node], re.I ): - fail_test(f'rocblas installation failed, rocblas-bench not found, aborting !! {node}') - update_test_result() - - - -@pytest.mark.dependency(depends=["init"]) -def test_rocblas_fp32_benchmark(phdl, config_dict, ): - globals.error_list = [] - log.info('Testcase Run rocblas FP32 benchmark') - path = config_dict['path'] - out_dict = phdl.exec(f'sudo {path}/rocblas-bench -f gemm -r s -m 4000 -n 4000 -k 4000 --lda 4000 --ldb 4000 --ldc 4000 --transposeA N --transposeB T', timeout=(60*5)) - print_test_output( log, out_dict ) - parse_rocblas_fp32( out_dict, config_dict['results'] ) - scan_test_results( out_dict ) - update_test_result() - - -@pytest.mark.dependency(depends=["init"]) -def test_rocblas_bf16_benchmark(phdl, config_dict, ): - globals.error_list = [] - log.info('Testcase Run rocblas BF16 benchmark') - path = config_dict['path'] - out_dict = phdl.exec(f'sudo {path}/rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 1024 -n 2048 -k 512 --a_type h --lda 1024 --stride_a 4096 --b_type h --ldb 2048 --stride_b 4096 --c_type s --ldc 1024 --stride_c 2097152 --d_type s --ldd 1024 --stride_d 2097152 --compute_type s --alpha 1.1 --beta 1 --batch_count 5', timeout=(60*5)) - print_test_output( log, out_dict ) - parse_rocblas_bf16( out_dict, config_dict['results'] ) - scan_test_results( out_dict ) - update_test_result() - - - 
-@pytest.mark.dependency(depends=["init"]) -def test_rocblas_int8_benchmark(phdl, config_dict, ): - globals.error_list = [] - log.info('Testcase Run rocblas INT8 benchmark') - path = config_dict['path'] - out_dict = phdl.exec(f'sudo {path}/rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 1024 -n 2048 -k 512 --a_type i8_r --lda 1024 --stride_a 4096 --b_type i8_r --ldb 2048 --stride_b 4096 --c_type i32_r --ldc 1024 --stride_c 2097152 --d_type i32_r --ldd 1024 --stride_d 2097152 --compute_type i32_r --alpha 1.1 --beta 1 --batch_count 5', timeout=(60*5)) - print_test_output( log, out_dict ) - parse_rocblas_int8( out_dict, config_dict['results'] ) - scan_test_results( out_dict ) - update_test_result() - - - diff --git a/cvs/tests/ibperf/install_ibperf_tools.py b/cvs/tests/ibperf/install_ibperf_tools.py index ce094cf..77c1cfc 100644 --- a/cvs/tests/ibperf/install_ibperf_tools.py +++ b/cvs/tests/ibperf/install_ibperf_tools.py @@ -22,7 +22,12 @@ from cvs.lib.parallel_ssh_lib import * from cvs.lib.utils_lib import * from cvs.lib.verify_lib import * - +from cvs.lib.linux_utils import ( + detect_distro, + install_package, + update_package_cache, + map_packages +) from cvs.lib import globals log = globals.log @@ -210,12 +215,23 @@ def test_install_ib_perf(phdl, shdl, config_dict ): if re.search( 'true', config_dict['install_perf_package'], re.I ): shdl.exec( f'mkdir -p {config_dict["install_dir"]}') - phdl.exec( 'sudo apt update -y', timeout=200 ) - phdl.exec( 'sudo apt install -y git build-essential autoconf automake libtool pkg-config', timeout=200 ) - phdl.exec( 'sudo apt install -y libibverbs-dev librdmacm-dev ibverbs-providers rdma-core', timeout=200 ) - phdl.exec( 'sudo apt install -y libibumad-dev' ) - phdl.exec( 'sudo apt install -y libpci-dev' ) - phdl.exec( 'sudo apt install -y numactl' ) + distro = detect_distro(phdl) + out_dict = update_package_cache(phdl, distro, timeout=600) + # Check for errors if needed + for node in out_dict.keys(): + 
if re.search('error|failed', out_dict[node], re.I): + log.warning(f'Package update warning on {node}') + + packages = ['git', 'build-essential', 'autoconf', 'automake', 'libtool', + 'pkg-config', 'libibverbs-dev', 'librdmacm-dev', + 'ibverbs-providers', 'rdma-core', 'libibumad-dev', + 'libpci-dev', 'numactl'] + package_list = map_packages(distro, packages) + for package in package_list: + out_dict = install_package(phdl, package, distro, timeout=200) + for node in out_dict.keys(): + if re.search('error|failed|unable to locate', out_dict[node], re.I): + fail_test(f'Failed to install {package} on {node}') shdl.exec( f'cd {config_dict["install_dir"]}; git clone https://github.com/linux-rdma/perftest' ) shdl.exec( f'cd {config_dict["install_dir"]}/perftest; ./autogen.sh', timeout=100 ) shdl.exec( f'cd {config_dict["install_dir"]}/perftest; ./configure --prefix={config_dict["install_dir"]}/perftest --with-rocm={config_dict["rocm_dir"]} --enable-rocm', timeout=200 )