diff --git a/.github/workflows/generate_matrix_page.yaml b/.github/workflows/generate_matrix_page.yaml index 0b614398..10b12250 100644 --- a/.github/workflows/generate_matrix_page.yaml +++ b/.github/workflows/generate_matrix_page.yaml @@ -32,8 +32,12 @@ jobs: steps: - name: Set dynamic env vars run: | + # GPU Operator dashboard paths echo "DASHBOARD_DATA_FILEPATH=${DASHBOARD_OUTPUT_DIR}/gpu_operator_matrix.json" >> "$GITHUB_ENV" echo "DASHBOARD_HTML_FILEPATH=${DASHBOARD_OUTPUT_DIR}/gpu_operator_matrix.html" >> "$GITHUB_ENV" + # Network Operator dashboard paths + echo "NNO_DASHBOARD_DATA_FILEPATH=${DASHBOARD_OUTPUT_DIR}/network_operator_matrix.json" >> "$GITHUB_ENV" + echo "NNO_DASHBOARD_HTML_FILEPATH=${DASHBOARD_OUTPUT_DIR}/network_operator_matrix.html" >> "$GITHUB_ENV" echo "GH_PAGES_BRANCH=${{ github.event.inputs.gh_pages_branch || 'gh-pages' }}" >> "$GITHUB_ENV" env: DASHBOARD_OUTPUT_DIR: ${{ env.DASHBOARD_OUTPUT_DIR }} @@ -67,27 +71,62 @@ jobs: - name: Install Dependencies run: | pip install -r workflows/gpu_operator_dashboard/requirements.txt + pip install -r workflows/nno_dashboard/requirements.txt - name: Fetch CI Data run: | echo "Processing PR: ${{ steps.determine_pr.outputs.PR_NUMBER }}" + # GPU Operator python -m workflows.gpu_operator_dashboard.fetch_ci_data \ --pr_number "${{ steps.determine_pr.outputs.PR_NUMBER }}" \ --baseline_data_filepath "${{ env.DASHBOARD_DATA_FILEPATH }}" \ --merged_data_filepath "${{ env.DASHBOARD_DATA_FILEPATH }}" + # Network Operator + python -m workflows.nno_dashboard.fetch_ci_data \ + --pr_number "${{ steps.determine_pr.outputs.PR_NUMBER }}" \ + --baseline_data_filepath "${{ env.NNO_DASHBOARD_DATA_FILEPATH }}" \ + --merged_data_filepath "${{ env.NNO_DASHBOARD_DATA_FILEPATH }}" - name: Generate HTML Dashboard (only if JSON changed) run: | cd "${{ env.DASHBOARD_OUTPUT_DIR }}" + + # Check if GPU Operator JSON changed + GPU_CHANGED=false if [[ ${{ github.event_name }} == "pull_request_target" ]] && git diff --exit-code gpu_operator_matrix.json; then - echo "no changes" + echo "GPU Operator: no changes" + else + echo "GPU Operator: changes detected" + GPU_CHANGED=true + fi + + # Check if Network Operator JSON changed + NNO_CHANGED=false + if [[ ${{ github.event_name }} == "pull_request_target" ]] && git diff --exit-code network_operator_matrix.json; then + echo "Network Operator: no changes" else - cd "${{ github.workspace }}" + echo "Network Operator: changes detected" + NNO_CHANGED=true + fi + + cd "${{ github.workspace }}" + + # Generate GPU Operator dashboard if changed + if [ "$GPU_CHANGED" = true ]; then + echo "Generating GPU Operator dashboard..." python -m workflows.gpu_operator_dashboard.generate_ci_dashboard \ --dashboard_data_filepath "${{ env.DASHBOARD_DATA_FILEPATH }}" \ --dashboard_html_filepath "${{ env.DASHBOARD_HTML_FILEPATH }}" fi + + # Generate Network Operator dashboard if changed + if [ "$NNO_CHANGED" = true ]; then + echo "Generating Network Operator dashboard..." 
+ python -m workflows.nno_dashboard.generate_ci_dashboard \ + --dashboard_data_filepath "${{ env.NNO_DASHBOARD_DATA_FILEPATH }}" \ + --dashboard_html_filepath "${{ env.NNO_DASHBOARD_HTML_FILEPATH }}" + fi - name: Deploy HTML to GitHub Pages uses: JamesIves/github-pages-deploy-action@v4 diff --git a/workflows/gpu_operator_dashboard/fetch_ci_data.py b/workflows/gpu_operator_dashboard/fetch_ci_data.py index d2e1f823..bdd6d70a 100644 --- a/workflows/gpu_operator_dashboard/fetch_ci_data.py +++ b/workflows/gpu_operator_dashboard/fetch_ci_data.py @@ -92,15 +92,20 @@ class TestResult: test_status: str prow_job_url: str job_timestamp: str + test_flavor: Optional[str] = None # NNO-specific: test configuration flavor def to_dict(self) -> Dict[str, Any]: - return { + result = { OCP_FULL_VERSION: self.ocp_full_version, GPU_OPERATOR_VERSION: self.gpu_operator_version, "test_status": self.test_status, "prow_job_url": self.prow_job_url, "job_timestamp": self.job_timestamp, } + # Include test_flavor only if it's set (NNO-specific) + if self.test_flavor is not None: + result["test_flavor"] = self.test_flavor + return result def composite_key(self) -> TestResultKey: repo, pr_number, job_name, build_id = extract_build_components(self.prow_job_url) @@ -571,8 +576,15 @@ def merge_ocp_version_results( bundle_result_limit: Optional[int] = None ) -> Dict[str, Any]: """Merge results for a single OCP version.""" - # Initialize the structure - merged_version_data = {"notes": [], "bundle_tests": [], "release_tests": [], "job_history_links": []} + # Initialize the structure with all possible fields + merged_version_data = { + "notes": [], + "bundle_tests": [], + "release_tests": [], + "job_history_links": [], + "test_flavors": {} + } + # Update with existing data (preserves any additional fields) merged_version_data.update(existing_version_data) # Merge bundle tests with limit @@ -599,6 +611,31 @@ def merge_ocp_version_results( # Convert back to sorted list for JSON serialization merged_version_data["job_history_links"] = sorted(list(all_job_history_links)) + # Merge test_flavors (NNO-specific) if present + new_test_flavors = new_version_data.get("test_flavors", {}) + existing_test_flavors = merged_version_data.get("test_flavors", {}) + + # Merge test flavors by combining results for each flavor + for flavor_name, flavor_data in new_test_flavors.items(): + if flavor_name not in existing_test_flavors: + existing_test_flavors[flavor_name] = {"results": [], "job_history_links": set()} + + # Merge results for this flavor (using same logic as release_tests) + new_flavor_results = flavor_data.get("results", []) + existing_flavor_results = existing_test_flavors[flavor_name].get("results", []) + existing_test_flavors[flavor_name]["results"] = merge_release_tests( + new_flavor_results, existing_flavor_results + ) + + # Merge job history links for this flavor + new_flavor_links = flavor_data.get("job_history_links", set()) + existing_flavor_links = existing_test_flavors[flavor_name].get("job_history_links", set()) + all_flavor_links = set(existing_flavor_links if isinstance(existing_flavor_links, (set, list)) else []) + all_flavor_links.update(new_flavor_links) + existing_test_flavors[flavor_name]["job_history_links"] = sorted(list(all_flavor_links)) + + merged_version_data["test_flavors"] = existing_test_flavors + return merged_version_data diff --git a/workflows/nno_dashboard/README.md b/workflows/nno_dashboard/README.md new file mode 100644 index 00000000..bb5b59b9 --- /dev/null +++ b/workflows/nno_dashboard/README.md @@ -0,0 
+1,136 @@ +# NVIDIA Network Operator Dashboard Workflow + +This workflow generates an HTML dashboard showing NVIDIA Network Operator test results across different operator versions and OpenShift versions. It fetches test data from CI systems and creates visual reports for tracking test status over time. + +## Overview + +The dashboard workflow: +- Fetches test results from Google Cloud Storage based on pull request data +- Supports various network operator test patterns including: + - `nvidia-network-operator-legacy-sriov-rdma` + - `nvidia-network-operator-e2e` + - DOCA-based tests (e.g., `doca4-nvidia-network-operator-*`) +- Merges new results with existing baseline data +- Generates HTML dashboard reports +- Automatically deploys updates to GitHub Pages + +## Architecture + +This dashboard **reuses** the GPU Operator Dashboard code and only overrides the operator-specific parts: +- ✅ Imports all core logic from `workflows.gpu_operator_dashboard.fetch_ci_data` +- ✅ Overrides only Network Operator specific: + - Regex patterns to match network operator job names + - Artifact paths (`network-operator-e2e/artifacts/`) + - Version field names (`network_operator_version` vs `gpu_operator_version`) +- ✅ Maintains a clean, DRY codebase with minimal duplication + +This design makes maintenance easier - bug fixes in the core logic automatically benefit both dashboards. + +## Supported Test Patterns + +The dashboard recognizes the following test job patterns: +- `pull-ci-rh-ecosystem-edge-nvidia-ci-main-{version}-nvidia-network-operator-legacy-sriov-rdma` +- `pull-ci-rh-ecosystem-edge-nvidia-ci-main-{version}-nvidia-network-operator-e2e` +- `rehearse-{id}-pull-ci-rh-ecosystem-edge-nvidia-ci-main-doca4-nvidia-network-operator-*` + +Example URL that will be processed: +``` +https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/test-platform-results/pr-logs/pull/openshift_release/67673/rehearse-67673-pull-ci-rh-ecosystem-edge-nvidia-ci-main-doca4-nvidia-network-operator-legacy-sriov-rdma/1961127149603655680/ +``` + +## Usage + +### Prerequisites + +```console +pip install -r workflows/nno_dashboard/requirements.txt +``` + +**Important:** Before running fetch_ci_data.py, create the baseline data file and initialize it with an empty JSON object if it doesn't exist: + +```console +echo '{}' > nno_data.json +``` + +### Fetch CI Data + +```console +# Process a specific PR +python -m workflows.nno_dashboard.fetch_ci_data --pr_number "123" --baseline_data_filepath nno_data.json --merged_data_filepath nno_data.json + +# Process all merged PRs - limited to 100 most recent (default) +python -m workflows.nno_dashboard.fetch_ci_data --pr_number "all" --baseline_data_filepath nno_data.json --merged_data_filepath nno_data.json + +# Process with bundle result limit (keep only last 50 bundle tests per version) +python -m workflows.nno_dashboard.fetch_ci_data --pr_number "all" --baseline_data_filepath nno_data.json --merged_data_filepath nno_data.json --bundle_result_limit 50 +``` + +### Generate Dashboard + +```console +python -m workflows.nno_dashboard.generate_ci_dashboard --dashboard_data_filepath nno_data.json --dashboard_html_filepath nno_dashboard.html +``` + +The dashboard generator also **reuses** the GPU Operator dashboard code: +- Imports all HTML generation logic from `workflows.gpu_operator_dashboard.generate_ci_dashboard` +- Uses Network Operator specific templates (in `templates/` directory) +- Only aliases `NETWORK_OPERATOR_VERSION` as `GPU_OPERATOR_VERSION` for compatibility + +### Running Tests + 
+First, make sure `pytest` is installed. Then, run: + +```console +python -m pytest workflows/nno_dashboard/tests/ -v +``` + +## GitHub Actions Integration + +- **Automatic**: Processes merged pull requests to update the dashboard with new test results and deploys to GitHub Pages +- **Manual**: Can be triggered manually via GitHub Actions workflow dispatch + +## Data Structure + +The fetched data follows this structure: + +```json +{ + "doca4": { + "notes": [], + "bundle_tests": [ + { + "ocp_full_version": "4.16.0", + "network_operator_version": "24.10.0", + "test_status": "SUCCESS", + "prow_job_url": "https://...", + "job_timestamp": "1234567890" + } + ], + "release_tests": [...], + "job_history_links": [ + "https://prow.ci.openshift.org/job-history/gs/test-platform-results/pr-logs/directory/..." + ] + } +} +``` + +## Troubleshooting + +### No data being fetched + +1. Verify the PR number exists and has network operator test runs +2. Check that the job names match the expected patterns (see regex in fetch_ci_data.py line 36-40) +3. Ensure the test artifacts contain the required files: + - `finished.json` + - `network-operator-e2e/artifacts/ocp.version` + - `network-operator-e2e/artifacts/operator.version` + +### Regex pattern not matching + +The regex pattern is designed to match: +- Repository: `rh-ecosystem-edge_nvidia-ci` or `openshift_release` (for rehearse jobs) +- OCP version prefix: Can be `doca4`, `nno1`, or other custom prefixes +- Job suffix: Must contain `nvidia-network-operator` followed by test type + +If your job names don't match, you may need to adjust the `TEST_RESULT_PATH_REGEX` pattern in `fetch_ci_data.py`. + diff --git a/workflows/nno_dashboard/__init__.py b/workflows/nno_dashboard/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/workflows/nno_dashboard/fetch_ci_data.py b/workflows/nno_dashboard/fetch_ci_data.py new file mode 100644 index 00000000..0febde74 --- /dev/null +++ b/workflows/nno_dashboard/fetch_ci_data.py @@ -0,0 +1,531 @@ +#!/usr/bin/env python +""" +NVIDIA Network Operator CI Data Fetcher + +This module extends the GPU Operator CI data fetcher with Network Operator specific patterns. +It overrides only the operator-specific regex patterns and artifact paths while reusing +all the core logic from the GPU operator dashboard. 
+""" +import argparse +import json +import re +from typing import Any, Dict, List, Optional + + +from workflows.gpu_operator_dashboard.fetch_ci_data import ( + STATUS_SUCCESS, + STATUS_FAILURE, + STATUS_ABORTED, + GCS_API_BASE_URL, + GCS_MAX_RESULTS_PER_REQUEST, + http_get_json, + fetch_gcs_file_content, + build_prow_job_url, + TestResultKey, + TestResult, + + + merge_bundle_tests, + merge_release_tests, + merge_ocp_version_results, + merge_and_save_results, + int_or_none, +) +from workflows.common.utils import logger + +OCP_FULL_VERSION = "ocp_full_version" +NETWORK_OPERATOR_VERSION = "network_operator_version" +TEST_RESULT_PATH_REGEX = re.compile( + r"pr-logs/pull/(?P[^/]+)/(?P\d+)/" + r"(?P(?:rehearse-\d+-)?pull-ci-rh-ecosystem-edge-nvidia-ci-main-" + r"(?P[^/]+?)-nvidia-network-operator-[^/]+)/" + r"(?P[^/]+)" +) + +def process_closed_prs(results_by_ocp: Dict[str, Dict[str, List[Dict[str, Any]]]]) -> None: + """Retrieve and store test results for all closed PRs against the main branch.""" + logger.info("Retrieving PR history...") + url = "https://api.github.com/repos/rh-ecosystem-edge/nvidia-ci/pulls" + params = {"state": "closed", "base": "main", + "per_page": "100", "page": "1"} + headers = { + "Accept": "application/vnd.github+json", + "X-GitHub-Api-Version": "2022-11-28" + } + response_data = http_get_json(url, params=params, headers=headers) + for pr in response_data: + pr_number = str(pr["number"]) + logger.info(f"Processing PR #{pr_number}") + process_tests_for_pr(pr_number, results_by_ocp) + +def fetch_filtered_files(pr_number: str, glob_pattern: str) -> List[Dict[str, Any]]: + """Fetch files matching a specific glob pattern for a PR. + + Override: Searches in both rh-ecosystem-edge_nvidia-ci and openshift_release repositories + since rehearse jobs for network operator are often stored in openshift_release. + """ + logger.info(f"Fetching files matching pattern: {glob_pattern}") + + all_items = [] + + repositories = [ + "rh-ecosystem-edge_nvidia-ci", + "openshift_release" + ] + + for repo in repositories: + params = { + "prefix": f"pr-logs/pull/{repo}/{pr_number}/", + "alt": "json", + "matchGlob": glob_pattern, + "maxResults": str(GCS_MAX_RESULTS_PER_REQUEST), + "projection": "noAcl", + } + headers = {"Accept": "application/json"} + + next_page_token = None + + while True: + if next_page_token: + params["pageToken"] = next_page_token + + try: + response_data = http_get_json( + GCS_API_BASE_URL, params=params, headers=headers) + items = response_data.get("items", []) + all_items.extend(items) + + next_page_token = response_data.get("nextPageToken") + if not next_page_token: + break + except Exception as e: + logger.debug(f"PR #{pr_number} not found in {repo} or error occurred: {e}") + break + + logger.info(f"Found {len(all_items)} files matching {glob_pattern}") + return all_items + + +def extract_build_components(path: str) -> tuple[str, str, str, str]: + """Extract build components using Network Operator regex pattern. + + Override: Uses TEST_RESULT_PATH_REGEX defined above for NNO paths. 
+ + Args: + path: File path or URL + + Returns: + Tuple of (repo, pr_number, job_name, build_id) + + Raises: + ValueError: If path doesn't match expected pattern + """ + original_path = path + if '/artifacts/' in path: + path = path.split('/artifacts/')[0] + '/' + + match = TEST_RESULT_PATH_REGEX.search(path) + if not match: + msg = "Network operator path regex mismatch" if "nvidia-network-operator" in original_path else "Unexpected path format" + raise ValueError(msg) + + repo = match.group("repo") + pr_number = match.group("pr_number") + job_name = match.group("job_name") + build_id = match.group("build_id") + + return (repo, pr_number, job_name, build_id) + + +def build_files_lookup( + finished_files: List[Dict[str, Any]], + ocp_version_files: List[Dict[str, Any]], + network_version_files: List[Dict[str, Any]] +) -> tuple[Dict[tuple[str, str, str], Dict[str, Dict[str, Any]]], set[tuple[str, str, str]]]: + """Build a single lookup dictionary mapping build keys to all their related files. + + Override: Uses our extract_build_components with NNO regex. + + Returns a dictionary where each key (pr_number, job_name, build_id) maps to a structure containing + all related files: {finished: file, ocp: file, network: file} + """ + build_files = {} + all_builds = set() + + + all_files_with_type = [] + for file_item in finished_files: + all_files_with_type.append((file_item, 'finished')) + for file_item in ocp_version_files: + all_files_with_type.append((file_item, 'ocp')) + for file_item in network_version_files: + all_files_with_type.append((file_item, 'network')) + + for file_item, file_type in all_files_with_type: + path = file_item.get("name", "") + + try: + repo, pr_number, job_name, build_id = extract_build_components(path) + except ValueError: + continue + + if build_id in ['latest-build.txt', 'latest-build']: + continue + + key = (pr_number, job_name, build_id) + + if key not in build_files: + build_files[key] = {} + + build_files[key][file_type] = file_item + all_builds.add(key) + + return build_files, all_builds + + +def process_single_build( + pr_number_arg: str, + job_name: str, + build_id: str, + ocp_version: str, + network_suffix: str, + build_files: Dict[tuple[str, str, str], Dict[str, Dict[str, Any]]], + dual_builds_info: Optional[Dict[tuple[str, str, str], Dict[str, Dict[str, Any]]]] = None +) -> TestResult: + """Process a single build and return its test result. + + Override: Uses 'network' key instead of 'gpu' for version files. + """ + key = (pr_number_arg, job_name, build_id) + build_file_set = build_files[key] + + finished_file = build_file_set['finished'] + finished_content = fetch_gcs_file_content(finished_file['name']) + finished_data = json.loads(finished_content) + status = finished_data["result"] + timestamp = finished_data["timestamp"] + + if dual_builds_info and key in dual_builds_info: + dual_files = dual_builds_info[key] + if 'nested' in dual_files and 'top_level' in dual_files: + nested_content = fetch_gcs_file_content(dual_files['nested']['name']) + nested_data = json.loads(nested_content) + nested_status = nested_data["result"] + + top_level_content = fetch_gcs_file_content(dual_files['top_level']['name']) + top_level_data = json.loads(top_level_content) + top_level_status = top_level_data["result"] + + if nested_status == STATUS_SUCCESS and top_level_status != STATUS_SUCCESS: + logger.warning( + f"Build {build_id}: Network operator tests SUCCEEDED but overall build has finished with status {top_level_status}." 
+ ) + + job_url = build_prow_job_url(finished_file['name']) + + logger.info(f"Built prow job URL for build {build_id} from path {finished_file['name']}: {job_url}") + + ocp_version_file = build_file_set.get('ocp') + network_version_file = build_file_set.get('network') + + if ocp_version_file and network_version_file: + exact_ocp = fetch_gcs_file_content(ocp_version_file['name']).strip() + exact_network_version = fetch_gcs_file_content( + network_version_file['name']).strip() + logger.info(f"Found exact versions for build {build_id}: OCP={exact_ocp}, Network={exact_network_version}") + result = TestResult(exact_ocp, exact_network_version, + status, job_url, timestamp) + else: + # Use base versions + logger.info(f"No exact versions found for build {build_id}, using base versions") + result = TestResult(ocp_version, network_suffix, + status, job_url, timestamp) + + return result + + +def fetch_pr_files(pr_number: str) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]: + """Fetch all required file types for a PR using targeted filtering. + + Override: Uses network-operator-e2e artifact paths instead of gpu-operator-e2e. + """ + logger.info(f"Fetching files for PR #{pr_number}") + + all_finished_files = fetch_filtered_files(pr_number, "**/finished.json") + ocp_version_files = fetch_filtered_files( + pr_number, "**/ocp.version") + network_version_files = fetch_filtered_files( + pr_number, "**/operator.version") + + return all_finished_files, ocp_version_files, network_version_files + + +def filter_network_finished_files(all_finished_files: List[Dict[str, Any]]) -> tuple[List[Dict[str, Any]], Dict[tuple[str, str, str], Dict[str, Dict[str, Any]]]]: + """Filter Network operator E2E finished.json files, preferring nested when available. + + Override: Checks for nvidia-network-operator instead of nvidia-gpu-operator. 
+ """ + preferred_files = {} + all_build_files = {} + + logger.info(f"Filtering {len(all_finished_files)} finished files for network operator") + + for file_item in all_finished_files: + path = file_item.get("name", "") + + + if not ("nvidia-network-operator" in path and path.endswith('/finished.json')): + continue + + logger.debug(f"Found network operator file: {path[-80:]}") + + + is_nested = '/artifacts/nvidia-network-operator-' in path and '/network-operator-e2e/finished.json' in path + is_top_level = not is_nested and '/artifacts/' not in path + + logger.debug(f" is_nested={is_nested}, is_top_level={is_top_level}") + + if not (is_nested or is_top_level): + logger.debug(f" Skipping - not nested or top-level") + continue + + try: + repo, pr_number, job_name, build_id = extract_build_components(path) + build_key = (pr_number, job_name, build_id) + except ValueError: + continue + + + if build_key not in all_build_files: + all_build_files[build_key] = {} + + if is_nested: + all_build_files[build_key]['nested'] = file_item + else: + all_build_files[build_key]['top_level'] = file_item + + + if build_key not in preferred_files or is_nested: + preferred_files[build_key] = (file_item, is_nested) + + + result = [file_item for file_item, _ in preferred_files.values()] + dual_builds = {k: v for k, v in all_build_files.items() + if 'nested' in v and 'top_level' in v} + + return result, dual_builds + + +def extract_test_flavor_from_job_name(job_name: str) -> str: + + job_lower = job_name.lower() + + # Extract infrastructure type + infrastructure = None + if "doca4" in job_lower and "bare-metal" not in job_lower: + infrastructure = "DOCA4" + elif "bare-metal" in job_lower: + infrastructure = "Bare Metal" + elif "hosted" in job_lower: + infrastructure = "Hosted" + + # Extract RDMA type (most specific first) + rdma_type = None + if "legacy-sriov-rdma" in job_lower or "rdma-legacy-sriov" in job_lower: + rdma_type = "RDMA Legacy SR-IOV" + elif "shared-device-rdma" in job_lower or "rdma-shared-dev" in job_lower: + rdma_type = "RDMA Shared Device" + elif "sriov" in job_lower and "rdma" in job_lower: + rdma_type = "RDMA SR-IOV" + elif "rdma" in job_lower: + rdma_type = "RDMA" + + # Extract test type (if not RDMA) + test_type = None + if not rdma_type: + if "bare-metal-e2e" in job_lower: + test_type = "E2E" + elif "nvidia-network-operator-e2e" in job_lower or "-e2e" in job_lower: + test_type = "E2E" + + # Check for GPU involvement + has_gpu = False + if "gpu" in job_lower or "gpudirect" in job_lower: + has_gpu = True + + # Build the flavor description + parts = [] + + # Add infrastructure + if infrastructure: + parts.append(infrastructure) + + # Add test type or RDMA type (with GPU qualifier if applicable) + if rdma_type: + if has_gpu: + parts.append(f"{rdma_type} with GPU") + else: + parts.append(rdma_type) + elif test_type: + if has_gpu: + parts.append(f"{test_type} with GPU") + else: + parts.append(test_type) + elif has_gpu: + # GPU mentioned but no specific test type + parts.append("with GPU") + + # If nothing was identified, return a generic label + if not parts: + if infrastructure: + return infrastructure + return "Standard" + + return " - ".join(parts) + + +def process_tests_for_pr(pr_number: str, results_by_ocp: Dict[str, Dict[str, Any]]) -> None: + """Retrieve and store test results for all jobs under a single PR. + + Override: Uses network operator specific filtering and naming. 
+ """ + logger.info(f"Fetching test data for PR #{pr_number}") + + + all_finished_files, ocp_version_files, network_version_files = fetch_pr_files(pr_number) + + + finished_files, dual_builds_info = filter_network_finished_files(all_finished_files) + logger.info(f"After filtering, got {len(finished_files)} finished files") + + + build_files, all_builds = build_files_lookup( + finished_files, ocp_version_files, network_version_files) + + logger.info(f"Found {len(all_builds)} builds to process") + + + processed_count = 0 + + for pr_num, job_name, build_id in sorted(all_builds): + + if job_name.startswith("rehearse-"): + repo = "openshift_release" + else: + repo = "rh-ecosystem-edge_nvidia-ci" + + + job_path = f"pr-logs/pull/{repo}/{pr_num}/{job_name}/" + full_path = f"{job_path}{build_id}" + match = TEST_RESULT_PATH_REGEX.search(full_path) + if not match: + logger.warning(f"Could not parse versions from components: {pr_num}, {job_name}, {build_id}") + continue + ocp_version = match.group("ocp_version") + network_suffix = "network-operator" + + logger.info( + f"Processing build {build_id} for {ocp_version} + {network_suffix}") + + result = process_single_build( + pr_num, job_name, build_id, ocp_version, network_suffix, build_files, dual_builds_info) + + # Extract test flavor from job name + test_flavor = extract_test_flavor_from_job_name(job_name) + + + actual_ocp_version = result.ocp_full_version if result.has_exact_versions() else None + if not actual_ocp_version: + continue + # Convert infrastructure types like "doca4", "bare-metal" to actual OCP version if they were used as version + # This handles legacy data where infrastructure type was mistakenly used as OCP version + if actual_ocp_version in ["doca4", "bare-metal", "hosted"]: + logger.warning(f"Found infrastructure type '{actual_ocp_version}' as OCP version in job {job_name}") + # Try to get it from the result's ocp_full_version + if hasattr(result, 'ocp_full_version') and result.ocp_full_version and result.ocp_full_version not in ["doca4", "bare-metal", "hosted"]: + actual_ocp_version = result.ocp_full_version + logger.info(f" -> Resolved to OCP version: {actual_ocp_version}") + else: + # Use the version from job name if it's not an infrastructure type + if ocp_version not in ["doca4", "bare-metal", "hosted"]: + actual_ocp_version = ocp_version + logger.info(f" -> Using OCP version from path: {actual_ocp_version}") + else: + # Skip this result if we can't determine a valid OCP version + logger.warning(f" -> Skipping result - cannot determine valid OCP version for job {job_name}") + continue + + # Validate that we have a proper OCP version (should start with digit and contain dots) + if not actual_ocp_version or not actual_ocp_version[0].isdigit() or '.' 
not in actual_ocp_version: + logger.warning(f"Invalid OCP version '{actual_ocp_version}' for job {job_name}, skipping") + continue + + # Initialize OCP version entry if needed + results_by_ocp.setdefault(actual_ocp_version, { + "bundle_tests": [], + "release_tests": [], + "job_history_links": set(), + "test_flavors": {} + }) + + # Initialize test flavor entry if needed + if test_flavor not in results_by_ocp[actual_ocp_version]["test_flavors"]: + results_by_ocp[actual_ocp_version]["test_flavors"][test_flavor] = { + "results": [], + "job_history_links": set() + } + + + job_history_url = f"https://prow.ci.openshift.org/job-history/gs/test-platform-results/pr-logs/directory/{job_name}" + results_by_ocp[actual_ocp_version]["job_history_links"].add(job_history_url) + results_by_ocp[actual_ocp_version]["test_flavors"][test_flavor]["job_history_links"].add(job_history_url) + + # Store result with test flavor information + result_dict = result.to_dict() + result_dict["test_flavor"] = test_flavor + + + if job_name.endswith('-master'): + results_by_ocp[actual_ocp_version]["bundle_tests"].append(result_dict) + else: + + if result.has_exact_versions() and result.test_status != STATUS_ABORTED: + results_by_ocp[actual_ocp_version]["release_tests"].append(result_dict) + results_by_ocp[actual_ocp_version]["test_flavors"][test_flavor]["results"].append(result_dict) + else: + logger.debug(f"Excluded release test for build {build_id}: status={result.test_status}, exact_versions={result.has_exact_versions()}") + + processed_count += 1 + + logger.info(f"Processed {processed_count} builds for PR #{pr_number}") + + +def main() -> None: + """Main entry point for Network Operator CI data fetcher.""" + parser = argparse.ArgumentParser(description="Network Operator Test Matrix Utility") + parser.add_argument("--pr_number", default="all", + help="PR number to process; use 'all' for full history") + parser.add_argument("--baseline_data_filepath", required=True, + help="Path to the baseline data file") + parser.add_argument("--merged_data_filepath", required=True, + help="Path to the updated (merged) data file") + parser.add_argument("--bundle_result_limit", type=int_or_none, default=None, + help="Number of latest bundle results (jobs ending with '-master') to keep per version. Non-bundle results are kept without limit. Omit or use 'unlimited' for no limit. (default: unlimited)") + args = parser.parse_args() + + + with open(args.baseline_data_filepath, "r") as f: + existing_results: Dict[str, Dict[str, Any]] = json.load(f) + logger.info(f"Loaded baseline data with {len(existing_results)} OCP versions") + + local_results: Dict[str, Dict[str, List[Dict[str, Any]]]] = {} + if args.pr_number.lower() == "all": + process_closed_prs(local_results) + else: + process_tests_for_pr(args.pr_number, local_results) + merge_and_save_results( + local_results, args.merged_data_filepath, existing_results=existing_results, bundle_result_limit=args.bundle_result_limit) + + +if __name__ == "__main__": + main() diff --git a/workflows/nno_dashboard/generate_ci_dashboard.py b/workflows/nno_dashboard/generate_ci_dashboard.py new file mode 100644 index 00000000..4016a27f --- /dev/null +++ b/workflows/nno_dashboard/generate_ci_dashboard.py @@ -0,0 +1,210 @@ +#!/usr/bin/env python +""" +NVIDIA Network Operator CI Dashboard Generator + +This module extends the GPU Operator CI dashboard generator with Network Operator specific imports. 
+It reuses all the core logic from the GPU operator dashboard and only overrides the import +for the version field names and template loading. +""" +import argparse +import json +import os +from datetime import datetime, timezone +from typing import Any, Dict, List + +from workflows.common.utils import logger +from workflows.common.templates import load_template + +# Import helper functions from GPU operator dashboard (reuse everything) +from workflows.gpu_operator_dashboard.generate_ci_dashboard import ( + has_valid_semantic_versions, + build_catalog_table_rows, + build_notes, + build_toc, + build_bundle_info, +) + +# Override: Import network operator specific constants +from workflows.nno_dashboard.fetch_ci_data import ( + OCP_FULL_VERSION, + NETWORK_OPERATOR_VERSION as GPU_OPERATOR_VERSION, # Alias for compatibility + STATUS_ABORTED, +) + +# Note: We're aliasing NETWORK_OPERATOR_VERSION as GPU_OPERATOR_VERSION so that +# all the imported functions from GPU operator dashboard work without modification. +# The functions just reference the field name, they don't care about the actual operator type. + + +def is_valid_ocp_version(version_key: str) -> bool: + """ + Check if a version key is a valid OpenShift version. + + Valid: "4.17.16", "4.16", "4.15.0" + Invalid: "doca4", "bare-metal", "hosted", "Unknown" + """ + # Filter out known infrastructure types + invalid_keys = ["doca4", "bare-metal", "hosted", "unknown"] + if version_key.lower() in invalid_keys: + return False + + # Valid OCP versions start with a digit and contain dots + if not version_key or not version_key[0].isdigit(): + return False + + # Check if it looks like a semantic version (X.Y or X.Y.Z) + parts = version_key.split('.') + if len(parts) < 2: + return False + + try: + # Try to parse first two parts as numbers + int(parts[0]) + int(parts[1]) + return True + except (ValueError, IndexError): + return False + + +def build_test_flavors_sections(ocp_key: str, test_flavors: Dict[str, Dict[str, Any]], templates_dir: str) -> str: + """ + Build HTML sections for each test flavor. + + Args: + ocp_key: OpenShift version key + test_flavors: Dictionary of test flavors with their results + templates_dir: Path to templates directory + + Returns: + HTML string with all test flavor sections + """ + if not test_flavors: + return "" + + test_flavor_template = load_template("test_flavor_section.html", templates_dir) + html_sections = [] + + # Sort test flavors for consistent display + sorted_flavors = sorted(test_flavors.keys()) + + for flavor in sorted_flavors: + flavor_data = test_flavors[flavor] + results = flavor_data.get("results", []) + + if not results: + continue + + # Build table rows for this flavor + flavor_rows_html = build_catalog_table_rows(results) + + # Create a safe ID from the flavor name + flavor_id = flavor.lower().replace(" ", "-").replace("/", "-") + + # Fill in the template + flavor_section = test_flavor_template + flavor_section = flavor_section.replace("{test_flavor}", flavor) + flavor_section = flavor_section.replace("{ocp_key}", ocp_key) + flavor_section = flavor_section.replace("{flavor_id}", flavor_id) + flavor_section = flavor_section.replace("{flavor_table_rows}", flavor_rows_html) + + html_sections.append(flavor_section) + + return "\n".join(html_sections) + + +def generate_test_matrix(ocp_data: Dict[str, Dict[str, Any]]) -> str: + """ + Build the final HTML report by: + 1. Reading the header template, + 2. Generating the table blocks for each OCP version, + 3. 
Reading the footer template and injecting the last-updated time. + + This is a Network Operator specific version that loads templates from the NNO templates directory. + """ + # Load templates from NNO templates directory + templates_dir = os.path.join(os.path.dirname(__file__), "templates") + header_template = load_template("header.html", templates_dir) + html_content = header_template + main_table_template = load_template("main_table.html", templates_dir) + + # Filter to only valid OCP versions (exclude infrastructure types like "doca4", "bare-metal") + valid_ocp_keys = [key for key in ocp_data.keys() if is_valid_ocp_version(key)] + sorted_ocp_keys = sorted(valid_ocp_keys, reverse=True) + + logger.info(f"Valid OCP versions found: {sorted_ocp_keys}") + if len(valid_ocp_keys) != len(ocp_data.keys()): + filtered_keys = set(ocp_data.keys()) - set(valid_ocp_keys) + logger.warning(f"Filtered out non-OCP version keys: {filtered_keys}") + + html_content += build_toc(sorted_ocp_keys) + + for ocp_key in sorted_ocp_keys: + notes = ocp_data[ocp_key].get("notes", []) + bundle_results = ocp_data[ocp_key].get("bundle_tests", []) + release_results = ocp_data[ocp_key].get("release_tests", []) + test_flavors = ocp_data[ocp_key].get("test_flavors", {}) + + # Apply additional filtering for release results (defensive programming) + # Note: release_tests should already be pre-filtered, but we keep this for safety + regular_results = [] + for r in release_results: + # Only include entries with valid semantic versions + # Ignore ABORTED results for regular (non-bundle) results + if has_valid_semantic_versions(r) and r.get("test_status") != STATUS_ABORTED: + regular_results.append(r) + + notes_html = build_notes(notes) + bundle_info_html = build_bundle_info(bundle_results) + + # Build test flavor sections + test_flavors_html = build_test_flavors_sections(ocp_key, test_flavors, templates_dir) + + # If no test flavors, fall back to showing all release results in a single table + if not test_flavors_html and regular_results: + fallback_section = load_template("test_flavor_section.html", templates_dir) + table_rows_html = build_catalog_table_rows(regular_results) + fallback_section = fallback_section.replace("{test_flavor}", "From operator catalog") + fallback_section = fallback_section.replace("{ocp_key}", ocp_key) + fallback_section = fallback_section.replace("{flavor_id}", "regular") + fallback_section = fallback_section.replace("{flavor_table_rows}", table_rows_html) + test_flavors_html = fallback_section + + table_block = main_table_template + table_block = table_block.replace("{ocp_key}", ocp_key) + table_block = table_block.replace("{test_flavors_sections}", test_flavors_html) + table_block = table_block.replace("{bundle_info}", bundle_info_html) + table_block = table_block.replace("{notes}", notes_html) + html_content += table_block + + footer_template = load_template("footer.html", templates_dir) + now_str = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC") + footer_template = footer_template.replace("{LAST_UPDATED}", now_str) + html_content += footer_template + return html_content + + +def main(): + """Main entry point for Network Operator dashboard generator.""" + parser = argparse.ArgumentParser(description="Network Operator Test Matrix Dashboard Generator") + parser.add_argument("--dashboard_html_filepath", required=True, + help="Path to html file for the dashboard") + parser.add_argument("--dashboard_data_filepath", required=True, + help="Path to the file containing the versions for the 
dashboard") + args = parser.parse_args() + + with open(args.dashboard_data_filepath, "r") as f: + ocp_data = json.load(f) + logger.info( + f"Loaded JSON data with keys: {list(ocp_data.keys())} from {args.dashboard_data_filepath}") + + html_content = generate_test_matrix(ocp_data) + + with open(args.dashboard_html_filepath, "w", encoding="utf-8") as f: + f.write(html_content) + logger.info( + f"Network Operator dashboard generated: {args.dashboard_html_filepath}") + + +if __name__ == "__main__": + main() + diff --git a/workflows/nno_dashboard/requirements.txt b/workflows/nno_dashboard/requirements.txt new file mode 100644 index 00000000..bad006c0 --- /dev/null +++ b/workflows/nno_dashboard/requirements.txt @@ -0,0 +1,12 @@ +annotated-types==0.7.0 +certifi==2025.1.31 +charset-normalizer==3.4.1 +idna==3.10 +pydantic==2.11.1 +pydantic_core==2.33.0 +requests==2.32.4 +semver==3.0.4 +typing-inspection==0.4.0 +typing_extensions==4.13.0 +urllib3==2.5.0 + diff --git a/workflows/nno_dashboard/templates/footer.html b/workflows/nno_dashboard/templates/footer.html new file mode 100644 index 00000000..41f3025a --- /dev/null +++ b/workflows/nno_dashboard/templates/footer.html @@ -0,0 +1,6 @@ +
+<div class="footer">
+    Last updated: {LAST_UPDATED}
+</div>
+</body>
+</html>
diff --git a/workflows/nno_dashboard/templates/header.html b/workflows/nno_dashboard/templates/header.html
new file mode 100644
index 00000000..e022d438
--- /dev/null
+++ b/workflows/nno_dashboard/templates/header.html
@@ -0,0 +1,37 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <title>Test Matrix: NVIDIA Network Operator on Red Hat OpenShift</title>
+    <style>
+        /* page styles elided in this excerpt */
+    </style>
+</head>
+<body>
+    <h1>Test Matrix: NVIDIA Network Operator on Red Hat OpenShift</h1>
+
+
diff --git a/workflows/nno_dashboard/templates/main_table.html b/workflows/nno_dashboard/templates/main_table.html
new file mode 100644
index 00000000..66607705
--- /dev/null
+++ b/workflows/nno_dashboard/templates/main_table.html
@@ -0,0 +1,10 @@
+<div class="ocp-version">
+    <section id="ocp-{ocp_key}">
+        <h2>
+            OpenShift {ocp_key}
+        </h2>
+        {notes}
+        {test_flavors_sections}
+        {bundle_info}
+    </section>
+</div>
diff --git a/workflows/nno_dashboard/templates/test_flavor_section.html b/workflows/nno_dashboard/templates/test_flavor_section.html
new file mode 100644
index 00000000..f4fe7c64
--- /dev/null
+++ b/workflows/nno_dashboard/templates/test_flavor_section.html
@@ -0,0 +1,15 @@
+<div class="test-flavor-section" id="{ocp_key}-{flavor_id}">
+    <h3>{test_flavor}</h3>
+    <table>
+        <thead>
+            <tr>
+                <th>OpenShift</th>
+                <th>NVIDIA Network Operator</th>
+            </tr>
+        </thead>
+        <tbody>
+            {flavor_table_rows}
+        </tbody>
+    </table>
+</div>
+
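
A minimal sketch of the flavor labels that `extract_test_flavor_from_job_name` assigns to the job-name patterns documented in the README (assuming it is run from the repository root so the `workflows` package imports resolve; the expected labels are derived from the classification rules in `fetch_ci_data.py` above):

```python
# Illustrative only: check the flavor labels for the documented job-name
# patterns. Run from the repo root so `workflows.*` is importable.
from workflows.nno_dashboard.fetch_ci_data import extract_test_flavor_from_job_name

jobs = [
    "pull-ci-rh-ecosystem-edge-nvidia-ci-main-4.17-nvidia-network-operator-e2e",
    "pull-ci-rh-ecosystem-edge-nvidia-ci-main-4.17-nvidia-network-operator-legacy-sriov-rdma",
    "rehearse-67673-pull-ci-rh-ecosystem-edge-nvidia-ci-main-doca4-nvidia-network-operator-legacy-sriov-rdma",
]
for job in jobs:
    print(f"{job} -> {extract_test_flavor_from_job_name(job)}")

# Expected output, per the rules in fetch_ci_data.py:
#   ...-nvidia-network-operator-e2e                -> E2E
#   ...-nvidia-network-operator-legacy-sriov-rdma  -> RDMA Legacy SR-IOV
#   ...-doca4-...-legacy-sriov-rdma                -> DOCA4 - RDMA Legacy SR-IOV
```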