diff --git a/.github/workflows/integration-gcp.yml b/.github/workflows/integration-gcp.yml new file mode 100644 index 000000000..68dfdee53 --- /dev/null +++ b/.github/workflows/integration-gcp.yml @@ -0,0 +1,129 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Integration Tests - GCP + +on: + workflow_dispatch: {} # allow manual runs for testing + push: + branches: + - main + - feature/oidc-gcp + +permissions: + contents: read + actions: read + id-token: write + +# Serialize runs: every run creates and deletes the same fixed-name GKE cluster, +# so overlapping runs would collide and could delete each other's cluster. +concurrency: + group: integration-gcp + cancel-in-progress: false + +jobs: + copy-images: + runs-on: ubuntu-latest + timeout-minutes: 30 + env: + IMAGE_TAG: main-ddc3fc4 + TARGET_REG: us-docker.pkg.dev + TARGET_REPO: nvsentinel + CRANE_VERSION: "0.20.6" + IDENTITY_PROVIDER: "projects/868575635057/locations/global/workloadIdentityPools/github-pool/providers/github-provider" + SERVICE_ACCOUNT: "github-actions-user@proj-dgxc-nvsentinel.iam.gserviceaccount.com" + PROJECT_ID: "proj-dgxc-nvsentinel" + steps: + # Checkout Repo + - name: Checkout + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + + # Configure GCP AuthN + - name: Get AuthN Token + id: auth + uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093 # v3 + with: + token_format: access_token + workload_identity_provider: ${{ env.IDENTITY_PROVIDER }} + service_account: ${{ env.SERVICE_ACCOUNT }} + + # Copy Images to GCP Artifact Registry + - name: Authenticate to GCP Artifact Registry + uses: 
docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0 + with: + registry: ${{ env.TARGET_REG }} + username: oauth2accesstoken + password: ${{ steps.auth.outputs.access_token }} + + - name: Install crane + shell: bash + env: + CRANE_VERSION: ${{ env.CRANE_VERSION }} + REPO_URL: "https://github.com/google/go-containerregistry" + run: | + set -euo pipefail + URL="$REPO_URL/releases/download/v${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" + curl -fsSL "$URL" | sudo tar -xz -C /usr/local/bin crane # -f: fail on HTTP errors instead of piping an error page into tar + crane version + + - name: Auth crane Source + run: | + echo "${{ secrets.GITHUB_TOKEN }}" | crane auth login ghcr.io --username="$GITHUB_ACTOR" --password-stdin + + - name: Auth crane Target + run: | + echo "${{ steps.auth.outputs.access_token }}" | crane auth login "$TARGET_REG" --username=oauth2accesstoken --password-stdin + + - name: Build Image List + shell: bash + env: + CI_COMMIT_REF_NAME: ${{ env.IMAGE_TAG }} + run: | + scripts/build-image-list.sh + cat versions.txt + + - name: Copy Images to GCP Artifact Registry + shell: bash + env: + TARGET_REG: "${{ env.TARGET_REG }}/${{ env.PROJECT_ID }}/${{ env.TARGET_REPO }}" + run: | + scripts/copy-images.sh "$TARGET_REG" versions.txt + + # Create GKE Cluster + - name: Setup gcloud CLI + uses: google-github-actions/setup-gcloud@aa5489c8933f4cc7a4f7d45035b3b1440c9c10db # v3.0.1 + with: + version: '>= 543.0.0' + + - name: Show gcloud CLI Info + run: | + gcloud info + + - name: Create Cluster + id: create-cluster + shell: bash + env: + TARGET_REG: "${{ env.TARGET_REG }}/${{ env.PROJECT_ID }}/${{ env.TARGET_REPO }}" + run: | + scripts/gcp-cluster-up.sh + + # TODO: Add integration tests here that use the cluster + + - name: Destroy Cluster + if: always() && steps.create-cluster.outcome != 'skipped' # tear down even after failures, unless cluster creation never started + shell: bash + env: + TARGET_REG: "${{ env.TARGET_REG }}/${{ env.PROJECT_ID }}/${{ env.TARGET_REPO }}" + run: | + scripts/gcp-cluster-down.sh diff --git a/.gitignore 
b/.gitignore index 5916b560a..30a0ed0d6 100644 --- a/.gitignore +++ b/.gitignore @@ -434,3 +434,6 @@ health-monitors/syslog-health-monitor/syslog-health-monitor labeler/labeler node-drainer/node-drainer platform-connectors/platform-connectors + +# Ignore generated credentials from google-github-actions/auth +gha-creds-*.json diff --git a/scripts/copy-images.sh b/scripts/copy-images.sh new file mode 100755 index 000000000..b6d5e3cde --- /dev/null +++ b/scripts/copy-images.sh @@ -0,0 +1,184 @@ +#!/usr/bin/env bash +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -euo pipefail + +# Variables: $1 = target registry URI (required), $2 = image list file (defaults to versions.txt) +TARGET_REG_URI="${1:-}" +IMAGE_LIST_FILE="${2:-versions.txt}" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Helper functions: colorized log output (errors go to stderr) and a PATH lookup +log_info() { + echo -e "${BLUE}ℹ️ $*${NC}" +} + +log_success() { + echo -e "${GREEN}✅ $*${NC}" +} + +log_warning() { + echo -e "${YELLOW}⚠️ $*${NC}" +} + +log_error() { + echo -e "${RED}❌ $*${NC}" >&2 +} + +command_exists() { + command -v "$1" >/dev/null 2>&1 +} + +# Validate prerequisites +if ! command_exists crane; then + log_error "crane is not installed. Please install crane to proceed." 
+ exit 1 +fi + +# Validate arguments +if [ -z "$TARGET_REG_URI" ]; then + log_error "Usage: $0 <target-registry-uri> [image-list-file]" + log_error "Example: $0 us-docker.pkg.dev/my-project/my-repo versions.txt" + exit 1 +fi + +if [ ! -f "$IMAGE_LIST_FILE" ]; then + log_error "Image list file not found: $IMAGE_LIST_FILE" + exit 1 +fi + +# Info +log_info "Source image list: $IMAGE_LIST_FILE" +log_info "Target registry URI: $TARGET_REG_URI" +log_info "Reading images from $IMAGE_LIST_FILE..." + +# Count total images (excluding blank lines and comments); grep exits 1 when it counts zero lines, so '|| true' keeps set -e from aborting on an empty list +TOTAL_IMAGES=$(grep -Evc '^[[:space:]]*(#|$)' "$IMAGE_LIST_FILE" || true) +log_info "Found $TOTAL_IMAGES images to copy" + +# Counters +SUCCESS_COUNT=0 +FAILURE_COUNT=0 +SKIPPED_COUNT=0 + +# Copy one image ($1 = source URI, $2 = sequence number): returns 0 when copied and verified, 2 when the target already has the same digest, 1 on failure +copy_image() { + local src_image_uri=$1 + local image_num=$2 + + log_info "[$image_num/$TOTAL_IMAGES] Processing: $src_image_uri" + + # Extract image name and tag from URI + # Format: registry/org/image:tag + local image_base=$(echo "$src_image_uri" | sed -E 's|^(.*/)([^/]+):(.*)$|\2|') + local image_tag=$(echo "$src_image_uri" | sed -E 's|^.*:(.*)$|\1|') + + # Build target URI + local target_uri="$TARGET_REG_URI/$image_base:$image_tag" + + log_info " Source: $src_image_uri" + log_info " Target: $target_uri" + + # Get source digest + local src_digest + if ! src_digest=$(crane digest "$src_image_uri" 2>&1); then + log_error " Failed to get digest for $src_image_uri: $src_digest" + return 1 + fi + + log_info " Source digest: $src_digest" + + # Check if image already exists at target with same digest + local target_digest + if target_digest=$(crane digest "$target_uri" 2>/dev/null); then + if [ "$target_digest" = "$src_digest" ]; then + log_warning " Image already exists at target with same digest, skipping" + return 2 + else + log_info " Image exists but digest differs, will overwrite" + fi + fi + + # Copy image + log_info " Copying image..." + if ! 
crane copy "$src_image_uri" "$target_uri"; then + log_error " Failed to copy image" + return 1 + fi + + # Verify digest after copy + local new_digest + if ! new_digest=$(crane digest "$target_uri" 2>&1); then + log_error " Failed to verify target digest: $new_digest" + return 1 + fi + + if [ "$new_digest" != "$src_digest" ]; then + log_error " Digest mismatch! Source: $src_digest, Target: $new_digest" + return 1 + fi + + log_success " Successfully copied and verified: $target_uri" + return 0 +} + +# Process each image in the list; the '|| [[ -n ... ]]' clause also handles a final line that has no trailing newline +IMAGE_NUM=0 +while IFS= read -r src_image_uri || [[ -n "$src_image_uri" ]]; do + # Skip blank (including whitespace-only) lines and comments + [[ "$src_image_uri" =~ ^[[:space:]]*(#|$) ]] && continue + + IMAGE_NUM=$((IMAGE_NUM + 1)) + + if copy_image "$src_image_uri" "$IMAGE_NUM"; then + SUCCESS_COUNT=$((SUCCESS_COUNT + 1)) + elif [ $? -eq 2 ]; then + SKIPPED_COUNT=$((SKIPPED_COUNT + 1)) + else + FAILURE_COUNT=$((FAILURE_COUNT + 1)) + log_warning "Continuing with next image..." + fi + + echo "" # Blank line between images +done < "$IMAGE_LIST_FILE" + +# Summary +echo "==================================================" +log_info "Image Copy Summary" +echo "==================================================" +log_success "Successfully copied: $SUCCESS_COUNT" +log_warning "Skipped (already exist): $SKIPPED_COUNT" +if [ $FAILURE_COUNT -gt 0 ]; then + log_error "Failed: $FAILURE_COUNT" +else + log_info "Failed: $FAILURE_COUNT" +fi +log_info "Total processed: $TOTAL_IMAGES" +echo "==================================================" + +# Exit with error if any failures +if [ $FAILURE_COUNT -gt 0 ]; then + exit 1 +fi + +exit 0 diff --git a/scripts/gcp-cluster-down.sh b/scripts/gcp-cluster-down.sh new file mode 100755 index 000000000..fe2a36265 --- /dev/null +++ b/scripts/gcp-cluster-down.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -euo pipefail + +DIR="$(dirname "$0")" +. "${DIR}/gcp-cluster-env.sh" + +echo "Deleting GKE cluster: $CLUSTER_NAME in region $REGION" + +# Delete regional GKE cluster (--quiet suppresses the interactive confirmation prompt) +gcloud container clusters delete "$CLUSTER_NAME" \ + --region="$REGION" \ + --quiet + +echo "✅ Cluster deletion complete!" diff --git a/scripts/gcp-cluster-env.sh b/scripts/gcp-cluster-env.sh new file mode 100755 index 000000000..8ee0e4c64 --- /dev/null +++ b/scripts/gcp-cluster-env.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -euo pipefail + +# validation: gcloud must be on PATH (this file is sourced, so the exit below terminates the calling script) +gcloud=$(command -v gcloud) || { echo "gcloud not found" >&2; exit 1; } + +# Check gcloud is authenticated. 
+ACCOUNT=$(gcloud auth list --filter=status:ACTIVE --format="value(account)") +if [[ -z "${ACCOUNT}" ]]; then + echo "Run 'gcloud auth login' to authenticate to GCP first." + exit 1 +fi; + +# Check project is set (backticks in the message are escaped so the shell prints them instead of running the command) +export PROJECT_ID=$(gcloud config list --format 'value(core.project)') +if [[ -z "${PROJECT_ID}" ]]; then + echo "\`gcloud config set project YOUR_PROJECT_ID\` not set." + exit 1 +fi; + +# Check region is set; fall back to us-central1 when it is not +export REGION=$(gcloud config list --format 'value(compute.region)') +if [[ -z "${REGION}" ]]; then + echo "Warning: \`gcloud config set compute/region YOUR_REGION\` not set, using default." + export REGION="us-central1" +fi + +# Config: each value can be overridden from the caller's environment +export CLUSTER_NAME="${CLUSTER_NAME:-validation}" +export CLUSTER_CHANNEL="${CLUSTER_CHANNEL:-regular}" +export SYSTEM_NODE_TYPE="${SYSTEM_NODE_TYPE:-e2-standard-4}" +export SYSTEM_NODE_COUNT="${SYSTEM_NODE_COUNT:-3}" + +# SERVICE_ACCOUNT is optional - set by workflow or provide manually +export SERVICE_ACCOUNT="${SERVICE_ACCOUNT:-}" + +# Print variables +cat << EOF + +Configuration: + PROJECT_ID: ${PROJECT_ID} + ACCOUNT: ${ACCOUNT} + REGION: ${REGION} + CLUSTER_NAME: ${CLUSTER_NAME} + CLUSTER_CHANNEL: ${CLUSTER_CHANNEL} + NODE_TYPE: ${SYSTEM_NODE_TYPE} + NODE_COUNT: ${SYSTEM_NODE_COUNT} + SERVICE_ACCOUNT: ${SERVICE_ACCOUNT:-} + +EOF diff --git a/scripts/gcp-cluster-up.sh b/scripts/gcp-cluster-up.sh new file mode 100755 index 000000000..782fac4a0 --- /dev/null +++ b/scripts/gcp-cluster-up.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -euo pipefail + +DIR="$(dirname "$0")" +. "${DIR}/gcp-cluster-env.sh" + +# Assumptions: +# - gcloud is installed and configured +# - OIDC configured (see https://github.com/mchmarny/oidc-for-gcp-using-terraform) + + +# Ensure the "default" VPC network exists; any failure of the describe call (not only "not found") is treated as missing +echo "Checking for VPC network..." +if ! gcloud compute networks describe default --format="value(name)" >/dev/null 2>&1; then + echo "Creating default VPC network..." + gcloud compute networks create default --subnet-mode=auto + echo "✅ Default network created" +fi + +# Create regional cluster (NOTE(review): with --region, --num-nodes is presumably applied per zone - confirm the intended total node count) +echo "Creating GKE cluster..." 
+gcloud container clusters create "$CLUSTER_NAME" \ + --scopes=cloud-platform \ + --disk-size="200" \ + --disk-type="pd-standard" \ + --enable-image-streaming \ + --enable-ip-alias \ + --enable-shielded-nodes \ + --enable-autorepair \ + --enable-network-policy \ + --image-type="COS_CONTAINERD" \ + --labels=source=github,environment=validation \ + --logging=SYSTEM,WORKLOAD \ + --machine-type="$SYSTEM_NODE_TYPE" \ + --monitoring=SYSTEM \ + --num-nodes="$SYSTEM_NODE_COUNT" \ + --region="$REGION" \ + --release-channel="$CLUSTER_CHANNEL" \ + --workload-metadata="GKE_METADATA" \ + --workload-pool="${PROJECT_ID}.svc.id.goog" \ + --addons=HttpLoadBalancing,HorizontalPodAutoscaling + +# Print the currentMasterVersion field of the newly created cluster +echo "Cluster version:" +gcloud container clusters describe "$CLUSTER_NAME" \ + --region="$REGION" \ + --format="value(currentMasterVersion)" + +# Optional: when SERVICE_ACCOUNT is set, grant roles/iam.workloadIdentityUser on it to the cnrm-system/cnrm-controller-manager Kubernetes service account +if [[ -n "${SERVICE_ACCOUNT}" ]]; then + echo "Creating IAM policy binding for service account..." + gcloud iam service-accounts add-iam-policy-binding "${SERVICE_ACCOUNT}" \ + --member="serviceAccount:${PROJECT_ID}.svc.id.goog[cnrm-system/cnrm-controller-manager]" \ + --role="roles/iam.workloadIdentityUser" +fi + +# Fetch credentials for the new cluster (NOTE(review): presumably writes a kubeconfig entry for later kubectl use - confirm) +echo "Getting cluster credentials..." +gcloud container clusters get-credentials "$CLUSTER_NAME" --region="$REGION" + +echo "✅ Cluster creation complete!"