diff --git a/.github/scripts/cleanups-on-ci-failure.sh b/.github/scripts/cleanups-on-ci-failure.sh
new file mode 100755
index 00000000..3e645cf6
--- /dev/null
+++ b/.github/scripts/cleanups-on-ci-failure.sh
@@ -0,0 +1,324 @@
+#!/bin/bash
+
+# Function to check if a command exists
+command_exists() {
+  command -v "$1" >/dev/null 2>&1
+}
+
+# Find the Git repository root directory
+find_repo_root() {
+  local dir="$PWD"
+  while [[ "$dir" != "/" ]]; do
+    if [[ -d "$dir/.git" ]]; then
+      echo "$dir"
+      return 0
+    fi
+    dir="$(dirname "$dir")"
+  done
+
+  echo "Error: Could not find repository root. Make sure you're running this script from within a Git repository."
+  exit 1
+}
+
+# check if AWS CLI is available
+if ! command_exists aws; then
+  echo "Error: AWS CLI is not installed. Please install it and try again."
+  exit 1
+fi
+
+# check if Git is available (for finding repo root)
+if ! command_exists git; then
+  echo "Error: Git is not installed. Please install it and try again."
+  exit 1
+fi
+
+# check if Terraform is available
+if ! command_exists terraform; then
+  echo "Error: Terraform is not installed. Please install it and try again."
+  exit 1
+fi
+
+# check if kubectl is available
+if ! command_exists kubectl; then
+  echo "Error: kubectl is not installed. It is needed for Kubernetes resource cleanup."
+  exit 1
+fi
+
+# check if jq is available (for JSON parsing)
+if ! command_exists jq; then
+  echo "Warning: jq is not installed. It's recommended for better error handling."
+fi
+
+# AWS environment variables that might interfere with AWS CLI
+AWS_ENV_VARS=(
+  "AWS_ACCESS_KEY_ID"
+  "AWS_SECRET_ACCESS_KEY"
+  "AWS_SESSION_TOKEN"
+  "AWS_SECURITY_TOKEN"
+  "AWS_DEFAULT_REGION"
+)
+
+ENV_VARS_SET=false
+for var in "${AWS_ENV_VARS[@]}"; do
+  if [ -n "${!var}" ]; then
+    echo "Warning: $var is set, which may interfere with AWS SSO login."
+    ENV_VARS_SET=true
+  fi
+done
+
+if [ "$ENV_VARS_SET" = true ]; then
+  echo "Please unset these variables before continuing or they may interfere with the AWS profile."
+  read -p "Do you want to continue anyway? (y/n): " continue_anyway
+  if [ "$continue_anyway" != "y" ] && [ "$continue_anyway" != "Y" ]; then
+    echo "Exiting script. Please unset the environment variables and try again."
+    exit 1
+  fi
+  echo "Continuing with environment variables set..."
+else
+  echo "No interfering AWS environment variables detected."
+fi
+
+# AWS profile self-hosted and us-west-1 (that's where all the test infra is spun up)
+export AWS_PROFILE=self-hosted
+echo "AWS profile set to: $AWS_PROFILE"
+
+# AWS SSO login
+echo "Performing AWS SSO login..."
+aws sso login --sso-session devzero --profile $AWS_PROFILE
+if [ $? -ne 0 ]; then
+  echo "Error: AWS SSO login failed. Please try again."
+  exit 1
+fi
+
+# S3 bucket for terraform state
+S3_BUCKET="dsh-tf-state"
+echo "Using S3 bucket: $S3_BUCKET"
+
+export AWS_REGION=us-west-1
+echo "AWS region set to: $AWS_REGION"
+
+# find self-hosted repository root
+REPO_ROOT=$(find_repo_root)
+echo "Repository root found at: $REPO_ROOT"
+
+# define terraform state paths relative to repo root
+BASE_CLUSTER_PATH="$REPO_ROOT/terraform/examples/aws/base-cluster"
+CLUSTER_EXTENSIONS_PATH="$REPO_ROOT/terraform/examples/aws/cluster-extensions"
+
+# Verify terraform directories exist
+if [ ! -d "$BASE_CLUSTER_PATH" ]; then
+  echo "Error: Base cluster directory '$BASE_CLUSTER_PATH' does not exist."
+  exit 1
+fi
+
+if [ ! -d "$CLUSTER_EXTENSIONS_PATH" ]; then
+  echo "Error: Cluster extensions directory '$CLUSTER_EXTENSIONS_PATH' does not exist."
+  exit 1
+fi
+
+echo "Terraform directories verified."
+
+# check and clean up existing state files
+check_and_clean_tfstate() {
+  local dir="$1"
+  local files_exist=false
+
+  # Check for any terraform state files
+  if ls "$dir"/terraform.tfstate* 2>/dev/null || ls "$dir"/.terraform.lock.hcl 2>/dev/null || [ -d "$dir"/.terraform ]; then
+    echo "Existing Terraform state files found in: $dir"
+    ls -la "$dir"/terraform.tfstate* "$dir"/.terraform.lock.hcl 2>/dev/null
+    if [ -d "$dir/.terraform" ]; then
+      echo "Directory $dir/.terraform exists"
+    fi
+    files_exist=true
+  fi
+
+  if [ "$files_exist" = true ]; then
+    read -p "Do you want to clean up these files before downloading? (y/n): " cleanup
+    if [ "$cleanup" = "y" ] || [ "$cleanup" = "Y" ]; then
+      echo "Removing Terraform state files from $dir"
+      rm -f "$dir"/terraform.tfstate*
+      rm -f "$dir"/.terraform.lock.hcl
+      rm -rf "$dir"/.terraform
+      echo "Cleanup complete."
+    else
+      echo "Warning: Existing files may be overwritten or cause conflicts."
+    fi
+  else
+    echo "No existing Terraform state files found in: $dir"
+  fi
+}
+
+# Function to configure kubectl for a cluster
+configure_kubectl() {
+  local cluster_name="$1"
+
+  echo "Configuring kubectl for cluster: $cluster_name"
+
+  # Update kubeconfig for the cluster
+  aws eks update-kubeconfig --name "$cluster_name" --profile "$AWS_PROFILE" --region "$AWS_REGION"
+  if [ $? -ne 0 ]; then
+    echo "Warning: Failed to update kubeconfig for cluster $cluster_name"
+    return 1
+  fi
+
+  aws eks create-access-entry --cluster-name "$cluster_name" --profile "$AWS_PROFILE" --region "$AWS_REGION" --principal-arn arn:aws:iam::484907513542:role/aws-reserved/sso.amazonaws.com/us-west-2/AWSReservedSSO_AWSAdministratorAccess_cdb3218a34dc613b --type STANDARD
+  if [ $? -ne 0 ]; then
+    echo "Warning: Failed to create access-entry for cluster $cluster_name"
+    return 1
+  fi
+
+  aws eks associate-access-policy --cluster-name "$cluster_name" --profile "$AWS_PROFILE" --region "$AWS_REGION" --principal-arn arn:aws:iam::484907513542:role/aws-reserved/sso.amazonaws.com/us-west-2/AWSReservedSSO_AWSAdministratorAccess_cdb3218a34dc613b --access-scope type=cluster --policy-arn arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy
+  if [ $? -ne 0 ]; then
+    echo "Warning: Failed to associate policy with access-entry for cluster $cluster_name"
+    return 1
+  fi
+
+  # Test kubectl connection
+  echo "Testing kubectl connection..."
+  kubectl get nodes
+  if [ $? -ne 0 ]; then
+    echo "Warning: kubectl cannot connect to cluster $cluster_name"
+    return 1
+  fi
+
+  echo "Successfully configured kubectl for cluster $cluster_name"
+  return 0
+}
+
+# Function to run terraform destroy with explicit variable
+run_terraform_destroy() {
+  local dir="$1"
+  local name="$2"
+  local cluster_name="$3"
+
+  echo "=== Running terraform destroy in $name ==="
+
+  # Change to the directory and run terraform destroy
+  cd "$dir"
+
+  # Initialize terraform if needed
+  echo "Initializing Terraform..."
+  terraform init
+  if [ $? -ne 0 ]; then
+    echo "Error: Terraform init failed in $name."
+    return 1
+  fi
+
+  # Run terraform plan to see what would be destroyed
+  echo "Running terraform plan with explicit cluster_name=\"$cluster_name\"..."
+  terraform plan -destroy -var="cluster_name=$cluster_name"
+
+  # Confirm before destroying
+  read -p "Do you want to proceed with terraform destroy? (y/n): " proceed
+  if [ "$proceed" != "y" ] && [ "$proceed" != "Y" ]; then
+    echo "Skipping destroy operation."
+    return 0
+  fi
+
+  # Run terraform destroy with explicit variable
+  echo "Running terraform destroy with explicit cluster_name=\"$cluster_name\"..."
+  terraform destroy -auto-approve -var="cluster_name=$cluster_name"
+  if [ $? -ne 0 ]; then
+    echo "Error: Terraform destroy failed in $name."
+    return 1
+  fi
+
+  echo "Successfully destroyed resources in $name."
+  return 0
+}
+
+# get list of failed job identifiers from user
+echo "Enter a comma-separated list of job identifiers to process (check env_var 'JOB_IDENTIFIER' in a failed GitHub Actions workflow run; e.g.: 'gh-1-30-al2023-c74f'):"
+read -r dir_list
+
+# convert the comma-separated list to an array so we can loop over it
+IFS=',' read -ra DIRS <<< "$dir_list"
+
+# Track overall success
+CLEANUP_SUCCESS=true
+
+# Process each job identifier
+for dir in "${DIRS[@]}"; do
+  # trim whitespace in case the user's input contains stray spaces
+  dir=$(echo "$dir" | xargs)
+  echo "=========================================================="
+  echo "Processing job identifier: $dir"
+  echo "=========================================================="
+
+  # Process cluster-extensions first (child resources)
+  echo "Working on cluster-extensions..."
+
+  # Check and clean up cluster-extensions directory
+  echo "Checking cluster-extensions directory for existing state files..."
+  check_and_clean_tfstate "$CLUSTER_EXTENSIONS_PATH"
+
+  # pull terraform.tfstate for cluster-extensions
+  echo "Downloading terraform state for $dir/cluster-extensions"
+  aws s3 cp "s3://$S3_BUCKET/$dir/cluster-extensions/terraform.tfstate" "$CLUSTER_EXTENSIONS_PATH/terraform.tfstate"
+  if [ $? -ne 0 ]; then
+    echo "Warning: Failed to download terraform.tfstate for $dir/cluster-extensions"
+    CLEANUP_SUCCESS=false
+  else
+    echo "Successfully downloaded terraform.tfstate for $dir/cluster-extensions"
+
+    # Check and clean up any existing .terraform directory
+    if [ -d "$CLUSTER_EXTENSIONS_PATH/.terraform" ]; then
+      echo "Removing existing .terraform directory to ensure clean initialization..."
+      rm -rf "$CLUSTER_EXTENSIONS_PATH/.terraform"
+    fi
+
+    # configure kubectl for this cluster
+    configure_kubectl "$dir"
+
+    # run terraform destroy for cluster-extensions
+    if ! run_terraform_destroy "$CLUSTER_EXTENSIONS_PATH" "cluster-extensions" "$dir"; then
+      CLEANUP_SUCCESS=false
+    fi
+  fi
+
+  # Now process base-cluster (parent resources)
+  echo "Working on base-cluster..."
+
+  # Check and clean up base-cluster directory
+  echo "Checking base-cluster directory for existing state files..."
+  check_and_clean_tfstate "$BASE_CLUSTER_PATH"
+
+  # pull terraform.tfstate for base-cluster
+  echo "Downloading terraform state for $dir/base-cluster"
+  aws s3 cp "s3://$S3_BUCKET/$dir/base-cluster/terraform.tfstate" "$BASE_CLUSTER_PATH/terraform.tfstate"
+  if [ $? -ne 0 ]; then
+    echo "Warning: Failed to download terraform.tfstate for $dir/base-cluster"
+    CLEANUP_SUCCESS=false
+  else
+    echo "Successfully downloaded terraform.tfstate for $dir/base-cluster"
+
+    # Check and clean up any existing .terraform directory
+    if [ -d "$BASE_CLUSTER_PATH/.terraform" ]; then
+      echo "Removing existing .terraform directory to ensure clean initialization..."
+      rm -rf "$BASE_CLUSTER_PATH/.terraform"
+    fi
+
+    # Run terraform destroy for base-cluster
+    if ! run_terraform_destroy "$BASE_CLUSTER_PATH" "base-cluster" "$dir"; then
+      CLEANUP_SUCCESS=false
+    fi
+  fi
+
+  echo "Completed processing job identifier: $dir"
+  echo "----------------------------------------------------------"
+done
+
+# Return to the original directory
+cd "$REPO_ROOT"
+
+# Final status report
+echo ""
+echo "=========================================================="
+if [ "$CLEANUP_SUCCESS" = true ]; then
+  echo "✅ All cleanup operations completed successfully!"
+else
+  echo "⚠️ Some cleanup operations failed. Please check the logs above for details."
+  echo "You may need to manually inspect and clean up some resources."
+fi
+echo "=========================================================="
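For context, a minimal sketch of how the cleanup script above is expected to be driven locally, assuming a checkout of this repository and an AWS SSO session configured for the `self-hosted` profile; the script is interactive end-to-end, and the job identifiers shown are illustrative only:

    # run from anywhere inside the repository checkout; the script locates the repo root itself
    ./.github/scripts/cleanups-on-ci-failure.sh
    # when prompted, paste the JOB_IDENTIFIER value(s) from the failed workflow run(s), e.g.:
    #   gh-1-30-al2023-c74f, gh-1-31-al2023-c74f
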
diff --git a/.github/workflows/dsh-testing.yaml b/.github/workflows/dsh-testing.yaml
index dd0b041d..04e1bf12 100644
--- a/.github/workflows/dsh-testing.yaml
+++ b/.github/workflows/dsh-testing.yaml
@@ -23,9 +23,20 @@ on:
 jobs:
   setup-and-test:
     runs-on: ubuntu-latest
+
+    strategy:
+      matrix:
+        # TODO (debo/zvonimir)
+        # eks_version: ["1.25", "1.30", "1.31"]
+        eks_version: ["1.30", "1.31"]
+      fail-fast: false
+
+    name: '(base_image: ${{ github.event.inputs.base_image }}) (eks_version: ${{ matrix.eks_version }})'
+
     permissions:
       id-token: write
       contents: read
+
     steps:
       - name: Checkout Repository
         uses: actions/checkout@v4
@@ -37,12 +48,6 @@ jobs:
           aws-region: us-west-1
           role-duration-seconds: 7200
 
-      - name: Clone DevZero Self-Hosted Repository
-        env:
-          GH_PAT: ${{ secrets.GH_TOKEN }}
-        run: |
-          git clone https://$GH_PAT@github.com/devzero-inc/self-hosted.git
-
       - name: Set up Terraform
         uses: hashicorp/setup-terraform@v3
         with:
@@ -56,16 +61,21 @@
       - name : Add SHORT_SHA Environment Variable
         id : short-sha
         shell: bash
-        run : echo "SHORT_SHA=`git rev-parse --short HEAD`" >> $GITHUB_ENV
+        run : |
+          # create a short 4-char SHA (git clamps --short=3 to its 4-character minimum)
+          echo "SHORT_SHA=`git rev-parse --short=3 HEAD`" >> $GITHUB_ENV
 
       - name : Generate unique job identifier
         id : job-identifier
         shell: bash
-        run : echo "JOB_IDENTIFIER=gh-ci-${{ github.event.inputs.base_image }}-${SHORT_SHA}" >> $GITHUB_ENV
+        run : |
+          # replace `.` in the k8s version with `-` so that the same job identifier can be used in various places
+          K8S_VERSION=$(echo ${{ matrix.eks_version }} | sed 's/\./-/')
+          echo "JOB_IDENTIFIER=gh-${K8S_VERSION}-${{ github.event.inputs.base_image }}-${SHORT_SHA}" >> $GITHUB_ENV
 
       - name: Add Backend Override (Base Cluster)
         run: |
-          cd self-hosted/terraform/examples/aws/base-cluster
+          cd terraform/examples/aws/base-cluster
           cat <<EOF > backend_override.tf
           terraform {
             backend "s3" {
@@ -76,9 +86,23 @@
            }
          }
          EOF
 
+      - name: Set EKS version v${{ matrix.eks_version }} (Base Cluster)
+        run: |
+          echo "" >> terraform/examples/aws/base-cluster/terraform.tfvars
+          echo "# Setting eks cluster version" >> terraform/examples/aws/base-cluster/terraform.tfvars
+          echo "cluster_version = \"${{ matrix.eks_version }}\"" >> terraform/examples/aws/base-cluster/terraform.tfvars
+
+          # DevZero currently doesn't publish a base AMI for Kubernetes 1.25, but local testing has indicated that we can use the 1.30 version
+          # AMIs available https://us-west-1.console.aws.amazon.com/ec2/home?region=us-west-1#Images:visibility=public-images;imageName=:devzero;v=3
+          # TODO (debo): this is currently kind of a hack to make sure that the 1.25 test uses the 1.30 node
+          if [ "${{ matrix.eks_version }}" == "1.25" ]; then
+            echo "# Using ami_version 1.30 for EKS 1.25 as a workaround" >> terraform/examples/aws/base-cluster/terraform.tfvars
+            echo "ami_version = \"1.30\"" >> terraform/examples/aws/base-cluster/terraform.tfvars
+          fi
+
       - name: Initialize and Apply Terraform (Base Cluster)
         run: |
-          cd self-hosted/terraform/examples/aws/base-cluster
+          cd terraform/examples/aws/base-cluster
           terraform init
           if [ "${{ github.event.inputs.base_image }}" == "al2023" ]; then
            terraform apply -auto-approve -var="cluster_name=$JOB_IDENTIFIER"
@@ -88,7 +112,7 @@
 
       - name: Update Cluster-Extensions tfvars
         run: |
-          cat <<EOF > self-hosted/terraform/examples/aws/cluster-extensions/terraform.tfvars
+          cat <<EOF > terraform/examples/aws/cluster-extensions/terraform.tfvars
          region = "us-west-1"
          enable_cluster_autoscaler = false
          cluster_name = "$JOB_IDENTIFIER"
@@ -97,7 +121,7 @@
 
       - name: Add Backend Override (Cluster Extensions)
         run: |
-          cd self-hosted/terraform/examples/aws/cluster-extensions
+          cd terraform/examples/aws/cluster-extensions
           cat <<EOF > backend_override.tf
           terraform {
             backend "s3" {
@@ -110,7 +134,7 @@
 
       - name: Initialize and Apply Cluster-Extensions
         run: |
-          cd self-hosted/terraform/examples/aws/cluster-extensions
+          cd terraform/examples/aws/cluster-extensions
           terraform init
           terraform apply -auto-approve
 
@@ -120,7 +144,7 @@
 
       - name: Deploy Control Plane Dependencies (and modify domains)
         run: |
-          cd self-hosted/charts/dz-control-plane-deps
+          cd charts/dz-control-plane-deps
           find values -type f -exec sed -i'.bak' "s/example\.com/$JOB_IDENTIFIER\.ci\.selfzero\.net/g" {} \; && find values -name "*.bak" -delete
           make install
 
@@ -132,14 +156,14 @@
 
          # also setting image.pullsecrets to empty to make sure that each of the deployments dont try to pull their relevant OCI images from this registry
          # backend license key is ... needed
-          yq e '.credentials.enable = false | .backend.licenseKey = strenv(BACKEND_LICENSE_KEY) | .image.pullSecrets = []' -i self-hosted/charts/dz-control-plane/values.yaml
+          yq e '.credentials.enable = false | .backend.licenseKey = strenv(BACKEND_LICENSE_KEY) | .image.pullSecrets = []' -i charts/dz-control-plane/values.yaml
 
       - name: Deploy DevZero Control Plane (after configuring kubernetes to use dockerhub creds, and patching all the deployments to point to the right domain)
         env:
          DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
          DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
        run: |
-          cd self-hosted/charts/dz-control-plane
+          cd charts/dz-control-plane
           make add-docker-creds
           find . -name "values.yaml" -exec sed -i'.bak' "s/example\.com/$JOB_IDENTIFIER\.ci\.selfzero\.net/g" {} \; && find . -name "values.yaml.bak" -delete
           make install
@@ -148,20 +172,20 @@
        run: |
          echo -e "\nPods in namespace devzero:"
          kubectl get pods -n devzero
-          chmod +x self-hosted/.github/scripts/dsh-pod-test.sh
-          self-hosted/.github/scripts/dsh-pod-test.sh
+          chmod +x .github/scripts/dsh-pod-test.sh
+          .github/scripts/dsh-pod-test.sh
          echo -e "\nIngress in namespace devzero:"
          kubectl get ingress -n devzero
 
       - name: Deploy Data Plane Dependencies
         run: |
-          cd self-hosted/charts/dz-data-plane-deps
+          cd charts/dz-data-plane-deps
           find values -type f -exec sed -i'.bak' "s/example\.com/$JOB_IDENTIFIER\.ci\.selfzero\.net/g" {} \; && find values -name "*.bak" -delete
           make install
 
       - name: Deploy DevZero Data Plane
         run: |
-          cd self-hosted/charts/dz-data-plane
+          cd charts/dz-data-plane
           find . -name "values.yaml" -exec sed -i'.bak' "s/example\.com/$JOB_IDENTIFIER\.ci\.selfzero\.net/g" {} \; && find . -name "values.yaml.bak" -delete
           make install
 
@@ -173,37 +197,37 @@
       - name: '[helm] Destroy data-plane'
         if: always()
         run: |
-          cd self-hosted/charts/dz-data-plane
+          cd charts/dz-data-plane
           make delete
 
       - name: '[helm] Destroy data-plane-deps'
         if: always()
         run: |
-          cd self-hosted/charts/dz-data-plane-deps
+          cd charts/dz-data-plane-deps
           make delete
 
       - name: '[helm] Destroy control-plane'
         if: always()
         run: |
-          cd self-hosted/charts/dz-control-plane
+          cd charts/dz-control-plane
           make delete
 
       - name: '[helm] Destroy control-plane-deps'
         if: always()
         run: |
-          cd self-hosted/charts/dz-control-plane-deps
+          cd charts/dz-control-plane-deps
           make delete
 
       - name: '[terraform] Destroy cluster-extensions'
         if: always()
         run: |
-          cd self-hosted/terraform/examples/aws/cluster-extensions
+          cd terraform/examples/aws/cluster-extensions
           terraform destroy -auto-approve
 
       - name: '[terraform] Destroy base-cluster'
         if: always()
         run: |
-          cd self-hosted/terraform/examples/aws/base-cluster
+          cd terraform/examples/aws/base-cluster
           terraform destroy -auto-approve
 
       - name: '[aws-cli] clean up volumes explicitly'
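To make the identifier format concrete, here is a small shell sketch of the same construction the workflow performs above; all values are examples only (in the workflow they come from the matrix, the workflow_dispatch input, and the short-SHA step):

    eks_version="1.30"     # matrix.eks_version
    base_image="al2023"    # github.event.inputs.base_image
    short_sha="c74f"       # git rev-parse --short=3 HEAD (git pads to its 4-character minimum)
    k8s_version=$(echo "$eks_version" | sed 's/\./-/')    # "1.30" -> "1-30"
    echo "gh-${k8s_version}-${base_image}-${short_sha}"   # -> gh-1-30-al2023-c74f

This is also the identifier shape the cleanup script prompts for, which is why the version's dot is replaced before it is embedded in resource names.
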
-name "values.yaml.bak" -delete make install @@ -173,37 +197,37 @@ jobs: - name: '[helm] Destroy data-plane' if: always() run: | - cd self-hosted/charts/dz-data-plane + cd charts/dz-data-plane make delete - name: '[helm] Destroy data-plane-deps' if: always() run: | - cd self-hosted/charts/dz-data-plane-deps + cd charts/dz-data-plane-deps make delete - name: '[helm] Destroy control-plane' if: always() run: | - cd self-hosted/charts/dz-control-plane + cd charts/dz-control-plane make delete - name: '[helm] Destroy control-plane-deps' if: always() run: | - cd self-hosted/charts/dz-control-plane-deps + cd charts/dz-control-plane-deps make delete - name: '[terraform] Destroy cluster-extensions' if: always() run: | - cd self-hosted/terraform/examples/aws/cluster-extensions + cd terraform/examples/aws/cluster-extensions terraform destroy -auto-approve - name: '[terraform] Destroy base-cluster' if: always() run: | - cd self-hosted/terraform/examples/aws/base-cluster + cd terraform/examples/aws/base-cluster terraform destroy -auto-approve - name: '[aws-cli] clean up volumes explicitly' diff --git a/terraform/examples/aws/base-cluster/main.tf b/terraform/examples/aws/base-cluster/main.tf index cec53913..8dbef33a 100644 --- a/terraform/examples/aws/base-cluster/main.tf +++ b/terraform/examples/aws/base-cluster/main.tf @@ -21,6 +21,9 @@ locals { vpc_dns_resolver = cidrhost(local.effective_vpc_cidr_block, 2) # Calculates the +2 host of the CIDR for VPN DNS resolving + # if ami_version is explicitly set, use that since the user wants to be specific about the AMI being used; if not, use the cluster_version + ami_version = length(var.ami_version) > 0 ? var.ami_version : var.cluster_version + } data "aws_availability_zones" "available" {} @@ -255,7 +258,7 @@ data "aws_ami" "devzero_amazon_eks_node_al2023" { filter { name = "name" - values = ["devzero-amazon-eks-node-al2023-x86_64-standard-${var.cluster_version}-*"] + values = ["devzero-amazon-eks-node-al2023-x86_64-standard-${local.ami_version}-*"] } owners = ["710271940431"] # Devzero public AMIs account most_recent = true @@ -266,7 +269,7 @@ data "aws_ami" "devzero_ubuntu_eks_node_22_04" { filter { name = "name" - values = ["devzero-ubuntu-eks-node-22.04-x86_64-standard-${var.cluster_version}-*"] + values = ["devzero-ubuntu-eks-node-22.04-x86_64-standard-${local.ami_version}-*"] } owners = ["484907513542"] most_recent = true diff --git a/terraform/examples/aws/base-cluster/variables.tf b/terraform/examples/aws/base-cluster/variables.tf index c3e7dcb4..eb41a28e 100644 --- a/terraform/examples/aws/base-cluster/variables.tf +++ b/terraform/examples/aws/base-cluster/variables.tf @@ -161,6 +161,12 @@ variable "cluster_version" { default = "1.30" } +variable "ami_version" { + type = string + description = "AMI version to use for nodes in the EKS deployment" + default = "" +} + variable "region" { type = string description = "AWS region"