From c625951e5420dfcd8f2f04456a7a7c138b68d98e Mon Sep 17 00:00:00 2001
From: Debosmit Ray
Date: Mon, 3 Mar 2025 18:08:34 -0800
Subject: [PATCH 1/8] [self-hosted] ci testing for various k8s versions

---
 .github/workflows/dsh-testing.yaml | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/dsh-testing.yaml b/.github/workflows/dsh-testing.yaml
index dd0b041d..c3b1549c 100644
--- a/.github/workflows/dsh-testing.yaml
+++ b/.github/workflows/dsh-testing.yaml
@@ -23,9 +23,18 @@ on:
 jobs:
   setup-and-test:
     runs-on: ubuntu-latest
+
+    strategy:
+      matrix:
+        eks_version: ["1.25", "1.30", "1.31"]
+      fail-fast: false
+
+    name: '(base_image: ${{ github.event.inputs.base_image }}) (eks_version: ${{ matrix.eks_version }})'
+
     permissions:
       id-token: write
       contents: read
+
     steps:
       - name: Checkout Repository
         uses: actions/checkout@v4
@@ -61,7 +70,10 @@ jobs:
       - name : Generate unique job identifier
         id : job-identifier
         shell: bash
-        run : echo "JOB_IDENTIFIER=gh-ci-${{ github.event.inputs.base_image }}-${SHORT_SHA}" >> $GITHUB_ENV
+        run : |
+          # replace `.` in k8s version with `-` so that the same job identifier can be used in various places
+          K8S_VERSION=$(echo ${{ matrix.eks_version }} | sed 's/\./-/')
+          echo "JOB_IDENTIFIER=gh-ci-eks_v${K8S_VERSION}-${{ github.event.inputs.base_image }}-${SHORT_SHA}" >> $GITHUB_ENV
 
       - name: Add Backend Override (Base Cluster)
         run: |
@@ -76,6 +88,12 @@ jobs:
           cd self-hosted/terraform/examples/aws/base-cluster
           cat <<EOF > backend_override.tf
           terraform {
            backend "s3" {
            }
          }
          EOF
 
+      - name: Set EKS version v${{ matrix.eks_version }} (Base Cluster)
+        run: |
+          echo "" >> self-hosted/terraform/examples/aws/base-cluster/terraform.tfvars
+          echo "# Setting eks cluster version" >> self-hosted/terraform/examples/aws/base-cluster/terraform.tfvars
+          echo "cluster_version = \"${{ matrix.eks_version }}\"" >> self-hosted/terraform/examples/aws/base-cluster/terraform.tfvars
+
       - name: Initialize and Apply Terraform (Base Cluster)
         run: |
           cd self-hosted/terraform/examples/aws/base-cluster
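Note: the matrix above fans one workflow_dispatch run out into one job per EKS version, and the job identifier derived in the run step becomes the name of the AWS resources for that leg, so it must stay free of dots. A quick local preview of the identifiers this step produces (BASE_IMAGE is a hypothetical stand-in for the workflow input; only git and sed are assumed):

    # preview the JOB_IDENTIFIER each matrix leg will generate
    BASE_IMAGE="al2023"                      # stand-in for github.event.inputs.base_image
    SHORT_SHA=$(git rev-parse --short HEAD)  # the workflow stores this in $GITHUB_ENV
    for EKS_VERSION in "1.25" "1.30" "1.31"; do
        K8S_VERSION=$(echo "$EKS_VERSION" | sed 's/\./-/')
        echo "JOB_IDENTIFIER=gh-ci-eks_v${K8S_VERSION}-${BASE_IMAGE}-${SHORT_SHA}"
    done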
From 57bd08a620ed60f50f72c03d7bb2c850271e4136 Mon Sep 17 00:00:00 2001
From: Debosmit Ray
Date: Tue, 4 Mar 2025 07:46:33 -0800
Subject: [PATCH 2/8] add special casing for ami selection for nodes for eks v1.25 case

---
 .github/workflows/dsh-testing.yaml               | 8 ++++++++
 terraform/examples/aws/base-cluster/main.tf      | 7 +++++--
 terraform/examples/aws/base-cluster/variables.tf | 6 ++++++
 3 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/dsh-testing.yaml b/.github/workflows/dsh-testing.yaml
index c3b1549c..7fca280b 100644
--- a/.github/workflows/dsh-testing.yaml
+++ b/.github/workflows/dsh-testing.yaml
@@ -93,6 +93,14 @@ jobs:
           echo "" >> self-hosted/terraform/examples/aws/base-cluster/terraform.tfvars
           echo "# Setting eks cluster version" >> self-hosted/terraform/examples/aws/base-cluster/terraform.tfvars
           echo "cluster_version = \"${{ matrix.eks_version }}\"" >> self-hosted/terraform/examples/aws/base-cluster/terraform.tfvars
+
+          # DevZero currently doesn't publish a base AMI for Kubernetes 1.25, but local testing has indicated that we can use the 1.30 version
+          # AMIs available https://us-west-1.console.aws.amazon.com/ec2/home?region=us-west-1#Images:visibility=public-images;imageName=:devzero;v=3
+          # TODO (debo): this is currently kind of a hack to make sure that the 1.25 test uses the 1.30 node
+          if [ "${{ matrix.eks_version }}" = "1.25" ]; then
+            echo "# Using ami_version 1.30 for EKS 1.25 as a workaround" >> self-hosted/terraform/examples/aws/base-cluster/terraform.tfvars
+            echo "ami_version = \"1.30\"" >> self-hosted/terraform/examples/aws/base-cluster/terraform.tfvars
+          fi
 
       - name: Initialize and Apply Terraform (Base Cluster)
         run: |
           cd self-hosted/terraform/examples/aws/base-cluster
diff --git a/terraform/examples/aws/base-cluster/main.tf b/terraform/examples/aws/base-cluster/main.tf
index cec53913..8dbef33a 100644
--- a/terraform/examples/aws/base-cluster/main.tf
+++ b/terraform/examples/aws/base-cluster/main.tf
@@ -21,6 +21,9 @@ locals {
 
   vpc_dns_resolver = cidrhost(local.effective_vpc_cidr_block, 2) # Calculates the +2 host of the CIDR for VPN DNS resolving
 
+  # if ami_version is explicitly set, use that since the user wants to be specific about the AMI being used; if not, use the cluster_version
+  ami_version = length(var.ami_version) > 0 ? var.ami_version : var.cluster_version
+
 }
 
 data "aws_availability_zones" "available" {}
@@ -255,7 +258,7 @@ data "aws_ami" "devzero_amazon_eks_node_al2023" {
 
   filter {
     name   = "name"
-    values = ["devzero-amazon-eks-node-al2023-x86_64-standard-${var.cluster_version}-*"]
+    values = ["devzero-amazon-eks-node-al2023-x86_64-standard-${local.ami_version}-*"]
   }
   owners      = ["710271940431"] # Devzero public AMIs account
   most_recent = true
@@ -266,7 +269,7 @@ data "aws_ami" "devzero_ubuntu_eks_node_22_04" {
 
   filter {
     name   = "name"
-    values = ["devzero-ubuntu-eks-node-22.04-x86_64-standard-${var.cluster_version}-*"]
+    values = ["devzero-ubuntu-eks-node-22.04-x86_64-standard-${local.ami_version}-*"]
   }
   owners      = ["484907513542"]
   most_recent = true
diff --git a/terraform/examples/aws/base-cluster/variables.tf b/terraform/examples/aws/base-cluster/variables.tf
index c3e7dcb4..eb41a28e 100644
--- a/terraform/examples/aws/base-cluster/variables.tf
+++ b/terraform/examples/aws/base-cluster/variables.tf
@@ -161,6 +161,12 @@ variable "cluster_version" {
   default = "1.30"
 }
 
+variable "ami_version" {
+  type        = string
+  description = "AMI version to use for nodes in the EKS deployment"
+  default     = ""
+}
+
 variable "region" {
   type        = string
   description = "AWS region"
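Note: the `ami_version` local means the AMI lookup tracks `cluster_version` by default and only diverges when `ami_version` is set explicitly, which is what lets CI pin the 1.30 AMI under the 1.25 cluster. The same precedence rule as a shell sketch (variable values are hypothetical):

    CLUSTER_VERSION="1.25"
    AMI_VERSION=""   # empty string is the variable's default, i.e. "not explicitly set"
    # prefer the explicit AMI version; otherwise fall back to the cluster version
    EFFECTIVE="${AMI_VERSION:-$CLUSTER_VERSION}"
    echo "AMI name filter: devzero-amazon-eks-node-al2023-x86_64-standard-${EFFECTIVE}-*"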
From 9232e4226e121bb7bbfe81cdd0c9dd8b0cf96e3b Mon Sep 17 00:00:00 2001
From: Debosmit Ray
Date: Tue, 4 Mar 2025 07:56:46 -0800
Subject: [PATCH 3/8] shorten the job_identifier

---
 .github/workflows/dsh-testing.yaml | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/dsh-testing.yaml b/.github/workflows/dsh-testing.yaml
index 7fca280b..c521835f 100644
--- a/.github/workflows/dsh-testing.yaml
+++ b/.github/workflows/dsh-testing.yaml
@@ -26,7 +26,7 @@ jobs:
 
     strategy:
       matrix:
-        eks_version: ["1.25", "1.30", "1.31"]
+        eks_version: ["1.25"] #, "1.30", "1.31"]
       fail-fast: false
 
     name: '(base_image: ${{ github.event.inputs.base_image }}) (eks_version: ${{ matrix.eks_version }})'
@@ -65,7 +65,9 @@ jobs:
       - name : Add SHORT_SHA Environment Variable
         id : short-sha
         shell: bash
-        run : echo "SHORT_SHA=`git rev-parse --short HEAD`" >> $GITHUB_ENV
+        run : |
+          # creating a 4-char long SHA
+          echo "SHORT_SHA=`git rev-parse --short=3 HEAD`" >> $GITHUB_ENV
 
       - name : Generate unique job identifier
         id : job-identifier
@@ -73,7 +75,7 @@ jobs:
         run : |
           # replace `.` in k8s version with `-` so that the same job identifier can be used in various places
           K8S_VERSION=$(echo ${{ matrix.eks_version }} | sed 's/\./-/')
-          echo "JOB_IDENTIFIER=gh-ci-eks_v${K8S_VERSION}-${{ github.event.inputs.base_image }}-${SHORT_SHA}" >> $GITHUB_ENV
+          echo "JOB_IDENTIFIER=gh-${K8S_VERSION}-${{ github.event.inputs.base_image }}-${SHORT_SHA}" >> $GITHUB_ENV
 
       - name: Add Backend Override (Base Cluster)
         run: |
@@ -97,7 +99,7 @@ jobs:
           # DevZero currently doesn't publish a base AMI for Kubernetes 1.25, but local testing has indicated that we can use the 1.30 version
           # AMIs available https://us-west-1.console.aws.amazon.com/ec2/home?region=us-west-1#Images:visibility=public-images;imageName=:devzero;v=3
           # TODO (debo): this is currently kind of a hack to make sure that the 1.25 test uses the 1.30 node
-          if [ "${{ matrix.eks_version }}" = "1.25" ]; then
+          if [ "${{ matrix.eks_version }}" == "1.25" ]; then
            echo "# Using ami_version 1.30 for EKS 1.25 as a workaround" >> self-hosted/terraform/examples/aws/base-cluster/terraform.tfvars
            echo "ami_version = \"1.30\"" >> self-hosted/terraform/examples/aws/base-cluster/terraform.tfvars
          fi
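Note: `git rev-parse --short=3` still yields at least four characters, because git clamps abbreviations to its four-hex-digit minimum (and lengthens them further when needed for uniqueness) — hence the "4-char" comment above. A quick check in any repository:

    git rev-parse --short=3 HEAD   # prints >= 4 hex chars, e.g. c74f
    git rev-parse --short HEAD     # default abbreviation, typically 7+ chars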
From a08eef03308a73431fdd5c083abdfba53cf0aea8 Mon Sep 17 00:00:00 2001
From: Debosmit Ray
Date: Tue, 4 Mar 2025 12:24:21 -0800
Subject: [PATCH 4/8] tester

---
 .github/workflows/dsh-testing.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/dsh-testing.yaml b/.github/workflows/dsh-testing.yaml
index c521835f..1ccb0787 100644
--- a/.github/workflows/dsh-testing.yaml
+++ b/.github/workflows/dsh-testing.yaml
@@ -102,8 +102,9 @@ jobs:
           if [ "${{ matrix.eks_version }}" == "1.25" ]; then
             echo "# Using ami_version 1.30 for EKS 1.25 as a workaround" >> self-hosted/terraform/examples/aws/base-cluster/terraform.tfvars
             echo "ami_version = \"1.30\"" >> self-hosted/terraform/examples/aws/base-cluster/terraform.tfvars
+            echo "" >> self-hosted/terraform/examples/aws/base-cluster/terraform.tfvars
           fi
-      
+
       - name: Initialize and Apply Terraform (Base Cluster)
         run: |
           cd self-hosted/terraform/examples/aws/base-cluster
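Note: the extra `echo "" >>` here just guarantees the next append to terraform.tfvars starts on a fresh line. The chain of `echo ... >>` calls could equally be a single `printf` append, which makes every newline explicit; a sketch against the same file:

    TFVARS=self-hosted/terraform/examples/aws/base-cluster/terraform.tfvars
    # one printf instead of three echo appends; each \n is an explicit newline
    printf '\n# Using ami_version 1.30 for EKS 1.25 as a workaround\nami_version = "1.30"\n\n' >> "$TFVARS"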
-name "values.yaml.bak" -delete make install @@ -177,20 +170,20 @@ jobs: run: | echo -e "\nPods in namespace devzero:" kubectl get pods -n devzero - chmod +x self-hosted/.github/scripts/dsh-pod-test.sh - self-hosted/.github/scripts/dsh-pod-test.sh + chmod +x .github/scripts/dsh-pod-test.sh + .github/scripts/dsh-pod-test.sh echo -e "\nIngress in namespace devzero:" kubectl get ingress -n devzero - name: Deploy Data Plane Dependencies run: | - cd self-hosted/charts/dz-data-plane-deps + cd charts/dz-data-plane-deps find values -type f -exec sed -i'.bak' "s/example\.com/$JOB_IDENTIFIER\.ci\.selfzero\.net/g" {} \; && find values -name "*.bak" -delete make install - name: Deploy DevZero Data Plane run: | - cd self-hosted/charts/dz-data-plane + cd charts/dz-data-plane find . -name "values.yaml" -exec sed -i'.bak' "s/example\.com/$JOB_IDENTIFIER\.ci\.selfzero\.net/g" {} \; && find . -name "values.yaml.bak" -delete make install @@ -202,37 +195,37 @@ jobs: - name: '[helm] Destroy data-plane' if: always() run: | - cd self-hosted/charts/dz-data-plane + cd charts/dz-data-plane make delete - name: '[helm] Destroy data-plane-deps' if: always() run: | - cd self-hosted/charts/dz-data-plane-deps + cd charts/dz-data-plane-deps make delete - name: '[helm] Destroy control-plane' if: always() run: | - cd self-hosted/charts/dz-control-plane + cd charts/dz-control-plane make delete - name: '[helm] Destroy control-plane-deps' if: always() run: | - cd self-hosted/charts/dz-control-plane-deps + cd charts/dz-control-plane-deps make delete - name: '[terraform] Destroy cluster-extensions' if: always() run: | - cd self-hosted/terraform/examples/aws/cluster-extensions + cd terraform/examples/aws/cluster-extensions terraform destroy -auto-approve - name: '[terraform] Destroy base-cluster' if: always() run: | - cd self-hosted/terraform/examples/aws/base-cluster + cd terraform/examples/aws/base-cluster terraform destroy -auto-approve - name: '[aws-cli] clean up volumes explicitly' From b21196c6f6b2c718712a6de1147f9bf26f69263d Mon Sep 17 00:00:00 2001 From: Debosmit Ray Date: Tue, 4 Mar 2025 17:57:54 -0800 Subject: [PATCH 6/8] reintroduce all eks versions --- .github/workflows/dsh-testing.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dsh-testing.yaml b/.github/workflows/dsh-testing.yaml index 547360ff..f8eb019a 100644 --- a/.github/workflows/dsh-testing.yaml +++ b/.github/workflows/dsh-testing.yaml @@ -26,7 +26,7 @@ jobs: strategy: matrix: - eks_version: ["1.25"] #, "1.30", "1.31"] + eks_version: ["1.25", "1.30", "1.31"] fail-fast: false name: '(base_image: ${{ github.event.inputs.base_image }}) (eks_version: ${{ matrix.eks_version }})' From c74f860530ff66288712da9cb06b311a802fc76c Mon Sep 17 00:00:00 2001 From: Debosmit Ray Date: Wed, 12 Mar 2025 08:30:10 -0700 Subject: [PATCH 7/8] Update .github/workflows/dsh-testing.yaml --- .github/workflows/dsh-testing.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/dsh-testing.yaml b/.github/workflows/dsh-testing.yaml index f8eb019a..04e1bf12 100644 --- a/.github/workflows/dsh-testing.yaml +++ b/.github/workflows/dsh-testing.yaml @@ -26,7 +26,9 @@ jobs: strategy: matrix: - eks_version: ["1.25", "1.30", "1.31"] + # TODO (debo/zvonimir) + # eks_version: ["1.25", "1.30", "1.31"] + eks_version: ["1.30", "1.31"] fail-fast: false name: '(base_image: ${{ github.event.inputs.base_image }}) (eks_version: ${{ matrix.eks_version }})' From 3d8a39c4977761416c85b900c75232b2d107c50a Mon Sep 17 00:00:00 2001 From: 
From b21196c6f6b2c718712a6de1147f9bf26f69263d Mon Sep 17 00:00:00 2001
From: Debosmit Ray
Date: Tue, 4 Mar 2025 17:57:54 -0800
Subject: [PATCH 6/8] reintroduce all eks versions

---
 .github/workflows/dsh-testing.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/dsh-testing.yaml b/.github/workflows/dsh-testing.yaml
index 547360ff..f8eb019a 100644
--- a/.github/workflows/dsh-testing.yaml
+++ b/.github/workflows/dsh-testing.yaml
@@ -26,7 +26,7 @@ jobs:
 
     strategy:
       matrix:
-        eks_version: ["1.25"] #, "1.30", "1.31"]
+        eks_version: ["1.25", "1.30", "1.31"]
       fail-fast: false
 
     name: '(base_image: ${{ github.event.inputs.base_image }}) (eks_version: ${{ matrix.eks_version }})'

From c74f860530ff66288712da9cb06b311a802fc76c Mon Sep 17 00:00:00 2001
From: Debosmit Ray
Date: Wed, 12 Mar 2025 08:30:10 -0700
Subject: [PATCH 7/8] Update .github/workflows/dsh-testing.yaml

---
 .github/workflows/dsh-testing.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/dsh-testing.yaml b/.github/workflows/dsh-testing.yaml
index f8eb019a..04e1bf12 100644
--- a/.github/workflows/dsh-testing.yaml
+++ b/.github/workflows/dsh-testing.yaml
@@ -26,7 +26,9 @@ jobs:
 
     strategy:
       matrix:
-        eks_version: ["1.25", "1.30", "1.31"]
+        # TODO (debo/zvonimir)
+        # eks_version: ["1.25", "1.30", "1.31"]
+        eks_version: ["1.30", "1.31"]
       fail-fast: false
 
     name: '(base_image: ${{ github.event.inputs.base_image }}) (eks_version: ${{ matrix.eks_version }})'

From 3d8a39c4977761416c85b900c75232b2d107c50a Mon Sep 17 00:00:00 2001
From: Debosmit Ray
Date: Sun, 16 Mar 2025 07:53:10 -0700
Subject: [PATCH 8/8] add helper to clean up resources post-ci-failures

---
 .github/scripts/cleanups-on-ci-failure.sh | 324 ++++++++++++++++++++++
 1 file changed, 324 insertions(+)
 create mode 100755 .github/scripts/cleanups-on-ci-failure.sh

diff --git a/.github/scripts/cleanups-on-ci-failure.sh b/.github/scripts/cleanups-on-ci-failure.sh
new file mode 100755
index 00000000..3e645cf6
--- /dev/null
+++ b/.github/scripts/cleanups-on-ci-failure.sh
@@ -0,0 +1,324 @@
+#!/bin/bash
+
+# Function to check if a command exists
+command_exists() {
+    command -v "$1" >/dev/null 2>&1
+}
+
+# Find the Git repository root directory
+find_repo_root() {
+    local dir="$PWD"
+    while [[ "$dir" != "/" ]]; do
+        if [[ -d "$dir/.git" ]]; then
+            echo "$dir"
+            return 0
+        fi
+        dir="$(dirname "$dir")"
+    done
+
+    echo "Error: Could not find repository root. Make sure you're running this script from within a Git repository."
+    exit 1
+}
+
+# check if AWS CLI is available
+if ! command_exists aws; then
+    echo "Error: AWS CLI is not installed. Please install it and try again."
+    exit 1
+fi
+
+# check if Git is available (for finding repo root)
+if ! command_exists git; then
+    echo "Error: Git is not installed. Please install it and try again."
+    exit 1
+fi
+
+# check if Terraform is available
+if ! command_exists terraform; then
+    echo "Error: Terraform is not installed. Please install it and try again."
+    exit 1
+fi
+
+# check if kubectl is available
+if ! command_exists kubectl; then
+    echo "Error: kubectl is not installed - needed for Kubernetes resource cleanup."
+    exit 1
+fi
+
+# check if jq is available (for JSON parsing)
+if ! command_exists jq; then
+    echo "Warning: jq is not installed. It's recommended for better error handling."
+fi
+
+# AWS environment variables that might interfere with AWS CLI
+AWS_ENV_VARS=(
+    "AWS_ACCESS_KEY_ID"
+    "AWS_SECRET_ACCESS_KEY"
+    "AWS_SESSION_TOKEN"
+    "AWS_SECURITY_TOKEN"
+    "AWS_DEFAULT_REGION"
+)
+
+ENV_VARS_SET=false
+for var in "${AWS_ENV_VARS[@]}"; do
+    if [ -n "${!var}" ]; then
+        echo "Warning: $var is set, which may interfere with AWS SSO login."
+        ENV_VARS_SET=true
+    fi
+done
+
+if [ "$ENV_VARS_SET" = true ]; then
+    echo "Please unset these variables before continuing or they may interfere with the AWS profile."
+    read -p "Do you want to continue anyway? (y/n): " continue_anyway
+    if [ "$continue_anyway" != "y" ] && [ "$continue_anyway" != "Y" ]; then
+        echo "Exiting script. Please unset the environment variables and try again."
+        exit 1
+    fi
+    echo "Continuing with environment variables set..."
+else
+    echo "No interfering AWS environment variables detected."
+fi
+
+# AWS profile self-hosted and us-west-1 (that's where all the test infra is spun up)
+export AWS_PROFILE=self-hosted
+echo "AWS profile set to: $AWS_PROFILE"
+
+# AWS SSO login
+echo "Performing AWS SSO login..."
+aws sso login --sso-session devzero --profile $AWS_PROFILE
+if [ $? -ne 0 ]; then
+    echo "Error: AWS SSO login failed. Please try again."
+    exit 1
+fi
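+
+# NOTE: `aws sso login` opens a browser and is interactive; when re-running this
+# script with a still-valid session, the credentials can be sanity-checked first
+# with a standard AWS CLI call:
+#   aws sts get-caller-identity --profile "$AWS_PROFILE"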
+
+# S3 bucket for terraform state
+S3_BUCKET="dsh-tf-state"
+echo "Using S3 bucket: $S3_BUCKET"
+
+export AWS_REGION=us-west-1
+echo "AWS region set to: $AWS_REGION"
+
+# find self-hosted repository root
+REPO_ROOT=$(find_repo_root)
+echo "Repository root found at: $REPO_ROOT"
+
+# define terraform state paths relative to repo root
+BASE_CLUSTER_PATH="$REPO_ROOT/terraform/examples/aws/base-cluster"
+CLUSTER_EXTENSIONS_PATH="$REPO_ROOT/terraform/examples/aws/cluster-extensions"
+
+# Verify terraform directories exist
+if [ ! -d "$BASE_CLUSTER_PATH" ]; then
+    echo "Error: Base cluster directory '$BASE_CLUSTER_PATH' does not exist."
+    exit 1
+fi
+
+if [ ! -d "$CLUSTER_EXTENSIONS_PATH" ]; then
+    echo "Error: Cluster extensions directory '$CLUSTER_EXTENSIONS_PATH' does not exist."
+    exit 1
+fi
+
+echo "Terraform directories verified."
+
+# check and clean up existing state files
+check_and_clean_tfstate() {
+    local dir="$1"
+    local files_exist=false
+
+    # Check for any terraform state files
+    if ls "$dir"/terraform.tfstate* 2>/dev/null || ls "$dir"/.terraform.lock.hcl 2>/dev/null || [ -d "$dir"/.terraform ]; then
+        echo "Existing Terraform state files found in: $dir"
+        ls -la "$dir"/terraform.tfstate* "$dir"/.terraform.lock.hcl 2>/dev/null
+        if [ -d "$dir/.terraform" ]; then
+            echo "Directory $dir/.terraform exists"
+        fi
+        files_exist=true
+    fi
+
+    if [ "$files_exist" = true ]; then
+        read -p "Do you want to clean up these files before downloading? (y/n): " cleanup
+        if [ "$cleanup" = "y" ] || [ "$cleanup" = "Y" ]; then
+            echo "Removing Terraform state files from $dir"
+            rm -f "$dir"/terraform.tfstate*
+            rm -f "$dir"/.terraform.lock.hcl
+            rm -rf "$dir"/.terraform
+            echo "Cleanup complete."
+        else
+            echo "Warning: Existing files may be overwritten or cause conflicts."
+        fi
+    else
+        echo "No existing Terraform state files found in: $dir"
+    fi
+}
+
+# Function to configure kubectl for a cluster
+configure_kubectl() {
+    local cluster_name="$1"
+
+    echo "Configuring kubectl for cluster: $cluster_name"
+
+    # Update kubeconfig for the cluster
+    aws eks update-kubeconfig --name "$cluster_name" --profile "$AWS_PROFILE" --region "$AWS_REGION"
+    if [ $? -ne 0 ]; then
+        echo "Warning: Failed to update kubeconfig for cluster $cluster_name"
+        return 1
+    fi
+
+    aws eks create-access-entry --cluster-name "$cluster_name" --profile "$AWS_PROFILE" --region "$AWS_REGION" --principal-arn arn:aws:iam::484907513542:role/aws-reserved/sso.amazonaws.com/us-west-2/AWSReservedSSO_AWSAdministratorAccess_cdb3218a34dc613b --type STANDARD
+    if [ $? -ne 0 ]; then
+        echo "Warning: Failed to create access-entry for cluster $cluster_name"
+        return 1
+    fi
+
+    aws eks associate-access-policy --cluster-name "$cluster_name" --profile "$AWS_PROFILE" --region "$AWS_REGION" --principal-arn arn:aws:iam::484907513542:role/aws-reserved/sso.amazonaws.com/us-west-2/AWSReservedSSO_AWSAdministratorAccess_cdb3218a34dc613b --access-scope type=cluster --policy-arn arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy
+    if [ $? -ne 0 ]; then
+        echo "Warning: Failed to associate policy with access-entry for cluster $cluster_name"
+        return 1
+    fi
+
+    # Test kubectl connection
+    echo "Testing kubectl connection..."
+    kubectl get nodes
+    if [ $? -ne 0 ]; then
+        echo "Warning: kubectl cannot connect to cluster $cluster_name"
+        return 1
+    fi
+
+    echo "Successfully configured kubectl for cluster $cluster_name"
+    return 0
+}
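+
+# Usage sketch (the job identifier of a failed run doubles as the cluster name):
+#   configure_kubectl "gh-1-30-al2023-c74f"
+# NOTE: the --principal-arn in the access-entry calls above is the SSO admin role
+# of this particular AWS account; it needs updating if the role or account changes.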
+
+# Function to run terraform destroy with explicit variable
+run_terraform_destroy() {
+    local dir="$1"
+    local name="$2"
+    local cluster_name="$3"
+
+    echo "=== Running terraform destroy in $name ==="
+
+    # Change to the directory and run terraform destroy
+    cd "$dir"
+
+    # Initialize terraform if needed
+    echo "Initializing Terraform..."
+    terraform init
+    if [ $? -ne 0 ]; then
+        echo "Error: Terraform init failed in $name."
+        return 1
+    fi
+
+    # Run terraform plan to see what would be destroyed
+    echo "Running terraform plan with explicit cluster_name=\"$cluster_name\"..."
+    terraform plan -destroy -var="cluster_name=$cluster_name"
+
+    # Confirm before destroying
+    read -p "Do you want to proceed with terraform destroy? (y/n): " proceed
+    if [ "$proceed" != "y" ] && [ "$proceed" != "Y" ]; then
+        echo "Skipping destroy operation."
+        return 0
+    fi
+
+    # Run terraform destroy with explicit variable
+    echo "Running terraform destroy with explicit cluster_name=\"$cluster_name\"..."
+    terraform destroy -auto-approve -var="cluster_name=$cluster_name"
+    if [ $? -ne 0 ]; then
+        echo "Error: Terraform destroy failed in $name."
+        return 1
+    fi
+
+    echo "Successfully destroyed resources in $name."
+    return 0
+}
+
+# get list of failed job identifiers from user
+echo "Enter a comma-separated list of job identifiers to process (check env_var 'JOB_IDENTIFIER' in a failed GitHub Action workflow run; e.g.: 'gh-1-30-al2023-c74f'):"
+read -r dir_list
+
+# convert the comma-separated list to an array so we can loop over it
+IFS=',' read -ra DIRS <<< "$dir_list"
+
+# Track overall success
+CLEANUP_SUCCESS=true
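+
+# Destroy order within each identifier matters: cluster-extensions goes first
+# because its kubernetes/helm resources need the EKS control plane from
+# base-cluster to still exist while their providers connect during destroy.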
+
+# Process each job identifier
+for dir in "${DIRS[@]}"; do
+    # trim whitespace, since pasted input often carries stray spaces
+    dir=$(echo "$dir" | xargs)
+    echo "=========================================================="
+    echo "Processing job identifier: $dir"
+    echo "=========================================================="
+
+    # Process cluster-extensions first (child resources)
+    echo "Working on cluster-extensions..."
+
+    # Check and clean up cluster-extensions directory
+    echo "Checking cluster-extensions directory for existing state files..."
+    check_and_clean_tfstate "$CLUSTER_EXTENSIONS_PATH"
+
+    # pull terraform.tfstate for cluster-extensions
+    echo "Downloading terraform state for $dir/cluster-extensions"
+    aws s3 cp "s3://$S3_BUCKET/$dir/cluster-extensions/terraform.tfstate" "$CLUSTER_EXTENSIONS_PATH/terraform.tfstate"
+    if [ $? -ne 0 ]; then
+        echo "Warning: Failed to download terraform.tfstate for $dir/cluster-extensions"
+        CLEANUP_SUCCESS=false
+    else
+        echo "Successfully downloaded terraform.tfstate for $dir/cluster-extensions"
+
+        # Check and clean up any existing .terraform directory
+        if [ -d "$CLUSTER_EXTENSIONS_PATH/.terraform" ]; then
+            echo "Removing existing .terraform directory to ensure clean initialization..."
+            rm -rf "$CLUSTER_EXTENSIONS_PATH/.terraform"
+        fi
+
+        # configure kubectl for this cluster
+        configure_kubectl "$dir"
+
+        # run terraform destroy for cluster-extensions
+        if ! run_terraform_destroy "$CLUSTER_EXTENSIONS_PATH" "cluster-extensions" "$dir"; then
+            CLEANUP_SUCCESS=false
+        fi
+    fi
+
+    # Now process base-cluster (parent resources)
+    echo "Working on base-cluster..."
+
+    # Check and clean up base-cluster directory
+    echo "Checking base-cluster directory for existing state files..."
+    check_and_clean_tfstate "$BASE_CLUSTER_PATH"
+
+    # pull terraform.tfstate for base-cluster
+    echo "Downloading terraform state for $dir/base-cluster"
+    aws s3 cp "s3://$S3_BUCKET/$dir/base-cluster/terraform.tfstate" "$BASE_CLUSTER_PATH/terraform.tfstate"
+    if [ $? -ne 0 ]; then
+        echo "Warning: Failed to download terraform.tfstate for $dir/base-cluster"
+        CLEANUP_SUCCESS=false
+    else
+        echo "Successfully downloaded terraform.tfstate for $dir/base-cluster"
+
+        # Check and clean up any existing .terraform directory
+        if [ -d "$BASE_CLUSTER_PATH/.terraform" ]; then
+            echo "Removing existing .terraform directory to ensure clean initialization..."
+            rm -rf "$BASE_CLUSTER_PATH/.terraform"
+        fi
+
+        # Run terraform destroy for base-cluster
+        if ! run_terraform_destroy "$BASE_CLUSTER_PATH" "base-cluster" "$dir"; then
+            CLEANUP_SUCCESS=false
+        fi
+    fi
+
+    echo "Completed processing job identifier: $dir"
+    echo "----------------------------------------------------------"
+done
+
+# Return to the original directory
+cd "$REPO_ROOT"
+
+# Final status report
+echo ""
+echo "=========================================================="
+if [ "$CLEANUP_SUCCESS" = true ]; then
+    echo "✅ All cleanup operations completed successfully!"
+else
+    echo "⚠️ Some cleanup operations failed. Please check the logs above for details."
+    echo "You may need to manually inspect and clean up some resources."
+fi
+echo "=========================================================="
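
Usage sketch for the helper above, run from inside a checkout of the repository (the identifiers come from the JOB_IDENTIFIER env var of the failed workflow runs, e.g. gh-1-30-al2023-c74f):

    .github/scripts/cleanups-on-ci-failure.sh
    # when prompted, paste the identifiers comma-separated:
    #   gh-1-30-al2023-c74f, gh-1-31-al2023-c74f
    # the script then pulls each state file from s3://dsh-tf-state/<id>/... and
    # walks cluster-extensions, then base-cluster, through terraform destroy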