Skip to content

[tmpnet] Enable installation of chaos mesh to local kind cluster #3674

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jul 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -275,3 +275,13 @@ jobs:
prometheus_password: ${{ secrets.PROMETHEUS_PASSWORD || '' }}
loki_username: ${{ secrets.LOKI_ID || '' }}
loki_password: ${{ secrets.LOKI_PASSWORD || '' }}
robustness:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/setup-go-for-project
- uses: ./.github/actions/install-nix
# TODO(marun) Extend testing of robustness beyond deploying a suitable test environment
- name: Deploy kind with chaos mesh
shell: bash
run: nix develop --command ./scripts/run_task.sh test-robustness
5 changes: 5 additions & 0 deletions Taskfile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,11 @@ tasks:
- task: generate-load-contract-bindings
- cmd: bash -x ./scripts/tests.load.kube.kind.sh {{.CLI_ARGS}}

test-robustness:
desc: Deploys kind with chaos mesh. Intended to eventually run a robustness (fault-injection) test suite.
cmds:
- ./bin/tmpnetctl start-kind-cluster --install-chaos-mesh

test-unit:
desc: Runs unit tests
# Invoking with bash ensures compatibility with CI execution on Windows
Expand Down
2 changes: 1 addition & 1 deletion scripts/kind-with-registry.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ nodes:
- role: control-plane
extraPortMappings:
# Exposing a nodeport for nginx ingress is the reason this script needed to be copied and customized
# This port must match the value used to deploy the nginx controller by tests/fixture/tmpnet/start-kind-cluster.go
# This port must match the ingressNodePort constant in tests/fixture/tmpnet/start_kind_cluster.go
- containerPort: 30791
hostPort: 30791
protocol: TCP
Expand Down
182 changes: 143 additions & 39 deletions tests/fixture/tmpnet/start_kind_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,18 @@ const (
ingressChartRepo = "https://kubernetes.github.io/ingress-nginx"
ingressChartName = "ingress-nginx/ingress-nginx"
ingressControllerName = "ingress-nginx-controller"
// This must match the nodePort configured in scripts/kind-with-registry.sh
ingressNodePort = 30791

// Chaos Mesh constants
chaosMeshNamespace = "chaos-mesh"
chaosMeshReleaseName = "chaos-mesh"
chaosMeshChartRepo = "https://charts.chaos-mesh.org"
chaosMeshChartName = "chaos-mesh/chaos-mesh"
chaosMeshChartVersion = "2.7.2"
chaosMeshControllerName = "chaos-controller-manager"
chaosMeshDashboardName = "chaos-dashboard"
chaosMeshDashboardHost = "chaos-mesh.localhost"
)

//go:embed yaml/tmpnet-rbac.yaml
Expand All @@ -57,6 +69,7 @@ func StartKindCluster(
configPath string,
startMetricsCollector bool,
startLogsCollector bool,
installChaosMesh bool,
) error {
configContext := KindKubeconfigContext

Expand Down Expand Up @@ -116,6 +129,12 @@ func StartKindCluster(
return fmt.Errorf("failed to create defaults ConfigMap: %w", err)
}

if installChaosMesh {
if err := deployChaosMesh(ctx, log, configPath, configContext); err != nil {
return fmt.Errorf("failed to deploy chaos mesh: %w", err)
}
}

return nil
}

Expand Down Expand Up @@ -342,7 +361,7 @@ func deployIngressController(ctx context.Context, log logging.Logger, configPath
"--wait",
"--set", "controller.service.type=NodePort",
// This port value must match the port configured in scripts/kind-with-registry.sh
"--set", "controller.service.nodePorts.http=30791",
"--set", fmt.Sprintf("controller.service.nodePorts.http=%d", ingressNodePort),
"--set", "controller.admissionWebhooks.enabled=false",
"--set", "controller.config.proxy-read-timeout=600",
"--set", "controller.config.proxy-send-timeout=600",
Expand All @@ -355,7 +374,7 @@ func deployIngressController(ctx context.Context, log logging.Logger, configPath
return fmt.Errorf("failed to install nginx-ingress: %w", err)
}

return waitForIngressController(ctx, log, configPath, configContext)
return waitForDeployment(ctx, log, configPath, configContext, ingressNamespace, ingressControllerName, "nginx ingress controller")
}

// isIngressControllerRunning checks if the nginx ingress controller is already running.
Expand All @@ -371,42 +390,6 @@ func isIngressControllerRunning(ctx context.Context, log logging.Logger, configP
return isRunning, nil
}

// waitForIngressController polls the nginx ingress controller deployment
// until it reports at least one ready replica. Polling stops with an error
// once ctx is cancelled or times out.
func waitForIngressController(ctx context.Context, log logging.Logger, configPath string, configContext string) error {
	clientset, err := GetClientset(log, configPath, configContext)
	if err != nil {
		return fmt.Errorf("failed to get clientset: %w", err)
	}

	checkReady := func(ctx context.Context) (bool, error) {
		controller, getErr := clientset.AppsV1().Deployments(ingressNamespace).Get(ctx, ingressControllerName, metav1.GetOptions{})
		if getErr != nil {
			// Lookup failures are logged and retried on the next poll rather than aborting the wait
			log.Debug("failed to get nginx ingress controller deployment",
				zap.String("namespace", ingressNamespace),
				zap.String("deployment", ingressControllerName),
				zap.Error(getErr),
			)
			return false, nil
		}

		ready := controller.Status.ReadyReplicas
		if ready != 0 {
			log.Info("nginx ingress controller is ready",
				zap.String("namespace", ingressNamespace),
				zap.String("deployment", ingressControllerName),
				zap.Int32("readyReplicas", ready),
			)
			return true, nil
		}

		log.Debug("waiting for nginx ingress controller to become ready",
			zap.String("namespace", ingressNamespace),
			zap.String("deployment", ingressControllerName),
			zap.Int32("readyReplicas", ready),
			zap.Int32("replicas", controller.Status.Replicas),
		)
		return false, nil
	}

	return wait.PollUntilContextCancel(ctx, statusCheckInterval, true /* immediate */, checkReady)
}

// runHelmCommand runs a Helm command with the given arguments.
func runHelmCommand(ctx context.Context, args ...string) error {
cmd := exec.CommandContext(ctx, "helm", args...)
Expand Down Expand Up @@ -448,7 +431,7 @@ func createDefaultsConfigMap(ctx context.Context, log logging.Logger, configPath
Namespace: namespace,
},
Data: map[string]string{
ingressHostKey: "localhost:30791",
ingressHostKey: fmt.Sprintf("localhost:%d", ingressNodePort),
},
}

Expand All @@ -459,3 +442,124 @@ func createDefaultsConfigMap(ctx context.Context, log logging.Logger, configPath

return nil
}

// deployChaosMesh installs Chaos Mesh into the cluster with Helm unless
// isChaosMeshRunning reports that it is already present.
//
// The chart is installed at a pinned version into its own namespace with the
// dashboard exposed via the nginx ingress class. After the Helm install the
// function waits for the Chaos Mesh components via waitForChaosMesh. The
// first failing step is returned as a wrapped error.
func deployChaosMesh(ctx context.Context, log logging.Logger, configPath string, configContext string) error {
	log.Info("checking if chaos mesh is already running")

	alreadyDeployed, err := isChaosMeshRunning(ctx, log, configPath, configContext)
	switch {
	case err != nil:
		return fmt.Errorf("failed to check chaos mesh status: %w", err)
	case alreadyDeployed:
		log.Info("chaos mesh already running")
		return nil
	}

	log.Info("deploying chaos mesh using Helm")

	// Register the chart repository before installing from it
	err = runHelmCommand(ctx, "repo", "add", "chaos-mesh", chaosMeshChartRepo)
	if err != nil {
		return fmt.Errorf("failed to add chaos mesh helm repo: %w", err)
	}
	err = runHelmCommand(ctx, "repo", "update")
	if err != nil {
		return fmt.Errorf("failed to update helm repos: %w", err)
	}

	// Chart value overrides, each passed to helm as a separate --set flag
	overrides := []string{
		"chaosDaemon.runtime=containerd",
		"chaosDaemon.socketPath=/run/containerd/containerd.sock",
		"dashboard.persistentVolume.enabled=true",
		"dashboard.persistentVolume.storageClass=standard",
		"dashboard.securityMode=false",
		"controllerManager.leaderElection.enabled=false",
		"dashboard.ingress.enabled=true",
		"dashboard.ingress.ingressClassName=nginx",
		"dashboard.ingress.hosts[0].name=" + chaosMeshDashboardHost,
	}

	installArgs := []string{
		"install",
		chaosMeshReleaseName,
		chaosMeshChartName,
		"--namespace", chaosMeshNamespace,
		"--create-namespace",
		"--version", chaosMeshChartVersion,
		"--wait",
	}
	for _, override := range overrides {
		installArgs = append(installArgs, "--set", override)
	}

	err = runHelmCommand(ctx, installArgs...)
	if err != nil {
		return fmt.Errorf("failed to install chaos mesh: %w", err)
	}

	// Confirm the individual components are ready before reporting success
	err = waitForChaosMesh(ctx, log, configPath, configContext)
	if err != nil {
		return fmt.Errorf("chaos mesh deployment failed: %w", err)
	}

	// Tell the user where the dashboard can be reached through the ingress
	dashboardURL := fmt.Sprintf("http://%s:%d", chaosMeshDashboardHost, ingressNodePort)
	log.Info("Chaos Mesh installed successfully",
		zap.String("dashboardURL", dashboardURL),
	)
	log.Warn("Chaos Mesh dashboard security is disabled - use only for local development")

	return nil
}

// isChaosMeshRunning reports whether the Chaos Mesh controller manager
// deployment already exists in the cluster. Only existence is checked; the
// deployment's readiness is not inspected.
//
// It returns (true, nil) when the deployment is found and (false, nil) when
// the API server reports it as not found. Any other failure (including a
// failure to build the clientset) is returned as an error so that an
// unreachable or misbehaving API server is not mistaken for an existing
// installation, which would cause the install to be skipped silently.
func isChaosMeshRunning(ctx context.Context, log logging.Logger, configPath string, configContext string) (bool, error) {
	clientset, err := GetClientset(log, configPath, configContext)
	if err != nil {
		return false, err
	}

	// Check if controller manager deployment exists
	_, err = clientset.AppsV1().Deployments(chaosMeshNamespace).Get(ctx, chaosMeshControllerName, metav1.GetOptions{})
	switch {
	case err == nil:
		return true, nil
	case apierrors.IsNotFound(err):
		return false, nil
	default:
		return false, fmt.Errorf("failed to get deployment %s/%s: %w", chaosMeshNamespace, chaosMeshControllerName, err)
	}
}

// waitForChaosMesh blocks until the Chaos Mesh controller manager and then
// the dashboard deployments are ready, checking them in that order.
func waitForChaosMesh(ctx context.Context, log logging.Logger, configPath string, configContext string) error {
	// awaitComponent waits on a single deployment in the chaos mesh namespace
	awaitComponent := func(deploymentName string, displayName string) error {
		return waitForDeployment(ctx, log, configPath, configContext, chaosMeshNamespace, deploymentName, displayName)
	}

	controllerErr := awaitComponent(chaosMeshControllerName, "chaos mesh controller manager")
	if controllerErr != nil {
		return fmt.Errorf("controller manager not ready: %w", controllerErr)
	}

	// The dashboard error is returned unwrapped
	return awaitComponent(chaosMeshDashboardName, "chaos mesh dashboard")
}

// waitForDeployment polls the named deployment in the given namespace until
// it reports at least one ready replica. displayName is used only in log
// messages. Polling stops with an error once ctx is cancelled or times out.
func waitForDeployment(ctx context.Context, log logging.Logger, configPath string, configContext string, namespace string, deploymentName string, displayName string) error {
	clientset, err := GetClientset(log, configPath, configContext)
	if err != nil {
		return fmt.Errorf("failed to get clientset: %w", err)
	}

	// Fields attached to every log entry emitted while polling
	namespaceField := zap.String("namespace", namespace)
	deploymentField := zap.String("deployment", deploymentName)

	checkReady := func(ctx context.Context) (bool, error) {
		target, getErr := clientset.AppsV1().Deployments(namespace).Get(ctx, deploymentName, metav1.GetOptions{})
		if getErr != nil {
			// Lookup failures are logged and retried on the next poll rather than aborting the wait
			log.Debug(fmt.Sprintf("failed to get %s deployment", displayName),
				namespaceField,
				deploymentField,
				zap.Error(getErr),
			)
			return false, nil
		}

		readyReplicas := target.Status.ReadyReplicas
		if readyReplicas == 0 {
			log.Debug(fmt.Sprintf("waiting for %s to become ready", displayName),
				namespaceField,
				deploymentField,
				zap.Int32("readyReplicas", readyReplicas),
				zap.Int32("replicas", target.Status.Replicas),
			)
			return false, nil
		}

		log.Info(fmt.Sprintf("%s is ready", displayName),
			namespaceField,
			deploymentField,
			zap.Int32("readyReplicas", readyReplicas),
		)
		return true, nil
	}

	log.Info(fmt.Sprintf("waiting for %s to be ready", displayName))
	return wait.PollUntilContextCancel(ctx, statusCheckInterval, true /* immediate */, checkReady)
}
17 changes: 13 additions & 4 deletions tests/fixture/tmpnet/tmpnetctl/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"io/fs"
"os"
"path/filepath"
"time"

"github.com/spf13/cobra"
"go.uber.org/zap"
Expand All @@ -21,7 +22,12 @@ import (
"github.com/ava-labs/avalanchego/version"
)

const cliVersion = "0.0.1"
const (
	cliVersion = "0.0.1"

	// startKindClusterTimeout bounds the context used by the start-kind-cluster
	// command. It needs to be long enough to account for the time required to
	// deploy the nginx ingress controller and chaos mesh.
	startKindClusterTimeout = 5 * time.Minute
)

var (
errNetworkDirRequired = fmt.Errorf("--network-dir or %s is required", tmpnet.NetworkDirEnvName)
Expand Down Expand Up @@ -271,14 +277,15 @@ func main() {
rootCmd.AddCommand(checkLogsCmd)

var (
kubeconfigVars *flags.KubeconfigVars
collectorVars *flags.CollectorVars
kubeconfigVars *flags.KubeconfigVars
collectorVars *flags.CollectorVars
installChaosMesh bool
)
startKindClusterCmd := &cobra.Command{
Use: "start-kind-cluster",
Short: "Starts a local kind cluster with an integrated registry",
RunE: func(*cobra.Command, []string) error {
ctx, cancel := context.WithTimeout(context.Background(), tmpnet.DefaultNetworkTimeout)
ctx, cancel := context.WithTimeout(context.Background(), startKindClusterTimeout)
defer cancel()
log, err := tests.LoggerForFormat("", rawLogFormat)
if err != nil {
Expand All @@ -302,11 +309,13 @@ func main() {
kubeconfigVars.Path,
collectorVars.StartMetricsCollector,
collectorVars.StartLogsCollector,
installChaosMesh,
)
},
}
kubeconfigVars = flags.NewKubeconfigFlagSetVars(startKindClusterCmd.PersistentFlags())
collectorVars = flags.NewCollectorFlagSetVars(startKindClusterCmd.PersistentFlags())
startKindClusterCmd.PersistentFlags().BoolVar(&installChaosMesh, "install-chaos-mesh", false, "Install Chaos Mesh in the kind cluster")
rootCmd.AddCommand(startKindClusterCmd)

if err := rootCmd.Execute(); err != nil {
Expand Down