# Workflow file for run "wave10 #26"

	# Main CI: builds a local kind cluster through scripts/bootstrap.sh, then
	# exercises the platform end to end (JWT issuance, demo scenarios, model
	# inference, tenant isolation, observability) and publishes a status report.
	name: Main CI - Bootstrap & Platform Test

	on:
	  push:
	    branches: [main]
	  pull_request:
	    branches: [main]
	  workflow_dispatch:

	# The job only reads the repository, so the token is limited to that.
	permissions:
	  contents: read

	# Naming the shell makes GitHub run every step with `bash -eo pipefail`.
	# The implicit default is plain `bash -e`, under which `curl ... | jq .`
	# succeeds even when curl fails, so pipeline-based checks could never fail.
	defaults:
	  run:
	    shell: bash

	jobs:
	  test-platform:
	    runs-on: ubuntu-latest
	    timeout-minutes: 60  # Extended timeout for Knative/KServe installation

	    steps:
	      - name: Checkout code
	        uses: actions/checkout@v4

	      # Remove large preinstalled toolchains and images to reclaim disk.
	      - name: Free up disk space
	        run: |
	          sudo rm -rf /usr/share/dotnet
	          sudo rm -rf /usr/local/lib/android
	          sudo rm -rf /opt/ghc
	          sudo rm -rf /opt/hostedtoolcache/CodeQL
	          sudo docker image prune --all --force
	          df -h

	      - name: Setup prerequisites
	        run: |
	          # Install kubectl. -f makes curl fail on an HTTP error instead of
	          # saving the error page as the "binary".
	          KUBECTL_VERSION="$(curl -fsSL https://dl.k8s.io/release/stable.txt)"
	          curl -fsSLO "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl"
	          chmod +x kubectl
	          sudo mv kubectl /usr/local/bin/

	          # Install kind
	          curl -fsSLo ./kind https://kind.sigs.k8s.io/dl/v0.20.0/kind-linux-amd64
	          chmod +x ./kind
	          sudo mv ./kind /usr/local/bin/

	          # Install helm. The installer is downloaded first so that a failed
	          # download stops the step instead of piping nothing into bash.
	          curl -fsSL -o get-helm-3.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3
	          bash get-helm-3.sh
	          rm -f get-helm-3.sh

	          # Install jq (should already be available)
	          sudo apt-get update
	          sudo apt-get install -y jq

	          # Verify installations
	          echo "Tool versions:"
	          kubectl version --client
	          kind version
	          helm version
	          jq --version
	          docker --version

	      - name: Bootstrap platform
	        run: |
	          echo "🚀 Starting platform bootstrap..."
	          chmod +x scripts/bootstrap.sh

	          # Set CI environment variables for bootstrap script
	          export CI=true
	          export GITHUB_ACTIONS=true

	          # Show system resources before bootstrap
	          echo "💻 System resources before bootstrap:"
	          df -h
	          free -h
	          docker system df

	          # Run bootstrap with enhanced logging
	          ./scripts/bootstrap.sh

	          # Show system resources after bootstrap
	          echo "💻 System resources after bootstrap:"
	          df -h
	          free -h
	          docker system df

	      - name: Wait for platform readiness
	        run: |
	          echo "⏳ Waiting for platform to be fully ready..."

	          # Selects pods that are neither Running nor Succeeded.
	          NOT_HEALTHY='status.phase!=Running,status.phase!=Succeeded'

	          # Show cluster resources before waiting
	          echo "🔍 Cluster status before waiting:"
	          kubectl get nodes
	          kubectl get pods --all-namespaces --field-selector="$NOT_HEALTHY"

	          # Wait for all pods to be ready with extended timeout.
	          # Succeeded (completed) pods never report Ready, so they are
	          # filtered out; otherwise one finished Job pod makes this time out.
	          # --all is dropped because kubectl rejects it next to a selector.
	          echo "⏳ Waiting for all pods to be ready..."
	          if ! kubectl wait --for=condition=Ready pods --all-namespaces \
	            --field-selector='status.phase!=Succeeded' --timeout=900s; then
	            echo "❌ Some pods failed to become ready, showing status:"
	            kubectl get pods --all-namespaces --field-selector="$NOT_HEALTHY"
	            kubectl describe pods --all-namespaces --field-selector="$NOT_HEALTHY"
	            exit 1
	          fi

	          # Check inference services
	          echo "📊 Checking inference services..."
	          kubectl get inferenceservice --all-namespaces || echo "No inference services found"

	          # Check gateway resources
	          echo "🚪 Checking gateway resources..."
	          kubectl get gateway,aigatewayroute,aiservicebackend -n envoy-gateway-system || echo "No gateway resources found"

	          # Check JWT server
	          echo "🔐 Checking JWT server..."
	          kubectl get pods -n default -l app=jwt-server || echo "JWT server not found"

	          # Show overall cluster status. The listing is captured first so a
	          # failing kubectl aborts the step instead of reading as "healthy".
	          echo "🏥 Cluster health check:"
	          POD_LIST="$(kubectl get pods --all-namespaces)"
	          if grep -E "(Error|CrashLoopBackOff|Pending)" <<<"$POD_LIST"; then
	            echo "❌ Found unhealthy pods"
	            exit 1
	          else
	            echo "✅ All pods healthy"
	          fi

	      - name: Test JWT token generation
	        run: |
	          echo "🔑 Testing JWT token generation..."
	          chmod +x scripts/get-jwt-tokens.sh

	          # Run the script once and keep the raw tokens out of the job log;
	          # only the tenant names are printed.
	          TOKENS="$(./scripts/get-jwt-tokens.sh)"

	          # Verify tokens are valid JSON. -e also fails on empty output or an
	          # empty .tokens map, which plain jq would accept with exit code 0.
	          echo "$TOKENS" | jq -e '.tokens | length > 0' > /dev/null
	          echo "$TOKENS" | jq -r '.tokens | keys[]'
	          echo "✅ JWT tokens generated successfully"

	      - name: Test demo scenarios
	        run: |
	          echo "🎭 Running demo scenarios..."
	          chmod +x scripts/demo.sh

	          # Runs one demo non-interactively with a 5 minute cap.
	          #   $1 = value for --demo, $2 = banner line, $3 = label for the failure message
	          run_demo() {
	            echo "$2"
	            if ! timeout 300 ./scripts/demo.sh --demo "$1"; then
	              echo "❌ $3 demo failed"
	              exit 1
	            fi
	          }

	          run_demo security "🔒 Testing Security & Authentication Demo..." "Security"

	          run_demo autoscaling "⚡ Testing Auto-scaling Demo..." "Auto-scaling"

	          run_demo canary "🚦 Testing Canary Deployment Demo..." "Canary"

	          run_demo multitenancy "🌐 Testing Multi-tenant Isolation Demo..." "Multi-tenancy"

	          run_demo observability "📊 Testing Observability Demo..." "Observability"

	          echo "✅ All demo scenarios passed!"

	      - name: Test model inference endpoints
	        run: |
	          echo "🤖 Testing model inference endpoints..."

	          # Get JWT tokens. jq -e fails the step when a tenant has no token
	          # instead of sending the literal string "null" as the bearer token.
	          TOKENS="$(./scripts/get-jwt-tokens.sh)"
	          TOKEN_A="$(echo "$TOKENS" | jq -er '.tokens["tenant-a"]')"
	          TOKEN_C="$(echo "$TOKENS" | jq -er '.tokens["tenant-c"]')"

	          # Start port-forward in background; the trap stops it however the
	          # step ends, including when a request below fails.
	          kubectl port-forward -n istio-system svc/istio-ingressgateway 8080:80 &
	          PF_PID=$!
	          trap 'kill "$PF_PID" 2>/dev/null || true' EXIT
	          sleep 10

	          # Test sklearn-iris model (tenant-a)
	          echo "🧪 Testing sklearn-iris model..."
	          curl -s -f -X POST \
	            -H "Content-Type: application/json" \
	            -H "Authorization: Bearer $TOKEN_A" \
	            -H "x-tenant: tenant-a" \
	            -H "x-ai-eg-model: sklearn-iris" \
	            http://localhost:8080/v1/models/sklearn-iris:predict \
	            -d '{"instances": [[5.1, 3.5, 1.4, 0.2]]}' | jq .

	          # Test pytorch-resnet model (tenant-c)
	          echo "🧪 Testing pytorch-resnet model..."
	          curl -s -f -X POST \
	            -H "Content-Type: application/json" \
	            -H "Authorization: Bearer $TOKEN_C" \
	            -H "x-tenant: tenant-c" \
	            -H "x-ai-eg-model: pytorch-resnet" \
	            http://localhost:8080/v1/models/pytorch-resnet:predict \
	            -d '{"instances": [[[0.1, 0.2, 0.3]]]}' | jq .

	          echo "✅ Model inference tests passed!"

	      - name: Test security isolation
	        run: |
	          echo "🔐 Testing tenant security isolation..."

	          # Get the tenant-a token (fails the step if it is missing)
	          TOKENS="$(./scripts/get-jwt-tokens.sh)"
	          TOKEN_A="$(echo "$TOKENS" | jq -er '.tokens["tenant-a"]')"

	          # Start port-forward in background and stop it when the step ends
	          kubectl port-forward -n istio-system svc/istio-ingressgateway 8080:80 &
	          PF_PID=$!
	          trap 'kill "$PF_PID" 2>/dev/null || true' EXIT
	          sleep 10

	          # Test that tenant-a cannot access tenant-c model (should fail).
	          # The HTTP status is inspected rather than curl's exit code, so an
	          # unreachable gateway (status 000) is not mistaken for a denial.
	          echo "🚫 Testing cross-tenant access (should fail)..."
	          STATUS="$(curl -s -o /dev/null -w '%{http_code}' -X POST \
	            -H "Content-Type: application/json" \
	            -H "Authorization: Bearer $TOKEN_A" \
	            -H "x-tenant: tenant-c" \
	            -H "x-ai-eg-model: pytorch-resnet" \
	            http://localhost:8080/v1/models/pytorch-resnet:predict \
	            -d '{"instances": [[[0.1, 0.2, 0.3]]]}' || true)"

	          case "$STATUS" in
	            000)
	              echo "❌ Could not reach the gateway - isolation test is inconclusive"
	              exit 1
	              ;;
	            4??|5??)
	              # Same acceptance rule as `curl -f`: any status >= 400 counts as denied.
	              echo "✅ Security isolation working - cross-tenant access denied (HTTP $STATUS)"
	              ;;
	            *)
	              echo "❌ Security isolation failed - tenant-a accessed tenant-c model (HTTP $STATUS)"
	              exit 1
	              ;;
	          esac

	      - name: Test observability stack
	        run: |
	          echo "📊 Testing observability stack..."

	          # These listings are informational: `kubectl get` with a label
	          # selector exits 0 even when no pod matches.
	          # Check Prometheus
	          kubectl get pods -n monitoring -l app.kubernetes.io/name=prometheus

	          # Check Grafana
	          kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana

	          # Check Kiali
	          kubectl get pods -n monitoring -l app.kubernetes.io/name=kiali

	          # Test metrics collection
	          kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 &
	          PF_PID=$!
	          trap 'kill "$PF_PID" 2>/dev/null || true' EXIT
	          sleep 10

	          # Query for some basic metrics; require at least one `up` series so
	          # an empty or failed response cannot pass.
	          curl -sf "http://localhost:9090/api/v1/query?query=up" | jq -e '.data.result | length > 0' > /dev/null

	          echo "✅ Observability stack tests passed!"

	      # Runs after failures too, so every query is best-effort: a failing
	      # command is recorded in the report instead of aborting the step.
	      - name: Generate test report
	        if: always()
	        run: |
	          echo "📋 Generating test report..."

	          # Prints one fenced markdown section: $1 = heading, rest = command.
	          section() {
	            local title="$1"
	            shift
	            echo "### $title"
	            echo '```'
	            "$@" 2>&1 || echo "(unavailable: '$*' failed)"
	            echo '```'
	            echo ""
	          }

	          {
	            echo "## 🎯 Test Results Summary"
	            echo ""
	            section "Platform Components Status" kubectl get pods --all-namespaces
	            section "Gateway Resources" kubectl get gateway,aigatewayroute,aiservicebackend -n envoy-gateway-system
	            section "Inference Services" kubectl get inferenceservice --all-namespaces
	            section "Resource Usage" kubectl top nodes
	          } > test-report.md

	          cat test-report.md

	      - name: Cleanup
	        if: always()
	        run: |
	          echo "🧹 Cleaning up resources..."

	          # Show final system resources
	          echo "💻 System resources before cleanup:"
	          df -h || true
	          free -h || true
	          docker system df || true

	          # Show cluster status for debugging
	          echo "🔍 Final cluster status:"
	          kubectl get nodes || true
	          kubectl get pods --all-namespaces --field-selector=status.phase!=Running,status.phase!=Succeeded || true

	          # Export kind logs while the clusters still exist, so the upload
	          # step's /tmp/kind-logs-* pattern has something to collect.
	          # NOTE(review): nothing visible in this workflow writes those logs;
	          # confirm scripts/bootstrap.sh does not already produce them.
	          for CLUSTER in $(kind get clusters 2>/dev/null || true); do
	            kind export logs "/tmp/kind-logs-${CLUSTER}" --name "$CLUSTER" || true
	          done

	          # Run cleanup script
	          chmod +x scripts/cleanup.sh || true
	          ./scripts/cleanup.sh || echo "Cleanup completed with warnings"

	          # Additional cleanup
	          docker system prune -f || true
	          kind delete clusters --all || echo "No clusters to delete"

	          # Show final disk usage
	          echo "💻 Final disk usage:"
	          df -h || true

	      - name: Upload test artifacts
	        if: always()
	        uses: actions/upload-artifact@v4
	        with:
	          name: test-results
	          path: |
	            test-report.md
	            /tmp/kind-logs-*
	          retention-days: 30

# Page chrome captured with the listing (not part of the workflow):
# Provide feedback · Saved searches · Use saved searches to filter your results more quickly · Uh oh!
# wave10 #26 · Workflow file · Jobs · Run details · Workflow file for this run