Smoke Tests for SageMaker #60
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Manually dispatched workflow that builds a SageMaker Code Editor image from a
# previously uploaded build artifact and runs the smoke tests against it.
name: Smoke Tests for SageMaker

on:
  workflow_dispatch:

jobs:
  smoke-tests-sagemaker:
    name: Run Smoke Tests for SageMaker
    runs-on: ubuntu-latest
    environment: sagemaker-e2e-tests-workflow-env
    # NOTE(review): the `gh run download` steps authenticate with GH_TOKEN;
    # confirm whether they additionally need `actions: read` on this repository.
    permissions:
      contents: read
      id-token: write
    env:
      # Identity of the commit/branch under test; used to name artifacts,
      # image tags, the SageMaker image and the space.
      COMMIT_SHA: ${{ github.sha }}
      GH_REF_NAME: ${{ github.ref_name }}
      SAGEMAKER_ARTIFACT_PREFIX: 'code-editor-sagemaker-server'
      # Token picked up by the `gh` CLI calls in the download steps.
      GH_TOKEN: ${{ github.token }}
      # AWS targets, all supplied through secrets except the region.
      AWS_REGION: us-east-1
      AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
      ECR_REPOSITORY: ${{ secrets.ECR_REPOSITORY }}
      PROJECT_NAME: ${{ secrets.PROJECT_NAME }}
      DATAZONE_DOMAIN_ID: ${{ secrets.DATAZONE_DOMAIN_ID }}
      TEST_SAGEMAKER_ROLE: ${{ secrets.TEST_SAGEMAKER_ROLE }}
    steps:
# Check out the repository; later steps read the Dockerfile under
# .github/workflows/dockerfiles from this working tree.
- name: Checkout code
  uses: actions/checkout@v4
  with:
    # 0 requests the complete history instead of a shallow clone.
    fetch-depth: 0
# Install the system packages used by later steps (quilt and jq are invoked
# directly by this workflow).
- name: Setup environment
  run: |
    echo "Installing required dependencies"
    sudo apt-get update
    sudo apt-get install -y \
      quilt \
      libxml2-utils \
      jq \
      libx11-dev \
      libxkbfile-dev
# Provide the Node.js toolchain used by the npm/npx commands in the E2E step.
- name: Setup Node.js
  uses: actions/setup-node@v4
  with:
    # Quoted so the version is always read as a string.
    node-version: "22"
# Fetch the build artifact that was uploaded for this exact commit.
- name: Download sagemaker build artifact
  run: |
    gh run download --name "${COMMIT_SHA}-code-editor-sagemaker-server-build"
# Fail fast when the downloaded artifact does not contain the expected archive.
- name: Check build artifacts exist
  run: |
    ls -la
    # Archives that the following steps depend on.
    required_archives=("${SAGEMAKER_ARTIFACT_PREFIX}-build.tar.gz")
    for archive in "${required_archives[@]}"; do
      # Present: nothing to report, move on to the next archive.
      [ -f "$archive" ] && continue
      echo "Error: $archive not found for commit $COMMIT_SHA"
      exit 1
    done
# Assume the role stored in the TEST_ECR_ROLE secret for the image build/push.
- name: Configure ECR role AWS credentials
  uses: aws-actions/configure-aws-credentials@v4
  with:
    aws-region: ${{ env.AWS_REGION }}
    role-to-assume: ${{ secrets.TEST_ECR_ROLE }}
# Unpack the build archive into the workspace for the Docker build.
- name: Extract artifacts
  run: |
    tar --extract --gzip --file "${SAGEMAKER_ARTIFACT_PREFIX}-build.tar.gz"
# Build the SageMaker server image, push it to ECR and export its
# digest-pinned URI as ECR_IMAGE_URI for the following steps.
- name: Build and push Docker image
  run: |
    # The default `run` shell is `bash -e` without pipefail; enable it so a
    # failing `aws ecr get-login-password` or `docker inspect` is not masked
    # by the command on the right-hand side of the pipe.
    set -o pipefail

    # Login to ECR
    ECR_REGISTRY="$AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com"
    aws ecr get-login-password --region "$AWS_REGION" \
      | docker login --username AWS --password-stdin "$ECR_REGISTRY"

    # Create image tag with branch-commit format; characters outside
    # [a-zA-Z0-9-] in the ref name are replaced by '-'.
    BRANCH_NAME=$(echo "$GH_REF_NAME" | sed 's/[^a-zA-Z0-9-]/-/g')
    IMAGE_TAG="$BRANCH_NAME-$COMMIT_SHA"
    LOCAL_IMAGE="$ECR_REPOSITORY:$IMAGE_TAG"
    REMOTE_IMAGE="$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG"

    # Build image
    docker build -f .github/workflows/dockerfiles/Dockerfile.sagemaker -t "$LOCAL_IMAGE" .

    # Tag image
    docker tag "$LOCAL_IMAGE" "$REMOTE_IMAGE"

    # Push image (progress output is discarded)
    docker push "$REMOTE_IMAGE" > /dev/null
    echo "Docker image pushed successfully"

    # Get and store the image SHA digest; the part after '@' in the first
    # RepoDigests entry is the digest.
    IMAGE_SHA=$(docker inspect --format='{{index .RepoDigests 0}}' "$REMOTE_IMAGE" | cut -d'@' -f2)
    if [ -z "$IMAGE_SHA" ]; then
      echo "Error: could not determine digest of pushed image"
      exit 1
    fi
    IMAGE_URI="$ECR_REGISTRY/$ECR_REPOSITORY@$IMAGE_SHA"
    echo "ECR_IMAGE_URI=$IMAGE_URI" >> "$GITHUB_ENV"
    echo "Image pushed successfully with SHA: ${IMAGE_SHA:0:12}..."

    # Clean up local Docker images and build artifacts to free disk space
    docker rmi "$LOCAL_IMAGE" "$REMOTE_IMAGE"
    echo "Local Docker images cleaned up"
    rm -rf vscode-reh-web-linux-x64
    rm -rf "$SAGEMAKER_ARTIFACT_PREFIX-build"
    echo "Local build artifacts cleaned up"
# Switch to the role stored in the TEST_SAGEMAKER_ROLE secret for the
# DataZone/SageMaker calls that follow.
- name: Configure SageMaker role AWS credentials
  uses: aws-actions/configure-aws-credentials@v4
  with:
    aws-region: ${{ env.AWS_REGION }}
    role-to-assume: ${{ secrets.TEST_SAGEMAKER_ROLE }}
# Resolve the DataZone project and its SageMaker domain, create a SageMaker
# image named <branch>-<sha>, and register the pushed ECR image as a version.
# Exports SAGEMAKER_DOMAIN_ID, PROJECT_ID and SAGEMAKER_IMAGE_NAME.
- name: Create SageMaker code editor image
  run: |
    # Fetch DataZone project ID
    PROJECT_ID=$(aws datazone list-projects --domain-identifier "$DATAZONE_DOMAIN_ID" --name "$PROJECT_NAME" --query 'items[0].id' --output text)
    if [ -z "$PROJECT_ID" ] || [ "$PROJECT_ID" = "None" ]; then
      echo "Error: DataZone project not found"
      exit 1
    fi
    echo "DataZone project found successfully"
    echo "::add-mask::$PROJECT_ID"

    # Find SageMaker domain by project ID (domain name contains project ID).
    # `| [0]` picks the first match so that several matching domains cannot
    # produce a tab-separated list; with no match the CLI prints "None".
    DOMAIN_ID=$(aws sagemaker list-domains --query "Domains[?contains(DomainName, '$PROJECT_ID')].DomainId | [0]" --output text)
    if [ -z "$DOMAIN_ID" ] || [ "$DOMAIN_ID" = "None" ]; then
      echo "Error: SageMaker domain not found for project ID"
      exit 1
    fi
    echo "SageMaker domain found successfully"
    echo "::add-mask::$DOMAIN_ID"
    echo "SAGEMAKER_DOMAIN_ID=$DOMAIN_ID" >> "$GITHUB_ENV"
    echo "PROJECT_ID=$PROJECT_ID" >> "$GITHUB_ENV"

    # Create SageMaker image
    BRANCH_NAME=$(echo "$GH_REF_NAME" | sed 's/[^a-zA-Z0-9-]/-/g')
    IMAGE_NAME="$BRANCH_NAME-${COMMIT_SHA}"
    aws sagemaker create-image \
      --image-name "$IMAGE_NAME" \
      --role-arn "$TEST_SAGEMAKER_ROLE"

    # Wait for image to be ready (max 30 retries = 5 minutes)
    echo "Waiting for SageMaker image to be ready..."
    RETRY_COUNT=0
    MAX_RETRIES=30
    while [ "$RETRY_COUNT" -lt "$MAX_RETRIES" ]; do
      STATUS=$(aws sagemaker describe-image --image-name "$IMAGE_NAME" --query 'ImageStatus' --output text)
      echo "Image status: $STATUS (attempt $((RETRY_COUNT + 1))/$MAX_RETRIES)"
      if [ "$STATUS" = "CREATED" ]; then
        break
      elif [ "$STATUS" = "CREATE_FAILED" ]; then
        echo "Image creation failed"
        exit 1
      fi
      RETRY_COUNT=$((RETRY_COUNT + 1))
      sleep 10
    done
    # The counter only reaches MAX_RETRIES when no attempt broke out early.
    if [ "$RETRY_COUNT" -eq "$MAX_RETRIES" ]; then
      echo "Timeout waiting for image to be ready"
      exit 1
    fi

    # Create image version
    aws sagemaker create-image-version \
      --image-name "$IMAGE_NAME" \
      --base-image "$ECR_IMAGE_URI"
    echo "SAGEMAKER_IMAGE_NAME=$IMAGE_NAME" >> "$GITHUB_ENV"
# Wait for the image version, make sure the shared AppImageConfig exists, and
# append the new image to the domain's Code Editor custom images.
- name: Attach image to domain
  run: |
    # Wait for image version to be ready (100 seconds)
    echo "Waiting for image version to be ready..."
    RETRY_COUNT=0
    MAX_RETRIES=10
    while [ "$RETRY_COUNT" -lt "$MAX_RETRIES" ]; do
      VERSION_STATUS=$(aws sagemaker describe-image-version --image-name "$SAGEMAKER_IMAGE_NAME" --query 'ImageVersionStatus' --output text)
      echo "Image version status: $VERSION_STATUS (attempt $((RETRY_COUNT + 1))/$MAX_RETRIES)"
      if [ "$VERSION_STATUS" = "CREATED" ]; then
        break
      elif [ "$VERSION_STATUS" = "CREATE_FAILED" ]; then
        echo "Image version creation failed"
        exit 1
      fi
      RETRY_COUNT=$((RETRY_COUNT + 1))
      sleep 10
    done
    if [ "$RETRY_COUNT" -eq "$MAX_RETRIES" ]; then
      echo "Timeout waiting for image version to be ready"
      exit 1
    fi

    # Create universal AppImageConfig (if it doesn't exist)
    APP_IMAGE_CONFIG_NAME="code-editor-app-config"
    if ! aws sagemaker describe-app-image-config --app-image-config-name "$APP_IMAGE_CONFIG_NAME" >/dev/null 2>&1; then
      aws sagemaker create-app-image-config \
        --app-image-config-name "$APP_IMAGE_CONFIG_NAME" \
        --code-editor-app-image-config '{}'
      echo "Created universal AppImageConfig: $APP_IMAGE_CONFIG_NAME"
    else
      echo "Universal AppImageConfig already exists: $APP_IMAGE_CONFIG_NAME"
    fi

    # Get existing custom images. A failure here must abort the step: falling
    # back to an empty list would make update-domain below replace the
    # domain's list with only the new image, detaching every other image.
    EXISTING_IMAGES=$(aws sagemaker describe-domain --domain-id "$SAGEMAKER_DOMAIN_ID" --query 'DefaultUserSettings.CodeEditorAppSettings.CustomImages' --output json)

    # Create new custom images array with existing + new image.
    # `. // []` treats a domain without custom images (null) as an empty list.
    NEW_IMAGES=$(echo "$EXISTING_IMAGES" | jq -c --arg imageName "$SAGEMAKER_IMAGE_NAME" --arg configName "$APP_IMAGE_CONFIG_NAME" '(. // []) + [{"ImageName": $imageName, "ImageVersionNumber": 1, "AppImageConfigName": $configName}] | unique_by(.ImageName)')

    # Update domain with all custom images
    aws sagemaker update-domain \
      --domain-id "$SAGEMAKER_DOMAIN_ID" \
      --default-user-settings "{\"CodeEditorAppSettings\": {\"CustomImages\": $NEW_IMAGES}}"
    echo "Image attached to domain successfully"
# Create a private CodeEditor space named <branch>-<sha> that uses the image
# created above, and export its name as SAGEMAKER_SPACE_NAME.
- name: Create SageMaker code editor space
  run: |
    # Create space name using branch-commit format
    BRANCH_NAME=$(echo "$GH_REF_NAME" | sed 's/[^a-zA-Z0-9-]/-/g')
    SPACE_NAME="$BRANCH_NAME-${COMMIT_SHA}"

    # Resolve the owning user profile up front (first profile in the domain)
    # so a failed or empty lookup stops the step instead of being spliced
    # into the request as an empty/"None" owner.
    OWNER_PROFILE=$(aws sagemaker list-user-profiles --domain-id "$SAGEMAKER_DOMAIN_ID" --query 'UserProfiles[0].UserProfileName' --output text)
    if [ -z "$OWNER_PROFILE" ] || [ "$OWNER_PROFILE" = "None" ]; then
      echo "Error: no user profile found in SageMaker domain to own the space"
      exit 1
    fi

    # Build the JSON arguments with jq so values are always correctly escaped.
    OWNERSHIP_SETTINGS=$(jq -n -c --arg owner "$OWNER_PROFILE" '{OwnerUserProfileName: $owner}')
    SPACE_SETTINGS=$(jq -n -c \
      --arg imageArn "arn:aws:sagemaker:$AWS_REGION:$AWS_ACCOUNT_ID:image/$SAGEMAKER_IMAGE_NAME" \
      '{
        AppType: "CodeEditor",
        RemoteAccess: "DISABLED",
        SpaceStorageSettings: {
          EbsStorageSettings: {
            EbsVolumeSizeInGb: 16
          }
        },
        CodeEditorAppSettings: {
          DefaultResourceSpec: {
            SageMakerImageArn: $imageArn,
            InstanceType: "ml.t3.medium"
          }
        }
      }')

    # Create the space with project ownership
    aws sagemaker create-space \
      --domain-id "$SAGEMAKER_DOMAIN_ID" \
      --space-name "$SPACE_NAME" \
      --ownership-settings "$OWNERSHIP_SETTINGS" \
      --space-sharing-settings '{"SharingType": "Private"}' \
      --space-settings "$SPACE_SETTINGS"
    echo "Created SageMaker space: $SPACE_NAME"
    echo "SAGEMAKER_SPACE_NAME=$SPACE_NAME" >> "$GITHUB_ENV"
# Poll until the space reports InService, then create the "default"
# CodeEditor app in it.
- name: Start SageMaker code editor space
  run: |
    # Wait for space to be ready (200 seconds)
    echo "Waiting for space to be ready..."
    MAX_RETRIES=20
    space_ready=false
    for ((attempt = 1; attempt <= MAX_RETRIES; attempt++)); do
      SPACE_STATUS=$(aws sagemaker describe-space --domain-id "$SAGEMAKER_DOMAIN_ID" --space-name "$SAGEMAKER_SPACE_NAME" --query 'Status' --output text)
      echo "Space status: $SPACE_STATUS (attempt $attempt/$MAX_RETRIES)"
      case "$SPACE_STATUS" in
        InService)
          space_ready=true
          break
          ;;
        Failed)
          echo "Space creation failed"
          exit 1
          ;;
      esac
      sleep 10
    done
    # Every attempt was used without the space becoming ready.
    if [ "$space_ready" != true ]; then
      echo "Timeout waiting for space to be ready"
      exit 1
    fi

    # Create app to start the space
    aws sagemaker create-app \
      --domain-id "$SAGEMAKER_DOMAIN_ID" \
      --space-name "$SAGEMAKER_SPACE_NAME" \
      --app-type "CodeEditor" \
      --app-name "default"
    echo "Started SageMaker space: $SAGEMAKER_SPACE_NAME"
# Download the source artifact for this commit, verify the archive is present
# and unpack it for the E2E step.
- name: Fetch source artifact
  run: |
    gh run download --name "${COMMIT_SHA}-code-editor-sagemaker-server-src"
    ls -la
    # Archives that must be present before extraction.
    required_archives=("${SAGEMAKER_ARTIFACT_PREFIX}-src.tar.gz")
    for archive in "${required_archives[@]}"; do
      # Present: nothing to report, move on to the next archive.
      [ -f "$archive" ] && continue
      echo "Error: $archive not found for commit $COMMIT_SHA"
      exit 1
    done
    tar --extract --gzip --file "${SAGEMAKER_ARTIFACT_PREFIX}-src.tar.gz"
    rm -rf $SAGEMAKER_ARTIFACT_PREFIX-src
# Wait for the CodeEditor app to come up, apply the test patch series and run
# the smoke test suite against the space through the DataZone URL.
- name: E2E testing
  env:
    SSO_USERNAME: ${{ secrets.SSO_USERNAME }}
    SSO_PASSWORD: ${{ secrets.SSO_PASSWORD }}
  run: |
    # The default `run` shell is `bash -e` without pipefail. Without this, the
    # `npm run mocha ... | tee` pipeline below always reports tee's status and
    # a crashed test runner would be reported as a pass.
    set -o pipefail

    # Waiting for app to be in service for 8 minutes
    echo "Waiting for app to be in service..."
    RETRY_COUNT=0
    MAX_RETRIES=16
    while [ "$RETRY_COUNT" -lt "$MAX_RETRIES" ]; do
      # A describe-app error is reported as "NotFound" and simply retried.
      APP_STATUS=$(aws sagemaker describe-app --domain-id "$SAGEMAKER_DOMAIN_ID" --space-name "$SAGEMAKER_SPACE_NAME" --app-type "CodeEditor" --app-name "default" --query 'Status' --output text 2>/dev/null || echo "NotFound")
      echo "App status: $APP_STATUS (check $((RETRY_COUNT + 1))/$MAX_RETRIES)"
      if [ "$APP_STATUS" = "InService" ]; then
        echo "App is running successfully"
        break
      elif [ "$APP_STATUS" = "Failed" ]; then
        echo "App failed to start"
        exit 1
      elif [ "$APP_STATUS" = "Deleted" ]; then
        echo "App was deleted"
        exit 1
      fi
      RETRY_COUNT=$((RETRY_COUNT + 1))
      # No point sleeping after the final check.
      if [ "$RETRY_COUNT" -lt "$MAX_RETRIES" ]; then
        sleep 30
      fi
    done
    if [ "$RETRY_COUNT" -eq "$MAX_RETRIES" ]; then
      echo "Timeout waiting for app to be in service"
      exit 1
    fi

    DATAZONE_URL="https://${DATAZONE_DOMAIN_ID}.sagemaker.${AWS_REGION}.on.aws/projects/${PROJECT_ID}/compute?type=spaces"

    # Apply E2E testing patches
    cd code-editor-src
    rm -rf .pc
    export QUILT_PATCHES=../patches/test
    export QUILT_SERIES=../patches/test/sagemaker-testing.series
    quilt push -a -f

    # Run smoke tests with DataZone authentication
    cd test/smoke
    npm i
    cd ..
    npm install mocha
    cd smoke
    npm run compile
    npx playwright install

    # Capture the runner's exit status instead of letting `-e` abort, so the
    # failure can be reported with a clear message.
    TEST_EXIT=0
    SSO_USERNAME="$SSO_USERNAME" SSO_PASSWORD="$SSO_PASSWORD" DATAZONE_URL="$DATAZONE_URL" SPACE_NAME="$SAGEMAKER_SPACE_NAME" VSCODE_REMOTE_SERVER_PATH="sagemaker" npm run mocha -- --web --headless 2>&1 | tee test_output.log || TEST_EXIT=$?
    if [ "$TEST_EXIT" -ne 0 ]; then
      echo "Smoke tests failed - test runner exited with status $TEST_EXIT"
      exit 1
    fi
    # Secondary check on the reporter output, kept from the original workflow.
    if grep -q "failing" test_output.log; then
      echo "Smoke tests failed - detected failing tests in output"
      exit 1
    fi
    echo "All smoke tests passed successfully"
# Best-effort teardown of everything the job created in SageMaker: app, space,
# the domain's custom-image entry, image version and image. Runs even when
# earlier steps failed; individual failures are recorded but do not fail the
# step.
- name: Clean up SageMaker resources
  if: always()
  run: |
    echo "Starting SageMaker cleanup..."
    CLEANUP_FAILED=0

    # SAGEMAKER_IMAGE_NAME is only exported after the image version was
    # created; fall back to the same <branch>-<sha> naming used at creation so
    # an image created before that point is still removed.
    BRANCH_NAME=$(echo "$GH_REF_NAME" | sed 's/[^a-zA-Z0-9-]/-/g')
    IMAGE_NAME="${SAGEMAKER_IMAGE_NAME:-$BRANCH_NAME-$COMMIT_SHA}"

    if [ -z "$SAGEMAKER_DOMAIN_ID" ]; then
      echo "No SageMaker domain was resolved; skipping app, space and domain cleanup"
    else
      if [ -z "$SAGEMAKER_SPACE_NAME" ]; then
        echo "No space was created; skipping app and space deletion"
      else
        # Delete app first
        echo "Deleting app..."
        aws sagemaker delete-app \
          --domain-id "$SAGEMAKER_DOMAIN_ID" \
          --space-name "$SAGEMAKER_SPACE_NAME" \
          --app-type "CodeEditor" \
          --app-name "default" || CLEANUP_FAILED=1

        # Wait for app to be deleted; a describe-app error is treated as
        # "Deleted" so a missing app does not stall the cleanup.
        echo "Waiting for app to be deleted..."
        RETRY_COUNT=0
        MAX_RETRIES=10
        while [ "$RETRY_COUNT" -lt "$MAX_RETRIES" ]; do
          APP_STATUS=$(aws sagemaker describe-app --domain-id "$SAGEMAKER_DOMAIN_ID" --space-name "$SAGEMAKER_SPACE_NAME" --app-type "CodeEditor" --app-name "default" --query 'Status' --output text 2>/dev/null || echo "Deleted")
          echo "App status: $APP_STATUS"
          if [ "$APP_STATUS" = "Deleted" ]; then
            break
          fi
          RETRY_COUNT=$((RETRY_COUNT + 1))
          sleep 10
        done

        # Delete space
        echo "Deleting space..."
        aws sagemaker delete-space \
          --domain-id "$SAGEMAKER_DOMAIN_ID" \
          --space-name "$SAGEMAKER_SPACE_NAME" || CLEANUP_FAILED=1
      fi

      # Remove image from domain. If the current list cannot be read, leave
      # the domain untouched: writing back an empty list would detach every
      # custom image, not just this job's.
      echo "Removing image from domain..."
      if EXISTING_IMAGES=$(aws sagemaker describe-domain --domain-id "$SAGEMAKER_DOMAIN_ID" --query 'DefaultUserSettings.CodeEditorAppSettings.CustomImages' --output json 2>/dev/null) \
        && FILTERED_IMAGES=$(echo "$EXISTING_IMAGES" | jq -c --arg imageName "$IMAGE_NAME" '(. // []) | map(select(.ImageName != $imageName))'); then
        aws sagemaker update-domain \
          --domain-id "$SAGEMAKER_DOMAIN_ID" \
          --default-user-settings "{\"CodeEditorAppSettings\": {\"CustomImages\": $FILTERED_IMAGES}}" || CLEANUP_FAILED=1
      else
        echo "Could not read domain custom images; leaving domain settings untouched"
        CLEANUP_FAILED=1
      fi
    fi

    # Delete SageMaker image version
    echo "Deleting SageMaker image version..."
    aws sagemaker delete-image-version \
      --image-name "$IMAGE_NAME" \
      --version 1 || CLEANUP_FAILED=1

    # Delete SageMaker image
    echo "Deleting SageMaker image..."
    aws sagemaker delete-image \
      --image-name "$IMAGE_NAME" || CLEANUP_FAILED=1

    if [ "$CLEANUP_FAILED" -eq 1 ]; then
      echo "SageMaker cleanup completed with some failures"
    else
      echo "SageMaker cleanup completed successfully"
    fi
# Switch back to the role stored in the TEST_ECR_ROLE secret so the pushed
# image can be deleted; runs regardless of earlier failures.
- name: Configure ECR role for cleanup
  if: always()
  uses: aws-actions/configure-aws-credentials@v4
  with:
    aws-region: ${{ env.AWS_REGION }}
    role-to-assume: ${{ secrets.TEST_ECR_ROLE }}
# Best-effort removal of the image tag this job pushed to ECR; a failed delete
# is logged and the step still succeeds.
- name: Clean up ECR resources
  if: always()
  run: |
    echo "Starting ECR cleanup..."
    # Rebuild the <branch>-<sha> tag exactly as the build step derived it.
    sanitized_ref=$(echo "$GH_REF_NAME" | sed 's/[^a-zA-Z0-9-]/-/g')
    pushed_tag="${sanitized_ref}-${COMMIT_SHA}"
    if ! aws ecr batch-delete-image --repository-name "$ECR_REPOSITORY" --image-ids imageTag="$pushed_tag"; then
      echo "ECR cleanup failed"
    fi
    echo "ECR cleanup completed"