Skip to content

Commit dfb7608

Browse files
Merge remote-tracking branch 'upstream/main' into mongo-migration-doc
2 parents 1d387b9 + 1584015 commit dfb7608

File tree

20 files changed

+1242
-41
lines changed

20 files changed

+1242
-41
lines changed
Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
name: Integration Tests - GCP

# Triggers: pushes to selected branches, a daily schedule, and manual dispatch.
on:
  push:
    branches: [main, feature/oidc-gcp]
  schedule:
    # Daily at 14:30 UTC; scheduled runs use the default branch only.
    - cron: "30 14 * * *"
  workflow_dispatch: {}  # allow manual runs for testing

# Token scopes for this workflow. id-token: write lets the job request an
# OIDC token, which the workload-identity auth step relies on.
permissions:
  id-token: write
  contents: read
  actions: read
30+
31+
jobs:
  integration-test-gcp:
    runs-on: ubuntu-latest
    timeout-minutes: 60
    env:
      # Target cloud, naming prefix and identity used by the steps below.
      CSP: 'gcp'
      PREFIX: 'nvs'
      PROJECT_ID: 'nv-dgxck8s-20250306'
      IDENTITY_PROVIDER: 'projects/1015254933832/locations/global/workloadIdentityPools/github-pool/providers/github-provider'
      SERVICE_ACCOUNT: 'github-actions-user'

      # Terraform input variables (TF_VAR_<name> is read by Terraform as <name>).
      # The deployment id embeds the run id, so each workflow run gets its own value.
      TF_VAR_deployment_id: 'd${{ github.run_id }}'
      TF_VAR_project_id: 'nv-dgxck8s-20250306'
      TF_VAR_region: 'europe-west4'
      TF_VAR_zone: 'europe-west4-b'
      TF_VAR_system_node_type: 'e2-standard-4'
      TF_VAR_system_node_count: '3'
      TF_VAR_gpu_node_pool_name: 'gpu-pool'
      TF_VAR_gpu_machine_type: 'a3-megagpu-8g'
      TF_VAR_gpu_node_count: '1'
      TF_VAR_gpu_reservation_project: 'nv-dgxcloudprodgsc-20240206'
      TF_VAR_gpu_reservation_name: 'gsc-a3-megagpu-8g-shared-res-2'
      TF_VAR_gpu_driver_version: 'INSTALLATION_DISABLED'
      TF_VAR_resource_labels: '{"environment":"test","team":"nvsentinel","managed_by":"terraform"}'

      # Debug knobs.
      # SKIP_DELETE: set to 'true' to keep the cluster after the run.
      SKIP_DELETE: 'false'
      # TEST_TAG: image tag used when the run is neither a tag nor the main branch.
      TEST_TAG: 'main-33c1d03'
58+
59+
steps:
60+
# Checkout
61+
- name: Checkout
  uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8  # v5.0.0

# Install the pinned Terraform CLI.
- name: Terraform
  uses: hashicorp/setup-terraform@b9cd54a3c349d3f38e8881555d616ced269862dd  # v3.1.2
  with:
    terraform_version: '1.13.5'

# Authenticate to Google Cloud through the workload identity provider and
# request an access token for the service account.
- name: Get AuthN Token
  id: auth
  uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093  # v3
  with:
    workload_identity_provider: ${{ env.IDENTITY_PROVIDER }}
    service_account: '${{ env.SERVICE_ACCOUNT }}@${{ env.PROJECT_ID }}.iam.gserviceaccount.com'
    token_format: access_token

# Install the gcloud CLI used by the cluster connection step.
- name: Setup gcloud CLI
  uses: google-github-actions/setup-gcloud@aa5489c8933f4cc7a4f7d45035b3b1440c9c10db  # v3.0.1
82+
83+
# Cluster
84+
- name: Create Cluster
  id: cluster
  # A failure here does not fail the job; later steps gate on this step's
  # `outcome` instead.
  continue-on-error: true
  shell: bash
  working-directory: tests/uat/gcp/cluster
  run: |
    set -euo pipefail
    terraform init
    terraform apply -auto-approve
93+
94+
# Connect
95+
- name: Connect to Cluster
  id: client
  # Only attempt to connect when the cluster was actually created.
  if: steps.cluster.outcome == 'success'
  shell: bash
  run: |
    set -euo pipefail

    # PREFIX and the TF_VAR_* values come from the job-level env block.
    cluster_name="${PREFIX}-${TF_VAR_deployment_id}"

    echo "Installing GKE auth plugin..."
    gcloud components install gke-gcloud-auth-plugin --quiet --project "${TF_VAR_project_id}"

    echo "Getting cluster credentials..."
    gcloud container clusters get-credentials "${cluster_name}" \
      --zone "${TF_VAR_zone}" \
      --project "${TF_VAR_project_id}"
106+
107+
# Image Tag
108+
- name: Compute ref name with short SHA
  id: ref-name
  shell: bash
  env:
    # GitHub context values are passed through the environment rather than
    # interpolated into the script text, so a crafted ref name is treated as
    # data and can never be executed as shell code.
    REF_TYPE: ${{ github.ref_type }}
    REF_NAME: ${{ github.ref_name }}
    COMMIT_SHA: ${{ github.sha }}
    FALLBACK_TAG: ${{ env.TEST_TAG }}
  run: |
    set -euo pipefail

    # Pick the image tag:
    #   tag build   -> the tag name
    #   main branch -> "main-<7-char sha>"
    #   otherwise   -> the fixed TEST_TAG from the job env
    if [[ "${REF_TYPE}" == "tag" ]]; then
      SAFE_REF="${REF_NAME}"
    elif [[ "${REF_NAME}" == "main" ]]; then
      SAFE_REF="${REF_NAME}-${COMMIT_SHA:0:7}"
    else
      SAFE_REF="${FALLBACK_TAG}"
    fi

    # Sanitize ref name: replace slashes with hyphens for Docker tag compatibility
    SAFE_REF="${SAFE_REF//\//-}"

    # Exposed to later steps as steps.ref-name.outputs.value
    echo "value=${SAFE_REF}" >> "${GITHUB_OUTPUT}"
122+
123+
# Apps
124+
- name: Install NVS
  id: apps
  # Requires a working cluster connection.
  if: steps.client.outcome == 'success'
  shell: bash
  env:
    # Image tag computed by the ref-name step.
    NVSENTINEL_VERSION: '${{ steps.ref-name.outputs.value }}'
    # GCP settings forwarded from the job-level env block.
    GCP_PROJECT_ID: '${{ env.PROJECT_ID }}'
    GCP_ZONE: '${{ env.TF_VAR_zone }}'
    GCP_SERVICE_ACCOUNT: '${{ env.SERVICE_ACCOUNT }}'
  run: |
    set -euxo pipefail
    tests/uat/install-apps.sh
136+
137+
# Test
138+
- name: Run UAT Tests
  id: tests
  # Run only after the install step succeeded.
  if: steps.apps.outcome == 'success'
  shell: bash
  run: |
    set -euxo pipefail
    tests/uat/tests.sh
145+
146+
# Teardown
147+
- name: Destroy Cluster
  # Runs even when earlier steps failed, unless cluster creation was skipped
  # or SKIP_DELETE is set to 'true'.
  if: always() && steps.cluster.outcome != 'skipped' && env.SKIP_DELETE != 'true'
  shell: bash
  working-directory: tests/uat/gcp/cluster
  run: |
    set -euxo pipefail
    terraform destroy -auto-approve
154+
155+
# Summary
156+
- name: Test Summary
  # Always publish the per-step outcomes to the run summary.
  if: always()
  run: |
    {
      echo "## Test Results"
      echo "- Cluster: ${{ steps.cluster.outcome }}"
      echo "- Connection: ${{ steps.client.outcome }}"
      echo "- Apps: ${{ steps.apps.outcome }}"
      echo "- Tests: ${{ steps.tests.outcome }}"
    } >> "$GITHUB_STEP_SUMMARY"

distros/kubernetes/nvsentinel/charts/kubernetes-object-monitor/templates/serviceaccount.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,4 @@ kind: ServiceAccount
1717
metadata:
1818
name: {{ include "kubernetes-object-monitor.fullname" . }}
1919
labels:
20-
{{- include "kubernetes-object-monitor.labels" . | nindent 4 }}
21-
20+
{{- include "kubernetes-object-monitor.labels" . | nindent 4 }}

distros/kubernetes/nvsentinel/charts/metadata-collector/templates/daemonset.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,4 +92,4 @@ spec:
9292
tolerations:
9393
{{- toYaml . | nindent 8 }}
9494
{{- end }}
95-
95+
runtimeClassName: nvidia

tests/uat/gcp/.gitkeep

Whitespace-only changes.
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Helm value overrides for test environments.
# NOTE(review): nesting below is reconstructed from the key names (they match
# the cert-manager chart layout) — confirm against the chart before relying on it.
installCRDs: true

# Shorter leader-election timings for faster startup in Kind/test environments.
global:
  leaderElection:
    leaseDuration: 30s
    renewDeadline: 20s
    retryPeriod: 5s

# Reduced resource requests for Kind (local testing).
resources:
  requests: {cpu: 10m, memory: 32Mi}

webhook:
  # Reduced webhook resource requests.
  resources:
    requests: {cpu: 10m, memory: 32Mi}
  # Faster readiness checks.
  readinessProbe:
    initialDelaySeconds: 3
    periodSeconds: 3

cainjector:
  # Reduced cainjector resource requests.
  resources:
    requests: {cpu: 10m, memory: 32Mi}

startupapicheck:
  # Reduced startup-check resource requests.
  resources:
    requests: {cpu: 10m, memory: 32Mi}

tests/uat/gcp/cluster/README.md

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
# GKE Cluster Terraform Configuration
2+
3+
This Terraform configuration creates a GKE cluster with GPU nodes for NVSentinel testing.
4+
5+
- Single zone (zonal cluster)
6+
- GPU nodes use specific reservation affinity
7+
- Service account `gke-cluster-kubernetes@PROJECT_ID.iam.gserviceaccount.com` must exist
8+
9+
## Prerequisites
10+
11+
- [Terraform](https://www.terraform.io/downloads.html) `>= 1.9.5`
12+
- [gcloud CLI](https://cloud.google.com/sdk/docs/install) configured with appropriate credentials
13+
- GCP project with necessary APIs enabled:
14+
- Kubernetes Engine API
15+
- Compute Engine API
16+
17+
## Known Issues
18+
19+
⚠️ **Reservation Maintenance Interval Mismatch - RESOLVED**
20+
21+
**Previous Issue:** gSC (Google Supercomputer) reservations require instances to have `maintenanceInterval=PERIODIC`, but GKE created instances with `maintenanceInterval=MAINTENANCE_INTERVAL_UNSPECIFIED` by default.
22+
23+
**Solution:** The configuration now includes a `host_maintenance_policy` block in the GPU node pool with `maintenance_interval = "PERIODIC"`, which resolves this issue.
24+
25+
```hcl
26+
host_maintenance_policy {
27+
maintenance_interval = "PERIODIC"
28+
}
29+
```
30+
31+
## Usage
32+
33+
1. **Initialize Terraform:**
34+
```bash
35+
terraform init
36+
```
37+
38+
2. **Configure variables (optional):**
39+
```bash
40+
cp terraform.tfvars.example terraform.tfvars
41+
# Edit terraform.tfvars with your values
42+
```
43+
44+
3. **Preview changes:**
45+
```bash
46+
terraform plan
47+
```
48+
49+
4. **Create the cluster:**
50+
```bash
51+
terraform apply
52+
```
53+
54+
5. **Get kubeconfig:**
55+
```bash
56+
gcloud container clusters get-credentials nvs-d2 --zone europe-west4-b --project nv-dgxck8s-20250306
57+
```
58+
59+
Or use the output command:
60+
```bash
61+
terraform output -raw kubeconfig_command | bash
62+
```
63+
64+
6. **Destroy the cluster:**
65+
```bash
66+
terraform destroy
67+
```
68+
69+
## Configuration Variables
70+
71+
| Variable | Description | Default |
72+
|----------|-------------|---------|
73+
| `deployment_id` | Deployment identifier for cluster naming | `d2` |
74+
| `project_id` | GCP project ID | `nv-dgxck8s-20250306` |
75+
| `zone` | GCP zone for the cluster | `europe-west4-b` |
76+
| `system_node_type` | Machine type for system nodes | `e2-standard-4` |
77+
| `system_node_count` | Number of system nodes | `3` |
78+
| `gpu_node_pool_name` | Name of the GPU node pool | `gpu-pool` |
79+
| `gpu_machine_type` | Machine type for GPU nodes | `a3-megagpu-8g` |
80+
| `gpu_node_count` | Number of GPU nodes | `1` |
81+
| `gpu_reservation_project` | Project containing GPU reservation | `nv-dgxcloudprodgsc-20240206` |
82+
| `gpu_reservation_name` | Name of GPU reservation | `gsc-a3-megagpu-8g-shared-res-2` |
83+
| `gpu_driver_version` | GPU driver installation mode | `INSTALLATION_DISABLED` |
84+
| `resource_labels` | Labels to apply to resources | `{}` |
85+
86+
87+
## Outputs
88+
89+
- `cluster_name`: Name of the created cluster
90+
- `cluster_location`: Zone where cluster is deployed
91+
- `cluster_endpoint`: API endpoint (sensitive)
92+
- `cluster_ca_certificate`: CA certificate (sensitive)
93+
- `gpu_node_pool_name`: Name of GPU node pool
94+
- `kubeconfig_command`: Command to configure kubectl

0 commit comments

Comments
 (0)