Commit 14a7880

Merge pull request #432 from ArangoGutierrez/get_ip
Fix containerd and CNI install
2 parents bb880fd + 35da471 · commit 14a7880

File tree: 9 files changed (+235, −70 lines)

cmd/cli/create/create.go

Lines changed: 61 additions & 1 deletion

@@ -17,8 +17,10 @@
 package create
 
 import (
+	"bufio"
 	"fmt"
 	"os"
+	"strings"
 
 	"github.com/NVIDIA/holodeck/api/holodeck/v1alpha1"
 	"github.com/NVIDIA/holodeck/internal/instances"
@@ -167,14 +169,72 @@ func (m command) run(c *cli.Context, opts *options) error {
 	if opts.provision {
 		err := runProvision(m.log, opts)
 		if err != nil {
-			return fmt.Errorf("failed to provision: %v", err)
+			// Handle provisioning failure with user interaction
+			return m.handleProvisionFailure(instanceID, opts.cachePath, err)
 		}
 	}
 
 	m.log.Info("\nCreated instance %s", instanceID)
 	return nil
 }
 
+func (m *command) handleProvisionFailure(instanceID, cachePath string, provisionErr error) error {
+	m.log.Info("\n❌ Provisioning failed: %v\n", provisionErr)
+
+	// Check if we're in a non-interactive environment
+	if os.Getenv("CI") == "true" || os.Getenv("HOLODECK_NONINTERACTIVE") == "true" {
+		m.log.Info("\n💡 To clean up the failed instance, run:")
+		m.log.Info("   holodeck delete %s\n", instanceID)
+		m.log.Info("💡 To list all instances:")
+		m.log.Info("   holodeck list\n")
+		return fmt.Errorf("provisioning failed: %w", provisionErr)
+	}
+
+	// Ask user if they want to delete the failed instance
+	reader := bufio.NewReader(os.Stdin)
+	m.log.Info("\n❓ Would you like to delete the failed instance? (y/N): ")
+
+	response, err := reader.ReadString('\n')
+	if err != nil {
+		m.log.Info("Failed to read user input: %v", err)
+		return m.provideCleanupInstructions(instanceID, provisionErr)
+	}
+
+	response = strings.TrimSpace(strings.ToLower(response))
+
+	if response == "y" || response == "yes" {
+		// Delete the instance
+		manager := instances.NewManager(m.log, cachePath)
+		if err := manager.DeleteInstance(instanceID); err != nil {
+			m.log.Info("Failed to delete instance: %v", err)
+			return m.provideCleanupInstructions(instanceID, provisionErr)
+		}
+
+		m.log.Info("✅ Successfully deleted failed instance %s\n", instanceID)
+		return fmt.Errorf("provisioning failed and instance was deleted: %w", provisionErr)
+	}
+
+	return m.provideCleanupInstructions(instanceID, provisionErr)
+}
+
+func (m *command) provideCleanupInstructions(instanceID string, provisionErr error) error {
+	m.log.Info("\n💡 The instance was created but provisioning failed.")
+	m.log.Info("   You can manually investigate or clean up using the following commands:\n")
+	m.log.Info("   To delete this specific instance:")
+	m.log.Info("     holodeck delete %s\n", instanceID)
+	m.log.Info("   To list all instances:")
+	m.log.Info("     holodeck list\n")
+	m.log.Info("   To see instance details:")
+	m.log.Info("     holodeck status %s\n", instanceID)
+
+	m.log.Info("\n💡 Additional debugging tips:")
+	m.log.Info("   - Review the provisioning logs above for specific errors")
+	m.log.Info("   - Check cloud provider console for instance status")
+	m.log.Info("   - SSH into the instance to investigate further")
+
+	return fmt.Errorf("provisioning failed: %w", provisionErr)
+}
+
 func runProvision(log *logger.FunLogger, opts *options) error {
 	var hostUrl string

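Note: the new failure path only prompts when it can. Below is a minimal standalone sketch of the non-interactive gate shown above, assuming a hypothetical `isNonInteractive` helper; the merged code performs the same environment-variable check inline in `handleProvisionFailure`.

```go
// Standalone sketch of the non-interactive gate used by the new failure path.
// The isNonInteractive helper is hypothetical; the merged code checks the same
// two environment variables inline.
package main

import (
	"fmt"
	"os"
)

// isNonInteractive reports whether the CLI should skip the stdin prompt:
// either running under CI or explicitly opted out via HOLODECK_NONINTERACTIVE.
func isNonInteractive() bool {
	return os.Getenv("CI") == "true" || os.Getenv("HOLODECK_NONINTERACTIVE") == "true"
}

func main() {
	if isNonInteractive() {
		// In CI, the command prints cleanup hints and returns the wrapped error.
		fmt.Println("non-interactive: print `holodeck delete <instance-id>` hint and return the error")
		return
	}
	// Otherwise the command prompts on stdin before deciding whether to delete.
	fmt.Println("interactive: prompt 'Would you like to delete the failed instance? (y/N)'")
}
```

Setting `HOLODECK_NONINTERACTIVE=true` (or running under `CI=true`) therefore makes a failed `holodeck create` exit with cleanup instructions instead of blocking on stdin.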
cmd/cli/main.go

Lines changed: 1 addition & 1 deletion

@@ -66,7 +66,7 @@ Examples:
   holodeck status <instance-id>
 
   # Delete an environment
-  holodeck delete -i <instance-id>
+  holodeck delete <instance-id>
 
   # Use a custom cache directory
   holodeck --cachepath /path/to/cache create -f env.yaml`

pkg/provisioner/templates/container-toolkit.go

Lines changed: 12 additions & 0 deletions

@@ -39,6 +39,18 @@ install_packages_with_retry nvidia-container-toolkit nvidia-container-toolkit-ba
 
 # Configure container runtime
 sudo nvidia-ctk runtime configure --runtime={{.ContainerRuntime}} --set-as-default --enable-cdi={{.EnableCDI}}
+
+# Verify CNI configuration is preserved after nvidia-ctk
+echo "Verifying CNI configuration after nvidia-ctk..."
+if [ "{{.ContainerRuntime}}" = "containerd" ]; then
+  if ! sudo grep -q 'bin_dir = "/opt/cni/bin:/usr/libexec/cni"' /etc/containerd/config.toml; then
+    echo "WARNING: CNI bin_dir configuration may have been modified by nvidia-ctk"
+    echo "Restoring CNI paths..."
+    # This is a safeguard, but nvidia-ctk should preserve existing CNI config
+    sudo sed -i '/\[plugins."io.containerd.grpc.v1.cri".cni\]/,/\[/{s|bin_dir = .*|bin_dir = "/opt/cni/bin:/usr/libexec/cni"|g}' /etc/containerd/config.toml
+  fi
+fi
+
 sudo systemctl restart {{.ContainerRuntime}}
 `

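Note: the script above lives inside a Go `text/template`, so `{{.ContainerRuntime}}` and `{{.EnableCDI}}` are substituted before the provisioner ships it to the host. A standalone sketch of that rendering step follows; the `toolkitArgs` struct is hypothetical and only mirrors the two fields the template references.

```go
// Standalone sketch of how the {{.ContainerRuntime}} / {{.EnableCDI}} placeholders
// in the script above are expanded at render time.
package main

import (
	"os"
	"text/template"
)

// toolkitArgs is a hypothetical stand-in for the real template data type.
type toolkitArgs struct {
	ContainerRuntime string
	EnableCDI        bool
}

const snippet = `sudo nvidia-ctk runtime configure --runtime={{.ContainerRuntime}} --set-as-default --enable-cdi={{.EnableCDI}}
if [ "{{.ContainerRuntime}}" = "containerd" ]; then
  echo "would verify CNI bin_dir in /etc/containerd/config.toml here"
fi
`

func main() {
	tmpl := template.Must(template.New("toolkit").Parse(snippet))
	// Rendering with containerd activates the post-nvidia-ctk CNI check added in this commit.
	_ = tmpl.Execute(os.Stdout, toolkitArgs{ContainerRuntime: "containerd", EnableCDI: true})
}
```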
pkg/provisioner/templates/container-toolkit_test.go

Lines changed: 15 additions & 0 deletions

@@ -1,3 +1,18 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package templates
 
 import (

pkg/provisioner/templates/containerd.go

Lines changed: 64 additions & 43 deletions

@@ -199,26 +199,19 @@ sudo tar Cxzvf /opt/cni/bin ${CNI_TAR}
 # Configure containerd
 sudo mkdir -p /etc/containerd
 
-# Generate base configuration
-sudo containerd config default | sudo tee /etc/containerd/config.toml > /dev/null
-
-# Configure based on version
-if [ "$MAJOR_VERSION" = "2" ]; then
-  # Containerd 2.x configuration
-  cat <<EOF | sudo tee /etc/containerd/config.toml > /dev/null
+# Create unified configuration that works for both 1.x and 2.x
+# Start with a minimal config and add only what's needed
+cat <<'EOF' | sudo tee /etc/containerd/config.toml > /dev/null
+# /etc/containerd/config.toml (managed by Holodeck)
 version = 2
-root = "/var/lib/containerd"
-state = "/run/containerd"
-
-[grpc]
-  address = "/run/containerd/containerd.sock"
-  uid = 0
-  gid = 0
 
 [plugins]
   [plugins."io.containerd.grpc.v1.cri"]
     sandbox_image = "registry.k8s.io/pause:3.9"
-    systemd_cgroup = true
+    [plugins."io.containerd.grpc.v1.cri".cni]
+      # Include both locations to survive distro variance
+      bin_dir = "/opt/cni/bin:/usr/libexec/cni"
+      conf_dir = "/etc/cni/net.d"
     [plugins."io.containerd.grpc.v1.cri".containerd]
       [plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
         [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
@@ -229,35 +222,18 @@ state = "/run/containerd"
     [plugins."io.containerd.grpc.v1.cri".registry.mirrors]
       [plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]
         endpoint = ["https://registry-1.docker.io"]
-EOF
-else
-  # Containerd 1.x configuration
-  cat <<EOF | sudo tee /etc/containerd/config.toml > /dev/null
-version = 1
-root = "/var/lib/containerd"
-state = "/run/containerd"
 
 [grpc]
   address = "/run/containerd/containerd.sock"
-  uid = 0
-  gid = 0
-
-[plugins]
-  [plugins."io.containerd.grpc.v1.cri"]
-    sandbox_image = "registry.k8s.io/pause:3.9"
-    systemd_cgroup = true
-    [plugins."io.containerd.grpc.v1.cri".containerd]
-      [plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
-        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
-          runtime_type = "io.containerd.runtime.v1.linux"
-          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
-            SystemdCgroup = true
-    [plugins."io.containerd.grpc.v1.cri".registry]
-      [plugins."io.containerd.grpc.v1.cri".registry.mirrors]
-        [plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]
-          endpoint = ["https://registry-1.docker.io"]
 EOF
-fi
+
+# Ensure CNI directories exist
+sudo mkdir -p /etc/cni/net.d
+sudo mkdir -p /opt/cni/bin
+
+# Ensure containerd directories exist
+sudo mkdir -p /var/lib/containerd
+sudo mkdir -p /run/containerd
 
 # Set up systemd service for containerd
 sudo curl -fsSL "https://raw.githubusercontent.com/containerd/containerd/main/containerd.service" -o /etc/systemd/system/containerd.service
@@ -284,9 +260,22 @@ ExecStartPre=/bin/mkdir -p /run/containerd
 ExecStartPre=/bin/chmod 711 /run/containerd
 EOF
 
+# Ensure containerd is not running with stale config
+sudo systemctl stop containerd || true
+
 # Reload systemd and start containerd
 sudo systemctl daemon-reload
-sudo systemctl enable --now containerd
+echo "Starting containerd service..."
+if ! sudo systemctl enable --now containerd; then
+  echo "ERROR: Failed to start containerd service"
+  echo "Checking service status..."
+  sudo systemctl status containerd || true
+  echo "Checking journal logs..."
+  sudo journalctl -xeu containerd -n 50 || true
+  echo "Checking config file syntax..."
+  sudo containerd config dump || true
+  exit 1
+fi
 
 # Wait for containerd to be ready
@@ -307,11 +296,43 @@ containerd --version
 runc --version
 sudo ctr version
 
+# Verify CNI configuration
+echo "Verifying containerd CNI configuration..."
+if ! sudo grep -q 'bin_dir = "/opt/cni/bin:/usr/libexec/cni"' /etc/containerd/config.toml; then
+  echo "ERROR: CNI bin_dir not properly configured in containerd"
+  exit 1
+fi
+
+if ! sudo grep -q 'conf_dir = "/etc/cni/net.d"' /etc/containerd/config.toml; then
+  echo "ERROR: CNI conf_dir not properly configured in containerd"
+  exit 1
+fi
+
+if ! sudo grep -q 'SystemdCgroup = true' /etc/containerd/config.toml; then
+  echo "ERROR: SystemdCgroup not enabled in containerd config"
+  exit 1
+fi
+
+# Verify with crictl
+if command -v crictl &> /dev/null; then
+  echo "Checking CRI configuration..."
+  sudo crictl info | grep -E "cni|Cni" || true
+fi
+
+# Note about nvidia-container-toolkit compatibility
+echo ""
+echo "Note: This containerd configuration is designed to be compatible with nvidia-container-toolkit."
+echo "When nvidia-ctk runtime configure is run later, it will:"
+echo "  - Add nvidia runtime configuration"
+echo "  - Preserve our CNI settings (bin_dir and conf_dir)"
+echo "  - May change default_runtime_name to 'nvidia'"
+echo "This is expected and will not affect CNI functionality."
+
 # Test containerd functionality
 sudo ctr images pull docker.io/library/hello-world:latest
 sudo ctr run --rm docker.io/library/hello-world:latest test
 
-# Containerd installation completed successfully!
+echo "Containerd installation and CNI configuration completed successfully!"
 `
 
 type Containerd struct {
@@ -322,7 +343,7 @@ func NewContainerd(env v1alpha1.Environment) *Containerd {
 	var version string
 
 	if env.Spec.ContainerRuntime.Version == "" {
-		version = "1.7.26"
+		version = "1.7.28"
 	} else {
 		// remove the 'v' prefix from the version if it exists
 		version = strings.TrimPrefix(env.Spec.ContainerRuntime.Version, "v")

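Note: the last hunk bumps the default containerd version to 1.7.28 while still honoring an explicit `v`-prefixed version from the environment spec. A standalone sketch of that selection logic follows; `resolveContainerdVersion` is a hypothetical helper used only for illustration of what `NewContainerd` does.

```go
// Standalone sketch of the version selection performed in NewContainerd:
// default to 1.7.28 when no version is set, otherwise strip a leading "v".
package main

import (
	"fmt"
	"strings"
)

// resolveContainerdVersion is hypothetical; it mirrors the branch in NewContainerd.
func resolveContainerdVersion(requested string) string {
	if requested == "" {
		return "1.7.28" // new default pinned by this commit (was 1.7.26)
	}
	return strings.TrimPrefix(requested, "v")
}

func main() {
	fmt.Println(resolveContainerdVersion(""))       // 1.7.28
	fmt.Println(resolveContainerdVersion("v2.0.0")) // 2.0.0
	fmt.Println(resolveContainerdVersion("1.7.28")) // 1.7.28
}
```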