Skip to content

Commit e5cf668

Browse files
committed
Add MIG Reconfigure API for vgpu-device-manager
This change adds a mig/reconfigure package that implements the functionality added to the vgpu-device-manager. Signed-off-by: Evan Lezar <[email protected]>
1 parent 59303d4 commit e5cf668

File tree

5 files changed

+713
-0
lines changed

5 files changed

+713
-0
lines changed

pkg/mig/reconfigure/api.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
package reconfigure
2+
3+
// Label keys defined by this package.
// NOTE(review): presumably these are the values callers pass to
// WithConfigStateLabel, i.e. the node labels tracking MIG and vGPU
// configuration state — confirm against the callers.
const (
4+
MIGConfigStateLabel = "nvidia.com/mig.config.state"
5+
VGPUConfigStateLabel = "nvidia.com/vgpu.config.state"
6+
)
7+
8+
// A Reconfigurer applies the specified config.
//
// The parameters of Reconfigure are unnamed here; the implementation in this
// package names them migPartedConfigFile and selectedMIGConfig, in that order.
9+
type Reconfigurer interface {
10+
Reconfigure(string, string) error
11+
}

pkg/mig/reconfigure/find.go

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
/*
2+
# Copyright 2024 NVIDIA CORPORATION
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
*/
16+
17+
// TODO: This should be pulled in from nvidia-container-toolkit or go-nvlib.
18+
19+
package reconfigure
20+
21+
import (
22+
"fmt"
23+
"path/filepath"
24+
)
25+
26+
// root is a filesystem path that serves as the base directory when locating
// driver files; its methods join it with the candidate search directories.
type root string
27+
28+
// getDriverLibraryPath returns path to `libnvidia-ml.so.1` in the driver root.
29+
// The folder for this file is also expected to be the location of other driver files.
30+
func (r root) getDriverLibraryPath() (string, error) {
31+
librarySearchPaths := []string{
32+
"/usr/lib64",
33+
"/usr/lib/x86_64-linux-gnu",
34+
"/usr/lib/aarch64-linux-gnu",
35+
"/lib64",
36+
"/lib/x86_64-linux-gnu",
37+
"/lib/aarch64-linux-gnu",
38+
}
39+
40+
libraryPath, err := r.findFile("libnvidia-ml.so.1", librarySearchPaths...)
41+
if err != nil {
42+
return "", err
43+
}
44+
45+
return libraryPath, nil
46+
}
47+
48+
// getNvidiaSMIPath returns path to the `nvidia-smi` executable in the driver root.
49+
func (r root) getNvidiaSMIPath() (string, error) {
50+
binarySearchPaths := []string{
51+
"/usr/bin",
52+
"/usr/sbin",
53+
"/bin",
54+
"/sbin",
55+
}
56+
57+
binaryPath, err := r.findFile("nvidia-smi", binarySearchPaths...)
58+
if err != nil {
59+
return "", err
60+
}
61+
62+
return binaryPath, nil
63+
}
64+
65+
// findFile searches the root for a specified file.
66+
// A number of folders can be specified to search in addition to the root itself.
67+
// If the file represents a symlink, this is resolved and the final path is returned.
68+
func (r root) findFile(name string, searchIn ...string) (string, error) {
69+
70+
for _, d := range append([]string{"/"}, searchIn...) {
71+
l := filepath.Join(string(r), d, name)
72+
candidate, err := resolveLink(l)
73+
if err != nil {
74+
continue
75+
}
76+
return candidate, nil
77+
}
78+
79+
return "", fmt.Errorf("error locating %q", name)
80+
}
81+
82+
// resolveLink returns the path l with all symlinks evaluated, or l's own
// (cleaned) path when it names a regular file.
// The original author describes this as equivalent to `readlink -f ${l}`.
// An error wrapping the underlying cause is returned when l cannot be resolved.
func resolveLink(l string) (string, error) {
	target, err := filepath.EvalSymlinks(l)
	if err == nil {
		return target, nil
	}
	return "", fmt.Errorf("error resolving link '%s': %w", l, err)
}

pkg/mig/reconfigure/options.go

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
package reconfigure
2+
3+
import (
4+
"k8s.io/client-go/kubernetes"
5+
)
6+
7+
// An Option represents a functional option passed to the constructor (New).
// Each Option mutates the options value that New is building.
8+
type Option func(*options)
9+
10+
// reconfigureMIGOptions contains configuration options for reconfiguring MIG
11+
// settings on a Kubernetes node. This struct is used to manage the various
12+
// parameters required for applying MIG configurations through mig-parted, including node identification, configuration files, reboot behavior, and host
13+
// system service management.
14+
type reconfigureMIGOptions struct {
15+
// NodeName is the kubernetes node to change the MIG configuration on.
16+
// Its validation follows the RFC 1123 standard for DNS subdomain names.
17+
// Source: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#dns-subdomain-names
18+
// NOTE: the field itself is commented out below; the node name is carried by
// manager.nodeName instead.
// NodeName string `validate:"required,hostname_rfc1123"`
19+
20+
// MIGPartedConfigFile is the mig-parted configuration file path.
21+
// Deprecated: Pass the config file as an argument.
22+
MIGPartedConfigFile string `validate:"required,filepath"`
23+
24+
// SelectedMIGConfig is the selected mig-parted configuration to apply to the
25+
// node.
26+
// Deprecated: Pass the selected config as an argument.
27+
SelectedMIGConfig string
28+
29+
// DriverLibraryPath is the path to libnvidia-ml.so.1 in the container.
30+
DriverLibraryPath string `validate:"required,filepath"`
31+
32+
// WithReboot reboots the node if changing the MIG mode fails for any reason.
// It is set through the WithAllowReboot option.
33+
WithReboot bool
34+
35+
// WithShutdownHostGPUClients shuts down/restarts any required host GPU clients
36+
// across a MIG configuration.
37+
WithShutdownHostGPUClients bool
38+
39+
// HostRootMount is the container path where host root directory is mounted.
40+
HostRootMount string `validate:"dirpath"`
41+
42+
// HostMIGManagerStateFile is the path where the systemd mig-manager state
43+
// file is located.
44+
HostMIGManagerStateFile string `validate:"filepath"`
45+
46+
// HostGPUClientServices is the list of host systemd services to
47+
// shutdown/restart across a MIG reconfiguration.
48+
HostGPUClientServices []string `validate:"dive,systemd_service_name"`
49+
50+
// HostKubeletService is the name of the host's 'kubelet' systemd service
51+
// which may need to be shutdown/restarted across a MIG mode reconfiguration.
52+
HostKubeletService string `validate:"systemd_service_name"`
53+
54+
// configStateLabel is set through WithConfigStateLabel; New rejects an
// empty value.
// NOTE(review): presumably the node label key used to report configuration
// state — confirm against the code that reads it.
configStateLabel string
55+
}
56+
57+
// manager bundles the Kubernetes clientset with the name of the node whose
// labels are read and updated by its methods.
type manager struct {
58+
// clientset is the Kubernetes API client; New requires it to be non-nil.
clientset *kubernetes.Clientset
59+
// nodeName is the name of the node object to fetch; New requires it to be non-empty.
nodeName string
60+
}
61+
62+
// options is the configuration assembled by New from the supplied Option
// values. New returns it as the Reconfigurer implementation.
type options struct {
63+
// manager provides the clientset and node name (promoted fields).
manager
64+
65+
// driverRoot is set through WithDriverRoot.
driverRoot root
66+
67+
// reconfigureMIGOptions holds the reconfiguration settings (promoted fields).
reconfigureMIGOptions
68+
}
69+
70+
func WithClientset(clientset *kubernetes.Clientset) Option {
71+
return func(o *options) {
72+
o.clientset = clientset
73+
}
74+
}
75+
76+
func WithNodeName(nodeName string) Option {
77+
return func(o *options) {
78+
o.nodeName = nodeName
79+
}
80+
}
81+
82+
func WithDriverRoot[T string | root](driverRoot T) Option {
83+
return func(o *options) {
84+
o.driverRoot = root(driverRoot)
85+
}
86+
}
87+
88+
func WithDriverLibraryPath(driverLibraryPath string) Option {
89+
return func(o *options) {
90+
o.DriverLibraryPath = driverLibraryPath
91+
}
92+
}
93+
94+
func WithShutdownHostGPUClients(shutdownHostGPUClients bool) Option {
95+
return func(o *options) {
96+
o.WithShutdownHostGPUClients = shutdownHostGPUClients
97+
}
98+
}
99+
100+
func WithHostGPUClientServices(hostGPUClientServices ...string) Option {
101+
return func(o *options) {
102+
o.HostGPUClientServices = append([]string{}, hostGPUClientServices...)
103+
}
104+
}
105+
106+
func WithHostKubeletService(hostKubeletService string) Option {
107+
return func(o *options) {
108+
o.HostKubeletService = hostKubeletService
109+
}
110+
}
111+
112+
func WithHostMIGManagerStateFile(hostMIGManagerStateFile string) Option {
113+
return func(o *options) {
114+
o.HostMIGManagerStateFile = hostMIGManagerStateFile
115+
}
116+
}
117+
118+
func WithHostRootMount(hostRootMount string) Option {
119+
return func(o *options) {
120+
o.HostRootMount = hostRootMount
121+
}
122+
}
123+
124+
func WithAllowReboot(allowReboot bool) Option {
125+
return func(o *options) {
126+
o.WithReboot = allowReboot
127+
}
128+
}
129+
130+
func WithConfigStateLabel(configStateLabel string) Option {
131+
return func(o *options) {
132+
o.configStateLabel = configStateLabel
133+
}
134+
}

pkg/mig/reconfigure/reconfigure.go

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
package reconfigure
2+
3+
import (
4+
"context"
5+
"fmt"
6+
7+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
8+
)
9+
10+
func New(opts ...Option) (Reconfigurer, error) {
11+
o := &options{}
12+
13+
for _, opt := range opts {
14+
opt(o)
15+
}
16+
17+
if o.clientset == nil {
18+
return nil, fmt.Errorf("a k8s clientset is required")
19+
}
20+
if o.nodeName == "" {
21+
return nil, fmt.Errorf("a node name is required")
22+
}
23+
if o.configStateLabel == "" {
24+
return nil, fmt.Errorf("a config state label must be specified")
25+
}
26+
27+
// TODO: Add validation.
28+
29+
return o, nil
30+
}
31+
32+
func (o *options) Reconfigure(migPartedConfigFile string, selectedMIGConfig string) error {
33+
// TODO: These should be passed as arguments.
34+
o.reconfigureMIGOptions.MIGPartedConfigFile = migPartedConfigFile
35+
o.reconfigureMIGOptions.SelectedMIGConfig = selectedMIGConfig
36+
37+
return o.reconfigureMIG(&o.reconfigureMIGOptions)
38+
}
39+
40+
func (m *manager) getNodeLabelValue(label string) (string, error) {
41+
node, err := m.clientset.CoreV1().Nodes().Get(context.TODO(), m.nodeName, metav1.GetOptions{})
42+
if err != nil {
43+
return "", fmt.Errorf("unable to get node object: %v", err)
44+
}
45+
46+
value, ok := node.Labels[label]
47+
if !ok {
48+
return "", nil
49+
}
50+
51+
return value, nil
52+
}
53+
54+
func (m *manager) setNodeLabelValue(label, value string) error {
55+
node, err := m.clientset.CoreV1().Nodes().Get(context.TODO(), m.nodeName, metav1.GetOptions{})
56+
if err != nil {
57+
return fmt.Errorf("unable to get node object: %v", err)
58+
}
59+
60+
labels := node.GetLabels()
61+
labels[label] = value
62+
node.SetLabels(labels)
63+
_, err = m.clientset.CoreV1().Nodes().Update(context.TODO(), node, metav1.UpdateOptions{})
64+
if err != nil {
65+
return fmt.Errorf("unable to update node object: %v", err)
66+
}
67+
68+
return nil
69+
}

0 commit comments

Comments
 (0)