From 1042bce38159041df1ddc4d31faa929c6803e58b Mon Sep 17 00:00:00 2001
From: Sneha Aradhey
Date: Wed, 26 Mar 2025 19:47:52 +0000
Subject: [PATCH] use node UID for VG creation instead of name

---
 cmd/gce-pd-csi-driver/main.go           | 45 ++++++++++++-------
 .../base/controller/controller.yaml     |  3 +-
 pkg/gce-pd-csi-driver/cache.go          | 26 +++++------
 3 files changed, 43 insertions(+), 31 deletions(-)

diff --git a/cmd/gce-pd-csi-driver/main.go b/cmd/gce-pd-csi-driver/main.go
index 0cb6baf3e..5c79bfdbf 100644
--- a/cmd/gce-pd-csi-driver/main.go
+++ b/cmd/gce-pd-csi-driver/main.go
@@ -27,6 +27,7 @@ import (
     "strings"
     "time"

+    v1 "k8s.io/api/core/v1"
     "k8s.io/klog/v2"
     "k8s.io/utils/strings/slices"
     "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common"
@@ -245,15 +246,26 @@ func handle() {
         if err != nil {
             klog.Fatalf("Failed to set up metadata service: %v", err.Error())
         }
-        isDataCacheEnabledNodePool, err := isDataCacheEnabledNodePool(ctx, *nodeName)
-        if err != nil {
-            klog.Fatalf("Failed to get node info from API server: %v", err.Error())
+        var node *v1.Node
+        var isDataCacheEnabledNodePoolCheck bool
+        if *nodeName == common.TestNode {
+            isDataCacheEnabledNodePoolCheck = true
+        } else if len(*nodeName) > 0 && *nodeName != common.TestNode {
+            node, err = driver.FetchNodeWithRetry(ctx, *nodeName)
+            if err != nil {
+                klog.Fatalf("Failed to get node info from API server: %v", err.Error())
+            }
+            isDataCacheEnabledNodePoolCheck, err = isDataCacheEnabledNodePool(ctx, node)
+            if err != nil {
+                klog.Fatalf("Unable to fetch node labels: %v", err.Error())
+            }
         }
+        // isDataCacheEnabledNodePool := true
         nsArgs := driver.NodeServerArgs{
             EnableDeviceInUseCheck: *enableDeviceInUseCheck,
             DeviceInUseTimeout: *deviceInUseTimeout,
             EnableDataCache: *enableDataCacheFlag,
-            DataCacheEnabledNodePool: isDataCacheEnabledNodePool,
+            DataCacheEnabledNodePool: isDataCacheEnabledNodePoolCheck,
         }
         nodeServer = driver.NewNodeServer(gceDriver, mounter, deviceUtils, meta, statter, nsArgs)
         if *maxConcurrentFormatAndMount > 0 {
@@ -264,7 +276,7 @@ func handle() {
             klog.Errorf("Data Cache enabled, but --node-name not passed")
         }
         if nsArgs.DataCacheEnabledNodePool {
-            if err := setupDataCache(ctx, *nodeName, nodeServer.MetadataService.GetName()); err != nil {
+            if err := setupDataCache(ctx, node, *nodeName); err != nil {
                klog.Errorf("Data Cache setup failed: %v", err)
             }
             go driver.StartWatcher(*nodeName)
@@ -351,15 +363,16 @@ func urlFlag(target **url.URL, name string, usage string) {
     })
 }

-func isDataCacheEnabledNodePool(ctx context.Context, nodeName string) (bool, error) {
+func isDataCacheEnabledNodePool(ctx context.Context, node *v1.Node) (bool, error) {
     if !*enableDataCacheFlag {
         return false, nil
     }
-    if len(nodeName) > 0 && nodeName != common.TestNode { // disregard logic below when E2E testing.
-        dataCacheLSSDCount, err := driver.GetDataCacheCountFromNodeLabel(ctx, nodeName)
-        return dataCacheLSSDCount != 0, err
-    }
-    return true, nil
+    // nodeName := node.Name
+    // if len(nodeName) > 0 && nodeName != common.TestNode { // disregard logic below when E2E testing.
+    dataCacheLSSDCount, err := driver.GetDataCacheCountFromNodeLabel(ctx, node)
+    return dataCacheLSSDCount != 0, err
+    // }
+    // return true, nil
 }

 func fetchLssdsForRaiding(lssdCount int) ([]string, error) {
@@ -394,7 +407,7 @@ func fetchLssdsForRaiding(lssdCount int) ([]string, error) {
     return availableLssds[:lssdCount], nil
 }

-func setupDataCache(ctx context.Context, nodeName string, nodeId string) error {
+func setupDataCache(ctx context.Context, node *v1.Node, nodeName string) error {
     isAlreadyRaided, err := driver.IsRaided()
     if err != nil {
         klog.V(4).Infof("Errored while scanning for available LocalSSDs err:%v; continuing Raiding", err)
@@ -404,9 +417,11 @@ func setupDataCache(ctx context.Context, nodeName string, nodeId string) error {
     }

     lssdCount := common.LocalSSDCountForDataCache
+    nodeUid := nodeName
     if nodeName != common.TestNode {
-        var err error
-        lssdCount, err = driver.GetDataCacheCountFromNodeLabel(ctx, nodeName)
+        nodeUid = string(node.ObjectMeta.UID)
+        // lssdCount := 4
+        lssdCount, err = driver.GetDataCacheCountFromNodeLabel(ctx, node)
         if err != nil {
             return err
         }
@@ -425,7 +440,7 @@ func setupDataCache(ctx context.Context, nodeName string, nodeId string) error {
     }

     // Initializing data cache node (VG checks w/ raided lssd)
-    if err := driver.InitializeDataCacheNode(nodeId); err != nil {
+    if err := driver.InitializeDataCacheNode(nodeUid); err != nil {
         return err
     }

diff --git a/deploy/kubernetes/base/controller/controller.yaml b/deploy/kubernetes/base/controller/controller.yaml
index 2a37539d4..2a5fa4043 100644
--- a/deploy/kubernetes/base/controller/controller.yaml
+++ b/deploy/kubernetes/base/controller/controller.yaml
@@ -143,7 +143,8 @@ spec:
         - "--endpoint=unix:/csi/csi.sock"
         - "--supports-dynamic-iops-provisioning=hyperdisk-balanced,hyperdisk-extreme"
         - "--supports-dynamic-throughput-provisioning=hyperdisk-balanced,hyperdisk-throughput,hyperdisk-ml"
-        - --enable-data-cache
+        - "--run-node-service=false"
+        - --enable-data-cache=true
         command:
         - /gce-pd-csi-driver
         env:
diff --git a/pkg/gce-pd-csi-driver/cache.go b/pkg/gce-pd-csi-driver/cache.go
index 32ef0faa0..e6aaf91bb 100644
--- a/pkg/gce-pd-csi-driver/cache.go
+++ b/pkg/gce-pd-csi-driver/cache.go
@@ -248,19 +248,7 @@ func ValidateDataCacheConfig(dataCacheMode string, dataCacheSize string, ctx con
     return fmt.Errorf("Data Cache is not enabled for PVC (data-cache-size: %v, data-cache-mode: %v). Please set both parameters in StorageClass to enable caching", dataCacheSize, dataCacheMode)
 }

-func GetDataCacheCountFromNodeLabel(ctx context.Context, nodeName string) (int, error) {
-    cfg, err := rest.InClusterConfig()
-    if err != nil {
-        return 0, err
-    }
-    kubeClient, err := kubernetes.NewForConfig(cfg)
-    if err != nil {
-        return 0, err
-    }
-    node, err := getNodeWithRetry(ctx, kubeClient, nodeName)
-    if err != nil {
-        return 0, err
-    }
+func GetDataCacheCountFromNodeLabel(ctx context.Context, node *v1.Node) (int, error) {
     if val, found := node.GetLabels()[fmt.Sprintf(common.NodeLabelPrefix, common.DataCacheLssdCountLabel)]; found {
         dataCacheCount, err := strconv.Atoi(val)
         if err != nil {
@@ -272,14 +260,22 @@ func GetDataCacheCountFromNodeLabel(ctx context.Context, nodeName string) (int,
     return 0, nil
 }

-func getNodeWithRetry(ctx context.Context, kubeClient *kubernetes.Clientset, nodeName string) (*v1.Node, error) {
+func FetchNodeWithRetry(ctx context.Context, nodeName string) (*v1.Node, error) {
     var nodeObj *v1.Node
+    cfg, err := rest.InClusterConfig()
+    if err != nil {
+        return nil, err
+    }
+    kubeClient, err := kubernetes.NewForConfig(cfg)
+    if err != nil {
+        return nil, err
+    }
     backoff := wait.Backoff{
         Duration: 1 * time.Second,
         Factor: 2.0,
         Steps: 5,
     }
-    err := wait.ExponentialBackoffWithContext(ctx, backoff, func(_ context.Context) (bool, error) {
+    err = wait.ExponentialBackoffWithContext(ctx, backoff, func(_ context.Context) (bool, error) {
         node, err := kubeClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
         if err != nil {
             klog.Warningf("Error getting node %s: %v, retrying...\n", nodeName, err)
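
Not part of the patch itself: the net effect of the main.go changes is that the identity handed to driver.InitializeDataCacheNode, and therefore the VG name, is the Node object's UID rather than the node name, while the E2E test node (common.TestNode) keeps using the name. Below is a minimal Go sketch of that decision flow under the patch's new helpers; the driver import path and the "example-node" name are assumptions for illustration, and error handling is trimmed.

package main

import (
    "context"
    "fmt"

    "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common"
    driver "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/gce-pd-csi-driver"
)

// dataCacheVGIdentity mirrors setupDataCache's choice of identity: the E2E
// test node keeps its name, real nodes resolve the Node object and use its UID.
func dataCacheVGIdentity(ctx context.Context, nodeName string) (string, error) {
    if nodeName == common.TestNode {
        return nodeName, nil
    }
    node, err := driver.FetchNodeWithRetry(ctx, nodeName)
    if err != nil {
        return "", err
    }
    // The UID is stable and unique for the lifetime of the Node object,
    // unlike the name, which can be reused across node recreations.
    return string(node.ObjectMeta.UID), nil
}

func main() {
    // "example-node" is a placeholder; in the driver the value comes from --node-name.
    id, err := dataCacheVGIdentity(context.Background(), "example-node")
    if err != nil {
        fmt.Printf("could not resolve node: %v\n", err)
        return
    }
    fmt.Printf("volume group identity: %s\n", id)
}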