Skip to content

Commit 07ebd7d

Browse files
committed
Infer prometheus url
1 parent 8f68ef2 commit 07ebd7d

File tree

3 files changed

+324
-3
lines changed

3 files changed

+324
-3
lines changed

pkg/operator/operands/scheduler/resources_for_shard.go

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
kaiv1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1"
2424
kaiConfigUtils "github.com/NVIDIA/KAI-scheduler/pkg/operator/config"
2525
"github.com/NVIDIA/KAI-scheduler/pkg/operator/operands/common"
26+
usagedbapi "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/cache/usagedb/api"
2627
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/conf"
2728
)
2829

@@ -192,9 +193,11 @@ func (s *SchedulerForShard) configMapForShard(
192193
innerConfig.QueueDepthPerAction = shard.Spec.QueueDepthPerAction
193194
}
194195

195-
if shard.Spec.UsageDBConfig != nil {
196-
innerConfig.UsageDBConfig = shard.Spec.UsageDBConfig
196+
usageDBConfig, err := getUsageDBConfig(shard, kaiConfig)
197+
if err != nil {
198+
return nil, err
197199
}
200+
innerConfig.UsageDBConfig = usageDBConfig
198201

199202
data, marshalErr := yaml.Marshal(&innerConfig)
200203
if marshalErr != nil {
@@ -216,6 +219,41 @@ func validateJobDepthMap(shard *kaiv1.SchedulingShard, innerConfig conf.Schedule
216219
return nil
217220
}
218221

222+
func getUsageDBConfig(shard *kaiv1.SchedulingShard, kaiConfig *kaiv1.Config) (*usagedbapi.UsageDBConfig, error) {
223+
// Check for nil inputs
224+
if shard == nil {
225+
return nil, fmt.Errorf("shard cannot be nil")
226+
}
227+
if kaiConfig == nil {
228+
return nil, fmt.Errorf("kaiConfig cannot be nil")
229+
}
230+
231+
if shard.Spec.UsageDBConfig == nil {
232+
return nil, nil
233+
}
234+
235+
usageDBConfig := shard.Spec.UsageDBConfig.DeepCopy()
236+
237+
if usageDBConfig.ClientType != "prometheus" {
238+
return usageDBConfig, nil
239+
}
240+
241+
if usageDBConfig.ConnectionString == "" && usageDBConfig.ConnectionStringEnvVar == "" {
242+
// Use prometheus from config
243+
if kaiConfig.Spec.Prometheus != nil &&
244+
kaiConfig.Spec.Prometheus.Enabled != nil &&
245+
*kaiConfig.Spec.Prometheus.Enabled {
246+
usageDBConfig.ConnectionString = fmt.Sprintf("http://prometheus-operated.%s.svc.cluster.local:9090", kaiConfig.Spec.Namespace)
247+
} else if kaiConfig.Spec.Global != nil && kaiConfig.Spec.Global.ExternalTSDBConnection != nil && kaiConfig.Spec.Global.ExternalTSDBConnection.URL != nil {
248+
usageDBConfig.ConnectionString = *kaiConfig.Spec.Global.ExternalTSDBConnection.URL
249+
} else {
250+
return nil, fmt.Errorf("prometheus connection string not configured: either enable internal prometheus or configure external TSDB connection URL")
251+
}
252+
}
253+
254+
return usageDBConfig, nil
255+
}
256+
219257
func (s *SchedulerForShard) serviceForShard(
220258
ctx context.Context, readerClient client.Reader,
221259
kaiConfig *kaiv1.Config, shard *kaiv1.SchedulingShard,

pkg/operator/operands/scheduler/resources_test.go

Lines changed: 283 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414

1515
"github.com/NVIDIA/KAI-scheduler/cmd/scheduler/app/options"
1616
kaiv1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1"
17+
kaiprometheus "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1/prometheus"
1718
kaiv1qc "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1/queue_controller"
1819
kaiv1scheduler "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1/scheduler"
1920
usagedbapi "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/cache/usagedb/api"
@@ -719,3 +720,285 @@ usageDBConfig:
719720
err = yaml.Unmarshal([]byte(configString), config)
720721
assert.NoError(t, err)
721722
}
723+
724+
func TestGetUsageDBConfig(t *testing.T) {
725+
tests := []struct {
726+
name string
727+
shard *kaiv1.SchedulingShard
728+
kaiConfig *kaiv1.Config
729+
expectError bool
730+
errorMsg string
731+
validate func(t *testing.T, result *usagedbapi.UsageDBConfig)
732+
}{
733+
{
734+
name: "nil shard",
735+
shard: nil,
736+
kaiConfig: &kaiv1.Config{},
737+
expectError: true,
738+
errorMsg: "shard cannot be nil",
739+
},
740+
{
741+
name: "nil kaiConfig",
742+
shard: &kaiv1.SchedulingShard{},
743+
kaiConfig: nil,
744+
expectError: true,
745+
errorMsg: "kaiConfig cannot be nil",
746+
},
747+
{
748+
name: "nil UsageDBConfig",
749+
shard: &kaiv1.SchedulingShard{
750+
Spec: kaiv1.SchedulingShardSpec{
751+
UsageDBConfig: nil,
752+
},
753+
},
754+
kaiConfig: &kaiv1.Config{},
755+
expectError: false,
756+
validate: func(t *testing.T, result *usagedbapi.UsageDBConfig) {
757+
assert.Nil(t, result)
758+
},
759+
},
760+
{
761+
name: "non-prometheus client type",
762+
shard: &kaiv1.SchedulingShard{
763+
Spec: kaiv1.SchedulingShardSpec{
764+
UsageDBConfig: &usagedbapi.UsageDBConfig{
765+
ClientType: "custom",
766+
ConnectionString: "http://custom-db:9090",
767+
},
768+
},
769+
},
770+
kaiConfig: &kaiv1.Config{},
771+
expectError: false,
772+
validate: func(t *testing.T, result *usagedbapi.UsageDBConfig) {
773+
assert.NotNil(t, result)
774+
assert.Equal(t, "custom", result.ClientType)
775+
assert.Equal(t, "http://custom-db:9090", result.ConnectionString)
776+
},
777+
},
778+
{
779+
name: "prometheus with explicit connection string",
780+
shard: &kaiv1.SchedulingShard{
781+
Spec: kaiv1.SchedulingShardSpec{
782+
UsageDBConfig: &usagedbapi.UsageDBConfig{
783+
ClientType: "prometheus",
784+
ConnectionString: "http://external-prometheus:9090",
785+
},
786+
},
787+
},
788+
kaiConfig: &kaiv1.Config{},
789+
expectError: false,
790+
validate: func(t *testing.T, result *usagedbapi.UsageDBConfig) {
791+
assert.NotNil(t, result)
792+
assert.Equal(t, "prometheus", result.ClientType)
793+
assert.Equal(t, "http://external-prometheus:9090", result.ConnectionString)
794+
},
795+
},
796+
{
797+
name: "prometheus with internal prometheus enabled",
798+
shard: &kaiv1.SchedulingShard{
799+
Spec: kaiv1.SchedulingShardSpec{
800+
UsageDBConfig: &usagedbapi.UsageDBConfig{
801+
ClientType: "prometheus",
802+
},
803+
},
804+
},
805+
kaiConfig: &kaiv1.Config{
806+
Spec: kaiv1.ConfigSpec{
807+
Namespace: "kai-system",
808+
Prometheus: &kaiprometheus.Prometheus{
809+
Enabled: ptr.To(true),
810+
},
811+
},
812+
},
813+
expectError: false,
814+
validate: func(t *testing.T, result *usagedbapi.UsageDBConfig) {
815+
assert.NotNil(t, result)
816+
assert.Equal(t, "prometheus", result.ClientType)
817+
assert.Equal(t, "http://prometheus-operated.kai-system.svc.cluster.local:9090", result.ConnectionString)
818+
},
819+
},
820+
{
821+
name: "prometheus with external TSDB connection",
822+
shard: &kaiv1.SchedulingShard{
823+
Spec: kaiv1.SchedulingShardSpec{
824+
UsageDBConfig: &usagedbapi.UsageDBConfig{
825+
ClientType: "prometheus",
826+
},
827+
},
828+
},
829+
kaiConfig: &kaiv1.Config{
830+
Spec: kaiv1.ConfigSpec{
831+
Namespace: "kai-system",
832+
Global: &kaiv1.GlobalConfig{
833+
ExternalTSDBConnection: &kaiv1.Connection{
834+
URL: ptr.To("http://external-tsdb:9090"),
835+
},
836+
},
837+
},
838+
},
839+
expectError: false,
840+
validate: func(t *testing.T, result *usagedbapi.UsageDBConfig) {
841+
assert.NotNil(t, result)
842+
assert.Equal(t, "prometheus", result.ClientType)
843+
assert.Equal(t, "http://external-tsdb:9090", result.ConnectionString)
844+
},
845+
},
846+
{
847+
name: "prometheus with nil prometheus config",
848+
shard: &kaiv1.SchedulingShard{
849+
Spec: kaiv1.SchedulingShardSpec{
850+
UsageDBConfig: &usagedbapi.UsageDBConfig{
851+
ClientType: "prometheus",
852+
},
853+
},
854+
},
855+
kaiConfig: &kaiv1.Config{
856+
Spec: kaiv1.ConfigSpec{
857+
Namespace: "kai-system",
858+
Prometheus: nil,
859+
Global: &kaiv1.GlobalConfig{
860+
ExternalTSDBConnection: &kaiv1.Connection{
861+
URL: ptr.To("http://external-tsdb:9090"),
862+
},
863+
},
864+
},
865+
},
866+
expectError: false,
867+
validate: func(t *testing.T, result *usagedbapi.UsageDBConfig) {
868+
assert.NotNil(t, result)
869+
assert.Equal(t, "prometheus", result.ClientType)
870+
assert.Equal(t, "http://external-tsdb:9090", result.ConnectionString)
871+
},
872+
},
873+
{
874+
name: "prometheus with prometheus.enabled = false",
875+
shard: &kaiv1.SchedulingShard{
876+
Spec: kaiv1.SchedulingShardSpec{
877+
UsageDBConfig: &usagedbapi.UsageDBConfig{
878+
ClientType: "prometheus",
879+
},
880+
},
881+
},
882+
kaiConfig: &kaiv1.Config{
883+
Spec: kaiv1.ConfigSpec{
884+
Namespace: "kai-system",
885+
Prometheus: &kaiprometheus.Prometheus{
886+
Enabled: ptr.To(false),
887+
},
888+
Global: &kaiv1.GlobalConfig{
889+
ExternalTSDBConnection: &kaiv1.Connection{
890+
URL: ptr.To("http://external-tsdb:9090"),
891+
},
892+
},
893+
},
894+
},
895+
expectError: false,
896+
validate: func(t *testing.T, result *usagedbapi.UsageDBConfig) {
897+
assert.NotNil(t, result)
898+
assert.Equal(t, "prometheus", result.ClientType)
899+
assert.Equal(t, "http://external-tsdb:9090", result.ConnectionString)
900+
},
901+
},
902+
{
903+
name: "prometheus with nil external TSDB connection",
904+
shard: &kaiv1.SchedulingShard{
905+
Spec: kaiv1.SchedulingShardSpec{
906+
UsageDBConfig: &usagedbapi.UsageDBConfig{
907+
ClientType: "prometheus",
908+
},
909+
},
910+
},
911+
kaiConfig: &kaiv1.Config{
912+
Spec: kaiv1.ConfigSpec{
913+
Namespace: "kai-system",
914+
Global: &kaiv1.GlobalConfig{
915+
ExternalTSDBConnection: nil,
916+
},
917+
},
918+
},
919+
expectError: true,
920+
errorMsg: "prometheus connection string not configured",
921+
},
922+
{
923+
name: "prometheus with nil external TSDB URL",
924+
shard: &kaiv1.SchedulingShard{
925+
Spec: kaiv1.SchedulingShardSpec{
926+
UsageDBConfig: &usagedbapi.UsageDBConfig{
927+
ClientType: "prometheus",
928+
},
929+
},
930+
},
931+
kaiConfig: &kaiv1.Config{
932+
Spec: kaiv1.ConfigSpec{
933+
Namespace: "kai-system",
934+
Global: &kaiv1.GlobalConfig{
935+
ExternalTSDBConnection: &kaiv1.Connection{
936+
URL: nil,
937+
},
938+
},
939+
},
940+
},
941+
expectError: true,
942+
errorMsg: "prometheus connection string not configured",
943+
},
944+
{
945+
name: "prometheus with nil global config",
946+
shard: &kaiv1.SchedulingShard{
947+
Spec: kaiv1.SchedulingShardSpec{
948+
UsageDBConfig: &usagedbapi.UsageDBConfig{
949+
ClientType: "prometheus",
950+
},
951+
},
952+
},
953+
kaiConfig: &kaiv1.Config{
954+
Spec: kaiv1.ConfigSpec{
955+
Namespace: "kai-system",
956+
Global: nil,
957+
},
958+
},
959+
expectError: true,
960+
errorMsg: "prometheus connection string not configured",
961+
},
962+
{
963+
name: "deep copy preserves usage params",
964+
shard: &kaiv1.SchedulingShard{
965+
Spec: kaiv1.SchedulingShardSpec{
966+
UsageDBConfig: &usagedbapi.UsageDBConfig{
967+
ClientType: "prometheus",
968+
ConnectionString: "http://prometheus:9090",
969+
UsageParams: &usagedbapi.UsageParams{
970+
HalfLifePeriod: &metav1.Duration{Duration: 10 * time.Minute},
971+
WindowSize: &metav1.Duration{Duration: 20 * time.Minute},
972+
},
973+
},
974+
},
975+
},
976+
kaiConfig: &kaiv1.Config{},
977+
expectError: false,
978+
validate: func(t *testing.T, result *usagedbapi.UsageDBConfig) {
979+
assert.NotNil(t, result)
980+
assert.NotNil(t, result.UsageParams)
981+
assert.Equal(t, 10*time.Minute, result.UsageParams.HalfLifePeriod.Duration)
982+
assert.Equal(t, 20*time.Minute, result.UsageParams.WindowSize.Duration)
983+
},
984+
},
985+
}
986+
987+
for _, tt := range tests {
988+
t.Run(tt.name, func(t *testing.T) {
989+
result, err := getUsageDBConfig(tt.shard, tt.kaiConfig)
990+
991+
if tt.expectError {
992+
require.Error(t, err)
993+
if tt.errorMsg != "" {
994+
assert.Contains(t, err.Error(), tt.errorMsg)
995+
}
996+
} else {
997+
require.NoError(t, err)
998+
if tt.validate != nil {
999+
tt.validate(t, result)
1000+
}
1001+
}
1002+
})
1003+
}
1004+
}

pkg/scheduler/cache/usagedb/api/interface.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ type Interface interface {
1313
}
1414
// UsageDBConfig holds the usage DB client settings. It is serialized with the
// yaml/json keys given in the field tags.
type UsageDBConfig struct {
1515
// ClientType identifies the kind of usage DB client.
// NOTE(review): "prometheus" is the only value given special handling in this
// change; the full set of accepted values is not visible here.
ClientType string `yaml:"clientType" json:"clientType"`
16-
ConnectionString string `yaml:"connectionString" json:"connectionString"`
16+
// ConnectionString is omitted from the serialized form when empty (omitempty).
ConnectionString string `yaml:"connectionString,omitempty" json:"connectionString,omitempty"`
1717
// ConnectionStringEnvVar is omitted from the serialized form when empty.
// Presumably the name of an environment variable that holds the connection
// string; confirm against the client code.
ConnectionStringEnvVar string `yaml:"connectionStringEnvVar,omitempty" json:"connectionStringEnvVar,omitempty"`
1818
// UsageParams is optional and omitted from the serialized form when nil.
UsageParams *UsageParams `yaml:"usageParams,omitempty" json:"usageParams,omitempty"`
1919
}

0 commit comments

Comments
 (0)