Skip to content

Commit 002473a

Browse files
authored
feat: add externalURL config and support (#563)
* feat: add external prometheus support
* feat: implement monitoring mechanism
* refactor: move monitoring code to operator, expand operator interface
1 parent a765b25 commit 002473a

File tree

18 files changed

+608
-5
lines changed

18 files changed

+608
-5
lines changed

deployments/kai-scheduler/crds/kai.scheduler_configs.yaml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2012,6 +2012,29 @@ spec:
20122012
description: Enabled defines whether a Prometheus instance should
20132013
be deployed
20142014
type: boolean
2015+
externalPrometheusHealthProbe:
2016+
description: ExternalPrometheusHealthProbe defines the configuration
2017+
for external Prometheus connectivity validation, with defaults.
2018+
properties:
2019+
interval:
2020+
description: Interval defines the interval for external Prometheus
2021+
connectivity validation (in seconds)
2022+
type: integer
2023+
maxRetries:
2024+
description: MaxRetries defines the maximum number of
2025+
retries for external Prometheus connectivity validation
2026+
type: integer
2027+
timeout:
2028+
description: Timeout defines the timeout for external
2029+
Prometheus connectivity validation (in seconds)
2030+
type: integer
2031+
type: object
2032+
externalPrometheusUrl:
2033+
description: |-
2034+
ExternalPrometheusUrl defines the URL of an external Prometheus instance to use
2035+
When set, KAI will not deploy its own Prometheus but will configure ServiceMonitors
2036+
for the external instance and validate connectivity
2037+
type: string
20152038
retentionPeriod:
20162039
description: RetentionPeriod defines how long to retain data (e.g.,
20172040
"2w", "1d", "30d")

pkg/apis/kai/v1/config_types.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,12 @@ const (
3333
type ConditionReason string
3434

3535
const (
36-
Deployed ConditionReason = "deployed"
37-
Available ConditionReason = "available"
38-
Reconciled ConditionReason = "reconciled"
39-
DependenciesFulfilled ConditionReason = "dependencies_fulfilled"
36+
Deployed ConditionReason = "deployed"
37+
Available ConditionReason = "available"
38+
Reconciled ConditionReason = "reconciled"
39+
DependenciesFulfilled ConditionReason = "dependencies_fulfilled"
40+
PrometheusConnected ConditionReason = "prometheus_connected"
41+
PrometheusConnectionFailed ConditionReason = "prometheus_connection_failed"
4042
)
4143

4244
// ConfigSpec defines the desired state of Config

pkg/apis/kai/v1/prometheus/prometheus.go

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,30 @@ type Prometheus struct {
4343
// ServiceMonitor defines ServiceMonitor configuration for KAI services
4444
// +kubebuilder:validation:Optional
4545
ServiceMonitor *ServiceMonitor `json:"serviceMonitor,omitempty"`
46+
47+
// ExternalPrometheusUrl defines the URL of an external Prometheus instance to use
48+
// When set, KAI will not deploy its own Prometheus but will configure ServiceMonitors
49+
// for the external instance and validate connectivity
50+
// +kubebuilder:validation:Optional
51+
ExternalPrometheusUrl *string `json:"externalPrometheusUrl,omitempty"`
52+
53+
// ExternalPrometheusHealthProbe defines the configuration for external Prometheus connectivity validation, with defaults.
54+
// +kubebuilder:validation:Optional
55+
ExternalPrometheusHealthProbe *ExternalPrometheusHealthProbe `json:"externalPrometheusHealthProbe,omitempty"`
56+
}
57+
58+
type ExternalPrometheusHealthProbe struct {
59+
// Interval defines the interval for external Prometheus connectivity validation (in seconds)
60+
// +kubebuilder:validation:Optional
61+
Interval *int `json:"interval,omitempty"`
62+
63+
// Timeout defines the timeout for external Prometheus connectivity validation (in seconds)
64+
// +kubebuilder:validation:Optional
65+
Timeout *int `json:"timeout,omitempty"`
66+
67+
// MaxRetries defines the maximum number of retries for external Prometheus connectivity validation
68+
// +kubebuilder:validation:Optional
69+
MaxRetries *int `json:"maxRetries,omitempty"`
4670
}
4771

4872
func (p *Prometheus) SetDefaultsWhereNeeded() {
@@ -53,7 +77,9 @@ func (p *Prometheus) SetDefaultsWhereNeeded() {
5377
p.RetentionPeriod = common.SetDefault(p.RetentionPeriod, ptr.To("2w"))
5478
p.SampleInterval = common.SetDefault(p.SampleInterval, ptr.To("1m"))
5579
p.StorageClassName = common.SetDefault(p.StorageClassName, ptr.To("standard"))
56-
80+
p.ExternalPrometheusUrl = common.SetDefault(p.ExternalPrometheusUrl, nil)
81+
p.ExternalPrometheusHealthProbe = common.SetDefault(p.ExternalPrometheusHealthProbe, &ExternalPrometheusHealthProbe{})
82+
p.ExternalPrometheusHealthProbe.SetDefaultsWhereNeeded()
5783
p.ServiceMonitor = common.SetDefault(p.ServiceMonitor, &ServiceMonitor{})
5884
p.ServiceMonitor.SetDefaultsWhereNeeded()
5985
}
@@ -253,3 +279,12 @@ func (s *ServiceMonitor) SetDefaultsWhereNeeded() {
253279
s.ScrapeTimeout = common.SetDefault(s.ScrapeTimeout, ptr.To("10s"))
254280
s.BearerTokenFile = common.SetDefault(s.BearerTokenFile, ptr.To("/var/run/secrets/kubernetes.io/serviceaccount/token"))
255281
}
282+
283+
func (p *ExternalPrometheusHealthProbe) SetDefaultsWhereNeeded() {
284+
if p == nil {
285+
return
286+
}
287+
p.Interval = common.SetDefault(p.Interval, ptr.To(30))
288+
p.Timeout = common.SetDefault(p.Timeout, ptr.To(10))
289+
p.MaxRetries = common.SetDefault(p.MaxRetries, ptr.To(5))
290+
}

pkg/apis/kai/v1/prometheus/zz_generated.deepcopy.go

Lines changed: 40 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/operator/controller/config_controller.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,12 @@ func (r *ConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res
123123
if err = r.deployable.Deploy(ctx, r.Client, kaiConfig, kaiConfig); err != nil {
124124
return ctrl.Result{}, err
125125
}
126+
127+
// Monitor all operands
128+
if err = r.deployable.Monitor(ctx, r.Client, kaiConfig); err != nil {
129+
return ctrl.Result{}, err
130+
}
131+
126132
return ctrl.Result{}, nil
127133
}
128134

pkg/operator/operands/admission/admission.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,3 +73,7 @@ func (b *Admission) IsAvailable(ctx context.Context, readerClient client.Reader)
7373
func (b *Admission) Name() string {
7474
return "KAIAdmission"
7575
}
76+
77+
func (b *Admission) Monitor(ctx context.Context, runtimeReader client.Reader, kaiConfig *kaiv1.Config) error {
78+
return nil
79+
}

pkg/operator/operands/binder/binder.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,3 +58,7 @@ func (b *Binder) IsAvailable(ctx context.Context, readerClient client.Reader) (b
5858
func (b *Binder) Name() string {
5959
return "Binder"
6060
}
61+
62+
func (b *Binder) Monitor(ctx context.Context, runtimeReader client.Reader, kaiConfig *kaiv1.Config) error {
63+
return nil
64+
}

pkg/operator/operands/deployable/deployable.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -292,3 +292,12 @@ func createObjectForKAIConfig(
292292

293293
return nil
294294
}
295+
296+
func (d *DeployableOperands) Monitor(ctx context.Context, runtimeReader client.Reader, kaiConfig *kaiv1.Config) error {
297+
for _, operand := range d.operands {
298+
if err := operand.Monitor(ctx, runtimeReader, kaiConfig); err != nil {
299+
return fmt.Errorf("failed monitoring %s: %v", operand.Name(), err)
300+
}
301+
}
302+
return nil
303+
}

pkg/operator/operands/deployable/deployable_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -351,6 +351,10 @@ func (f *fakeOperand) IsAvailable(_ context.Context, _ client.Reader) (bool, err
351351
return f.isAvailable, nil
352352
}
353353

354+
func (f *fakeOperand) Monitor(_ context.Context, _ client.Reader, _ *kaiv1.Config) error {
355+
return nil
356+
}
357+
354358
func (f *fakeOperand) Name() string {
355359
if f.name == "" {
356360
return "fakeOperand"

pkg/operator/operands/interface.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,6 @@ type Operand interface {
1414
DesiredState(context.Context, client.Reader, *enginev1alpha1.Config) ([]client.Object, error)
1515
IsDeployed(context.Context, client.Reader) (bool, error)
1616
IsAvailable(context.Context, client.Reader) (bool, error)
17+
Monitor(context.Context, client.Reader, *enginev1alpha1.Config) error
1718
Name() string
1819
}

0 commit comments

Comments
 (0)