Skip to content

Commit 0aa520e

Browse files
authored
chore: refactor janitor to commons (#258)
1 parent 85c726b commit 0aa520e

File tree

3 files changed

+56
-20
lines changed

3 files changed

+56
-20
lines changed

health-monitors/gpu-health-monitor/Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ RUN echo "Acquire::https::Verify-Peer \"false\";" > /etc/apt/apt.conf.d/99disabl
3737

3838
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
3939
--mount=type=cache,target=/var/lib/apt,sharing=locked \
40+
echo 'Acquire::Retries "3";' > /etc/apt/apt.conf.d/80-retries && \
4041
apt-get update && \
4142
apt-get install -y --no-install-recommends \
4243
apt-transport-https \

janitor/go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ require (
1919
github.com/prometheus/client_golang v1.23.2
2020
github.com/spf13/viper v1.21.0
2121
github.com/stretchr/testify v1.11.1
22+
golang.org/x/sync v0.17.0
2223
k8s.io/api v0.34.1
2324
k8s.io/apimachinery v0.34.1
2425
k8s.io/client-go v0.34.1
@@ -130,7 +131,6 @@ require (
130131
golang.org/x/mod v0.29.0 // indirect
131132
golang.org/x/net v0.46.0 // indirect
132133
golang.org/x/oauth2 v0.32.0 // indirect
133-
golang.org/x/sync v0.17.0 // indirect
134134
golang.org/x/sys v0.37.0 // indirect
135135
golang.org/x/term v0.36.0 // indirect
136136
golang.org/x/text v0.30.0 // indirect

janitor/main.go

Lines changed: 54 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,17 @@ import (
1818
"crypto/tls"
1919
"encoding/json"
2020
"flag"
21+
"fmt"
2122
"log/slog"
23+
"net"
2224
"net/http"
2325
"os"
2426
"path/filepath"
27+
"strconv"
2528
"time"
2629

2730
"github.com/go-logr/logr"
31+
"golang.org/x/sync/errgroup"
2832
"k8s.io/apimachinery/pkg/runtime"
2933
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
3034
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
@@ -37,6 +41,7 @@ import (
3741
"sigs.k8s.io/controller-runtime/pkg/webhook"
3842

3943
"github.com/nvidia/nvsentinel/commons/pkg/logger"
44+
"github.com/nvidia/nvsentinel/commons/pkg/server"
4045
janitordgxcnvidiacomv1alpha1 "github.com/nvidia/nvsentinel/janitor/api/v1alpha1"
4146
"github.com/nvidia/nvsentinel/janitor/pkg/config"
4247
"github.com/nvidia/nvsentinel/janitor/pkg/controller"
@@ -145,27 +150,41 @@ func run() error {
145150
"terminateNode.timeout", cfg.TerminateNode.Timeout,
146151
"global.manualMode", cfg.Global.ManualMode)
147152

148-
// Start a simple HTTP server for the /config endpoint
149-
go func() {
150-
mux := http.NewServeMux()
151-
mux.HandleFunc("/config", func(w http.ResponseWriter, r *http.Request) {
152-
w.Header().Set("Content-Type", "application/json")
153+
// Parse config port from address
154+
// Handles formats like ":8082", "localhost:8082", "0.0.0.0:8082"; only the port is kept, any host part is discarded
155+
_, portStr, err := net.SplitHostPort(configAddr)
156+
if err != nil {
157+
// SplitHostPort fails on a bare port such as "8082" (no colon); in that case treat the whole value as the port number
158+
portStr = configAddr
159+
if portStr != "" && portStr[0] == ':' {
160+
portStr = portStr[1:]
161+
}
162+
}
153163

154-
if err := json.NewEncoder(w).Encode(cfg); err != nil {
155-
slog.Error("Failed to encode configuration as JSON", "error", err)
156-
http.Error(w, "Internal Server Error", http.StatusInternalServerError)
164+
configPort, err := strconv.Atoi(portStr)
165+
if err != nil {
166+
slog.Error("Invalid config-bind-address port", "error", err, "address", configAddr)
167+
return fmt.Errorf("invalid config-bind-address port %q: %w", configAddr, err)
168+
}
157169

158-
return
159-
}
160-
})
170+
// Create config handler
171+
configHandler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
172+
w.Header().Set("Content-Type", "application/json")
161173

162-
slog.Info("Starting config HTTP server", "address", configAddr)
174+
if err := json.NewEncoder(w).Encode(cfg); err != nil {
175+
slog.Error("Failed to encode configuration as JSON", "error", err)
176+
http.Error(w, "Internal Server Error", http.StatusInternalServerError)
163177

164-
// nolint:gosec // G114: Config endpoint is for internal debugging/monitoring
165-
if err := http.ListenAndServe(configAddr, mux); err != nil {
166-
slog.Error("Config HTTP server failed", "error", err)
178+
return
167179
}
168-
}()
180+
})
181+
182+
// Create config server using common server implementation
183+
// Note: Health checks are handled by controller-runtime manager on probeAddr
184+
configServer := server.NewServer(
185+
server.WithPort(configPort),
186+
server.WithHandler("/config", configHandler),
187+
)
169188

170189
// Setup TLS options
171190
var tlsOpts []func(*tls.Config)
@@ -324,10 +343,26 @@ func run() error {
324343
return err
325344
}
326345

327-
slog.Info("Starting manager")
346+
// Setup signal handler for graceful shutdown
347+
ctx := ctrl.SetupSignalHandler()
348+
349+
// Use errgroup to manage both the config server and controller manager
350+
g, gCtx := errgroup.WithContext(ctx)
351+
352+
// Start config server
353+
g.Go(func() error {
354+
slog.Info("Starting config server", "port", configPort)
355+
return configServer.Serve(gCtx)
356+
})
357+
358+
// Start controller manager
359+
g.Go(func() error {
360+
slog.Info("Starting manager")
361+
return mgr.Start(gCtx)
362+
})
328363

329-
if err = mgr.Start(ctrl.SetupSignalHandler()); err != nil {
330-
slog.Error("Problem running manager", "error", err)
364+
// Wait for both to complete; the first non-nil error cancels gCtx (signalling the other to stop) and is the error returned
365+
if err := g.Wait(); err != nil {
331366
return err
332367
}
333368

0 commit comments

Comments
 (0)