Skip to content

Improve health check for wallet backend services #251

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Jul 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/e2e_integration_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ jobs:
shell: bash

- name: Run Docker Compose (Without RPC)
run: docker compose -f docker-compose.yaml up --build -V -d --scale stellar-rpc=0
run: docker compose -f docker-compose.yaml up --build -V -d api ingest --wait --scale stellar-rpc=0
shell: bash

- name: Run Integration Tests
Expand Down
9 changes: 1 addition & 8 deletions cmd/ingest.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ func (c *ingestCmd) Command() *cobra.Command {
utils.StartLedgerOption(&cfg.StartLedger),
utils.EndLedgerOption(&cfg.EndLedger),
utils.NetworkPassphraseOption(&cfg.NetworkPassphrase),
utils.IngestServerPortOption(&cfg.ServerPort),
{
Name: "ledger-cursor-name",
Usage: "Name of last synced ledger cursor, used to keep track of the last ledger ingested by the service. When starting up, ingestion will resume from the ledger number stored in this record. It should be an unique name per container as different containers would overwrite the cursor value of its peers when using the same cursor name.",
Expand All @@ -37,14 +38,6 @@ func (c *ingestCmd) Command() *cobra.Command {
FlagDefault: "live_ingest_cursor",
Required: true,
},
{
Name: "start",
Usage: "Ledger number from which ingestion should start. When not present, ingestion will resume from last synced ledger.",
OptType: types.Int,
ConfigKey: &cfg.StartLedger,
FlagDefault: 0,
Required: false,
},
}

cmd := &cobra.Command{
Expand Down
13 changes: 12 additions & 1 deletion cmd/utils/global_options.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,17 @@ import (
"github.com/stellar/wallet-backend/internal/signing"
)

// IngestServerPortOption builds the optional "ingest-server-port" config
// option. The parsed integer is stored through configKey; when the option is
// not supplied it defaults to 8002.
func IngestServerPortOption(configKey *int) *config.ConfigOption {
	opt := new(config.ConfigOption)
	opt.Name = "ingest-server-port"
	opt.Usage = "The port for the ingest server."
	opt.OptType = types.Int
	opt.ConfigKey = configKey
	opt.FlagDefault = 8002
	opt.Required = false
	return opt
}

func DatabaseURLOption(configKey *string) *config.ConfigOption {
return &config.ConfigOption{
Name: "database-url",
Expand Down Expand Up @@ -144,7 +155,7 @@ func DistributionAccountSignatureClientProviderOption(configKey *signing.Signatu
func StartLedgerOption(configKey *int) *config.ConfigOption {
return &config.ConfigOption{
Name: "start-ledger",
Usage: "ledger number to start getting transactions from",
Usage: "ledger number from which ingestion should start. When not present, ingestion will resume from last synced ledger.",
OptType: types.Int,
ConfigKey: configKey,
FlagDefault: 0,
Expand Down
26 changes: 21 additions & 5 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ services:
container_name: api
image: stellar/wallet-backend:development
healthcheck:
test: "curl --fail --silent --show-error --location 'http://localhost:8001/health' | grep -q '\"status\": \"pass\"'"
test: "curl --fail --silent --show-error --location 'http://localhost:8001/health' | grep -q '\"status\": \"ok\"'"
interval: 10s
timeout: 10s
retries: 3
Expand All @@ -50,14 +50,15 @@ services:
condition: service_healthy
stellar-rpc:
condition: service_started
ingest:
condition: service_healthy
ports:
- 8001:8001
entrypoint: ""
command:
- sh
- -c
- |
./wallet-backend migrate up
./wallet-backend channel-account ensure ${NUMBER_CHANNEL_ACCOUNTS:-2}
./wallet-backend serve
environment:
Expand Down Expand Up @@ -95,21 +96,36 @@ services:
ingest:
container_name: ingest
image: stellar/wallet-backend:development
healthcheck:
test: "curl --fail --silent --show-error --location 'http://localhost:8002/health' | grep -q '\"status\": \"ok\"'"
interval: 10s
timeout: 10s
retries: 3
build:
context: ./
dockerfile: Dockerfile
depends_on:
db:
condition: service_healthy
api:
condition: service_healthy
stellar-rpc:
condition: service_started
entrypoint: ""
command:
- sh
- -c
- ./wallet-backend ingest
- |
./wallet-backend migrate up
if [ "$STELLAR_ENVIRONMENT" = "GITHUB_WORKFLOW" ]; then
HEALTH_RESPONSE=$(curl -s -X POST -H "Content-Type: application/json" -d '{"jsonrpc":"2.0","id":1,"method":"getHealth"}' "${RPC_URL}")
LATEST_LEDGER=$(echo "$$HEALTH_RESPONSE" | grep -oE '"latestLedger":[0-9]+' | grep -oE '[0-9]+' || true)
if [ -z "$$LATEST_LEDGER" ] || [ "$$LATEST_LEDGER" = "" ]; then
./wallet-backend ingest
else
./wallet-backend ingest --start-ledger "$$LATEST_LEDGER"
fi
else
./wallet-backend ingest
fi
environment:
RPC_URL: ${RPC_URL:-http://stellar-rpc:8000}
DATABASE_URL: postgres://postgres@db:5432/wallet-backend?sslmode=disable
Expand Down
53 changes: 49 additions & 4 deletions internal/ingest/ingest.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ import (
"context"
"fmt"
"net/http"
"os"
"os/signal"
"syscall"
"time"

"github.com/prometheus/client_golang/prometheus/promhttp"
Expand All @@ -14,13 +17,19 @@ import (
"github.com/stellar/wallet-backend/internal/data"
"github.com/stellar/wallet-backend/internal/db"
"github.com/stellar/wallet-backend/internal/metrics"
httphandler "github.com/stellar/wallet-backend/internal/serve/httphandler"
"github.com/stellar/wallet-backend/internal/services"
"github.com/stellar/wallet-backend/internal/signing/store"
cache "github.com/stellar/wallet-backend/internal/store"
)

const (
// ServerShutdownTimeout is the deadline given to the ingest HTTP server's
// Shutdown call once a termination signal (SIGINT/SIGTERM) is received.
ServerShutdownTimeout = 10 * time.Second
)

type Configs struct {
DatabaseURL string
ServerPort int
LedgerCursorName string
StartLedger int
EndLedger int
Expand Down Expand Up @@ -76,13 +85,49 @@ func setupDeps(cfg Configs) (services.IngestService, error) {
return nil, fmt.Errorf("instantiating ingest service: %w", err)
}

http.Handle("/ingest-metrics", promhttp.HandlerFor(metricsService.GetRegistry(), promhttp.HandlerOpts{}))
// Start ingest server which serves metrics and health check endpoints.
server := startServers(cfg, models, rpcService, metricsService)

// Wait for termination signal to gracefully shut down the server.
quit := make(chan os.Signal, 1)
signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
go func() {
err := http.ListenAndServe(":8002", nil)
if err != nil {
log.Ctx(context.Background()).Fatalf("starting ingest metrics server: %v", err)
<-quit
log.Info("Shutting down server...")

ctx, cancel := context.WithTimeout(context.Background(), ServerShutdownTimeout)
defer cancel()

if err := server.Shutdown(ctx); err != nil {
log.Errorf("Server forced to shutdown: %v", err)
}
log.Info("Server gracefully stopped")
}()

return ingestService, nil
}

// startServers builds the ingest HTTP server and starts it in a background
// goroutine, returning immediately with the server so the caller can shut it
// down later.
//
// The server listens on cfg.ServerPort and exposes two routes on its own mux
// (not http.DefaultServeMux):
//   - /ingest-metrics: Prometheus metrics from metricsSvc's registry.
//   - /health: the shared health check handler backed by models and rpcService.
//
// If ListenAndServe returns anything other than http.ErrServerClosed (which is
// what a regular Shutdown produces), the error is logged at fatal level.
func startServers(cfg Configs, models *data.Models, rpcService services.RPCService, metricsSvc metrics.MetricsService) *http.Server {
	// Bound how long a client may take to send its request headers so that
	// stalled connections cannot be held open indefinitely. Only the header
	// read is limited; handler execution time is left unconstrained.
	const readHeaderTimeout = 10 * time.Second

	mux := http.NewServeMux()
	server := &http.Server{
		Addr:              fmt.Sprintf(":%d", cfg.ServerPort),
		Handler:           mux,
		ReadHeaderTimeout: readHeaderTimeout,
	}

	healthHandler := httphandler.HealthHandler{
		Models:     models,
		RPCService: rpcService,
		AppTracker: cfg.AppTracker,
	}
	mux.Handle("/ingest-metrics", promhttp.HandlerFor(metricsSvc.GetRegistry(), promhttp.HandlerOpts{}))
	mux.Handle("/health", http.HandlerFunc(healthHandler.GetHealth))

	go func() {
		// ListenAndServe always returns a non-nil error; ErrServerClosed
		// signals an orderly shutdown and is not a failure.
		if err := server.ListenAndServe(); err != http.ErrServerClosed {
			log.Ctx(context.Background()).Fatalf("starting server on %s: %v", server.Addr, err)
		}
	}()

	return server
}
18 changes: 18 additions & 0 deletions internal/serve/httperror/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,21 @@ func InternalServerError(ctx context.Context, message string, err error, extras
Extras: extras,
}
}

// ServiceUnavailable logs err, reports it to appTracker when one is provided
// (otherwise logs a warning that the tracker is nil), and returns an
// ErrorResponse with HTTP status 503. An empty message is replaced by a
// generic "service unavailable" text; extras is passed through unchanged.
func ServiceUnavailable(ctx context.Context, message string, err error, extras map[string]interface{}, appTracker apptracker.AppTracker) *ErrorResponse {
	log.Ctx(ctx).Error(err)

	if appTracker == nil {
		log.Warn("App Tracker is nil")
	} else {
		appTracker.CaptureException(err)
	}

	errMsg := message
	if errMsg == "" {
		errMsg = "The service is unavailable."
	}

	resp := ErrorResponse{
		Status: http.StatusServiceUnavailable,
		Error:  errMsg,
		Extras: extras,
	}
	return &resp
}
61 changes: 61 additions & 0 deletions internal/serve/httphandler/health.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
package httphandler

import (
"errors"
"fmt"
"net/http"

"github.com/stellar/go/support/render/httpjson"

"github.com/stellar/wallet-backend/internal/apptracker"
"github.com/stellar/wallet-backend/internal/data"
"github.com/stellar/wallet-backend/internal/serve/httperror"
"github.com/stellar/wallet-backend/internal/services"
)

// HealthHandler holds the dependencies used by the health check endpoint.
type HealthHandler struct {
// Models provides the ingest store from which the backend's latest synced
// ledger is read.
Models *data.Models
// RPCService is queried for the RPC's health status and latest ledger.
RPCService services.RPCService
// AppTracker is handed to the httperror helpers when an error response is
// built.
AppTracker apptracker.AppTracker
}

const (
// ledgerCursorName is the cursor name GetHealth passes to the ingest store
// when reading the backend's latest synced ledger.
// NOTE(review): the ingest command exposes a configurable
// "ledger-cursor-name" option (default "live_ingest_cursor"); this
// hard-coded value presumably will not follow a non-default cursor — confirm.
ledgerCursorName = "live_ingest_cursor"
// ledgerHealthThreshold is the ledger gap between the RPC and the backend
// above which GetHealth reports the backend as out of sync.
ledgerHealthThreshold = uint32(50)
)

// GetHealth is the HTTP handler for the health check endpoint.
//
// It responds with:
//   - 500 when the RPC health or the backend's latest synced ledger cannot be
//     fetched;
//   - 503 when the RPC does not report itself as "healthy", or when the
//     backend trails the RPC's latest ledger by more than
//     ledgerHealthThreshold ledgers (the response extras carry both ledger
//     numbers);
//   - a JSON body with "status": "ok" and the backend's latest ledger
//     otherwise.
func (h HealthHandler) GetHealth(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()

	rpcHealth, err := h.RPCService.GetHealth()
	if err != nil {
		err = fmt.Errorf("failed to get RPC health: %w", err)
		httperror.InternalServerError(ctx, err.Error(), err, nil, h.AppTracker).Render(w)
		return
	}
	if rpcHealth.Status != "healthy" {
		err = errors.New("rpc is not healthy")
		httperror.ServiceUnavailable(ctx, err.Error(), err, nil, h.AppTracker).Render(w)
		return
	}

	backendLatestLedger, err := h.Models.IngestStore.GetLatestLedgerSynced(ctx, ledgerCursorName)
	if err != nil {
		err = fmt.Errorf("failed to get backend latest ledger: %w", err)
		httperror.InternalServerError(ctx, err.Error(), err, nil, h.AppTracker).Render(w)
		return
	}

	// Ledger numbers are unsigned, so subtracting a larger backend ledger from
	// a smaller RPC ledger would wrap around to a huge value and falsely trip
	// the threshold. Only compute the lag when the RPC is actually ahead; a
	// backend that is level with or ahead of the reported RPC ledger is in sync.
	if rpcHealth.LatestLedger > backendLatestLedger &&
		rpcHealth.LatestLedger-backendLatestLedger > ledgerHealthThreshold {
		err = errors.New("wallet backend is not in sync with the RPC")
		httperror.ServiceUnavailable(ctx, err.Error(), err, map[string]interface{}{
			"rpc_latest_ledger":     rpcHealth.LatestLedger,
			"backend_latest_ledger": backendLatestLedger,
		}, h.AppTracker).Render(w)
		return
	}

	httpjson.Render(w, map[string]any{
		"status":                "ok",
		"backend_latest_ledger": backendLatestLedger,
	}, httpjson.JSON)
}
Loading