diff --git a/CHANGELOG.md b/CHANGELOG.md index d762d7f6c..9fc6e7d1c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - DispatchQueryPlan previously did not try to use the singleflight middleware for check calls. (https://github.com/authzed/spicedb/pull/3119) - Fixed regression introduced in 1.53.0. Postgres `HeadRevision` no longer allocates a new transaction ID on every call (https://github.com/authzed/spicedb/pull/3127) - Fixed regression introduced in 1.53.0 for MySQL migration scripts (https://github.com/authzed/spicedb/pull/3129) +- Tracing: When server is shutting down, flush traces. Also, elide the need for setting `OTEL_EXPORTER_OTLP_ENDPOINT`. (https://github.com/authzed/spicedb/pull/3108) ## [1.53.0] - 2026-05-13 ### Added diff --git a/docker-compose.memdb.yaml b/docker-compose.memdb.yaml index f3739bce6..6fb60712d 100644 --- a/docker-compose.memdb.yaml +++ b/docker-compose.memdb.yaml @@ -23,7 +23,8 @@ services: - "SPICEDB_GRPC_LOG_REQUESTS_ENABLED=false" - "SPICEDB_GRPC_LOG_RESPONSES_ENABLED=false" - "SPICEDB_STREAMING_API_RESPONSE_DELAY_TIMEOUT=0s" - - "OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317" + - "SPICEDB_OTEL_ENDPOINT=otel-collector:4317" + - "SPICEDB_OTEL_INSECURE=true" - "SPICEDB_ENABLE_PERFORMANCE_INSIGHT_METRICS=true" depends_on: otel-collector: diff --git a/docker-compose.mysql.yaml b/docker-compose.mysql.yaml index 82613e3e1..188dc04af 100644 --- a/docker-compose.mysql.yaml +++ b/docker-compose.mysql.yaml @@ -79,7 +79,8 @@ services: - "SPICEDB_GRPC_LOG_REQUESTS_ENABLED=false" - "SPICEDB_GRPC_LOG_RESPONSES_ENABLED=false" - "SPICEDB_STREAMING_API_RESPONSE_DELAY_TIMEOUT=0s" - - "OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317" + - "SPICEDB_OTEL_ENDPOINT=otel-collector:4317" + - "SPICEDB_OTEL_INSECURE=true" - "SPICEDB_ENABLE_PERFORMANCE_INSIGHT_METRICS=true" depends_on: migrate: @@ -116,7 +117,8 @@ services: - 
"SPICEDB_GRPC_LOG_REQUESTS_ENABLED=true" - "SPICEDB_GRPC_LOG_RESPONSES_ENABLED=false" - "SPICEDB_STREAMING_API_RESPONSE_DELAY_TIMEOUT=0s" - - "OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317" + - "SPICEDB_OTEL_ENDPOINT=otel-collector:4317" + - "SPICEDB_OTEL_INSECURE=true" - "SPICEDB_ENABLE_PERFORMANCE_INSIGHT_METRICS=true" depends_on: migrate: diff --git a/docker-compose.pgbouncer.yaml b/docker-compose.pgbouncer.yaml index 25fc5ae8d..1f947a21e 100644 --- a/docker-compose.pgbouncer.yaml +++ b/docker-compose.pgbouncer.yaml @@ -161,7 +161,8 @@ services: - "SPICEDB_GRPC_LOG_REQUESTS_ENABLED=false" - "SPICEDB_GRPC_LOG_RESPONSES_ENABLED=false" - "SPICEDB_STREAMING_API_RESPONSE_DELAY_TIMEOUT=0s" - - "OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317" + - "SPICEDB_OTEL_ENDPOINT=otel-collector:4317" + - "SPICEDB_OTEL_INSECURE=true" - "SPICEDB_DATASTORE_FOLLOWER_READ_DELAY_DURATION=2000ms" - "SPICEDB_ENABLE_PERFORMANCE_INSIGHT_METRICS=true" - "SPICEDB_DATASTORE_INCLUDE_QUERY_PARAMETERS_IN_TRACES=true" @@ -204,7 +205,8 @@ services: - "SPICEDB_GRPC_LOG_REQUESTS_ENABLED=true" - "SPICEDB_GRPC_LOG_RESPONSES_ENABLED=false" - "SPICEDB_STREAMING_API_RESPONSE_DELAY_TIMEOUT=0s" - - "OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317" + - "SPICEDB_OTEL_ENDPOINT=otel-collector:4317" + - "SPICEDB_OTEL_INSECURE=true" - "SPICEDB_DATASTORE_FOLLOWER_READ_DELAY_DURATION=2000ms" - "SPICEDB_ENABLE_PERFORMANCE_INSIGHT_METRICS=true" - "SPICEDB_DATASTORE_INCLUDE_QUERY_PARAMETERS_IN_TRACES=true" diff --git a/docker-compose.postgres.yaml b/docker-compose.postgres.yaml index 396a957e4..87c87cd0b 100644 --- a/docker-compose.postgres.yaml +++ b/docker-compose.postgres.yaml @@ -108,7 +108,8 @@ services: - "SPICEDB_GRPC_LOG_REQUESTS_ENABLED=false" - "SPICEDB_GRPC_LOG_RESPONSES_ENABLED=false" - "SPICEDB_STREAMING_API_RESPONSE_DELAY_TIMEOUT=0s" - - "OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317" + - "SPICEDB_OTEL_ENDPOINT=otel-collector:4317" + - "SPICEDB_OTEL_INSECURE=true" - 
"SPICEDB_DATASTORE_FOLLOWER_READ_DELAY_DURATION=2000ms" - "SPICEDB_ENABLE_PERFORMANCE_INSIGHT_METRICS=true" depends_on: @@ -150,7 +151,8 @@ services: - "SPICEDB_GRPC_LOG_REQUESTS_ENABLED=true" - "SPICEDB_GRPC_LOG_RESPONSES_ENABLED=false" - "SPICEDB_STREAMING_API_RESPONSE_DELAY_TIMEOUT=0s" - - "OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317" + - "SPICEDB_OTEL_ENDPOINT=otel-collector:4317" + - "SPICEDB_OTEL_INSECURE=true" - "SPICEDB_DATASTORE_FOLLOWER_READ_DELAY_DURATION=2000ms" - "SPICEDB_ENABLE_PERFORMANCE_INSIGHT_METRICS=true" depends_on: diff --git a/docker-compose.spanner.yaml b/docker-compose.spanner.yaml index f0e2f9989..060e8f049 100644 --- a/docker-compose.spanner.yaml +++ b/docker-compose.spanner.yaml @@ -79,7 +79,8 @@ services: - "SPICEDB_GRPC_LOG_REQUESTS_ENABLED=false" - "SPICEDB_GRPC_LOG_RESPONSES_ENABLED=false" - "SPICEDB_STREAMING_API_RESPONSE_DELAY_TIMEOUT=0s" - - "OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317" + - "SPICEDB_OTEL_ENDPOINT=otel-collector:4317" + - "SPICEDB_OTEL_INSECURE=true" - "SPICEDB_ENABLE_PERFORMANCE_INSIGHT_METRICS=true" depends_on: migrate: @@ -117,7 +118,8 @@ services: - "SPICEDB_GRPC_LOG_REQUESTS_ENABLED=true" - "SPICEDB_GRPC_LOG_RESPONSES_ENABLED=false" - "SPICEDB_STREAMING_API_RESPONSE_DELAY_TIMEOUT=0s" - - "OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317" + - "SPICEDB_OTEL_ENDPOINT=otel-collector:4317" + - "SPICEDB_OTEL_INSECURE=true" - "SPICEDB_ENABLE_PERFORMANCE_INSIGHT_METRICS=true" depends_on: migrate: diff --git a/docker-compose.yaml b/docker-compose.yaml index 7404dead4..53afe2421 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -75,7 +75,8 @@ services: - "SPICEDB_GRPC_LOG_REQUESTS_ENABLED=false" - "SPICEDB_GRPC_LOG_RESPONSES_ENABLED=false" - "SPICEDB_STREAMING_API_RESPONSE_DELAY_TIMEOUT=0s" - - "OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317" + - "SPICEDB_OTEL_ENDPOINT=otel-collector:4317" + - "SPICEDB_OTEL_INSECURE=true" - "SPICEDB_ENABLE_PERFORMANCE_INSIGHT_METRICS=true" 
depends_on: migrate: @@ -112,7 +113,8 @@ services: - "SPICEDB_GRPC_LOG_REQUESTS_ENABLED=true" - "SPICEDB_GRPC_LOG_RESPONSES_ENABLED=false" - "SPICEDB_STREAMING_API_RESPONSE_DELAY_TIMEOUT=0s" - - "OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317" + - "SPICEDB_OTEL_ENDPOINT=otel-collector:4317" + - "SPICEDB_OTEL_INSECURE=true" - "SPICEDB_ENABLE_PERFORMANCE_INSIGHT_METRICS=true" depends_on: migrate: diff --git a/docs/spicedb.md b/docs/spicedb.md index a2a87e9e2..3efc6bd7c 100644 --- a/docs/spicedb.md +++ b/docs/spicedb.md @@ -123,12 +123,6 @@ spicedb datastore gc [flags] --datastore-watch-buffer-write-timeout duration how long the watch buffer should queue before forcefully disconnecting the reader (default 1s) --datastore-watch-change-buffer-maximum-size string how much memory to reserve for the watch change buffer, either as a quantity of bytes (e.g. 5Gi) or a percentage of available memory (e.g. 50%). if this value is exceeded, the watch will error and must be restarted. (default "15%") --datastore-watch-connect-timeout duration how long the watch connection to the underlying datastore should wait before timing out (CockroachDB driver only) (default 1s) - --otel-endpoint string OpenTelemetry collector endpoint - the endpoint can also be set by using enviroment variables - --otel-insecure connect to the OpenTelemetry collector in plaintext - --otel-provider string OpenTelemetry provider for tracing ("none", "otlphttp", "otlpgrpc") (default "none") - --otel-sample-ratio float ratio of traces that are sampled (default 0.01) - --otel-service-name string service name for trace data (default "spicedb") - --otel-trace-propagator string OpenTelemetry trace propagation format ("b3", "w3c", "ottrace"). Add multiple propagators separated by comma. 
(default "w3c") --pprof-block-profile-rate int sets the block profile sampling rate (between 0 and 1) --pprof-mutex-profile-rate int sets the mutex profile sampling rate (between 0 and 1) --termination-log-path string local file path for Kubernetes terminationMessagePath; written with a JSON exit reason on TerminationError; disabled when empty @@ -157,12 +151,6 @@ spicedb datastore head [flags] ``` --datastore-engine string type of datastore to initialize ("cockroachdb", "mysql", "postgres", "spanner") (default "postgres") - --otel-endpoint string OpenTelemetry collector endpoint - the endpoint can also be set by using enviroment variables - --otel-insecure connect to the OpenTelemetry collector in plaintext - --otel-provider string OpenTelemetry provider for tracing ("none", "otlphttp", "otlpgrpc") (default "none") - --otel-sample-ratio float ratio of traces that are sampled (default 0.01) - --otel-service-name string service name for trace data (default "spicedb") - --otel-trace-propagator string OpenTelemetry trace propagation format ("b3", "w3c", "ottrace"). Add multiple propagators separated by comma. (default "w3c") --pprof-block-profile-rate int sets the block profile sampling rate (between 0 and 1) --pprof-mutex-profile-rate int sets the mutex profile sampling rate (between 0 and 1) --termination-log-path string local file path for Kubernetes terminationMessagePath; written with a JSON exit reason on TerminationError; disabled when empty @@ -198,12 +186,6 @@ spicedb datastore migrate [revision] [flags] --datastore-spanner-emulator-host string URI of spanner emulator instance used for development and testing (e.g. 
localhost:9010) --migration-backfill-batch-size uint number of items to migrate per iteration of a datastore backfill (default 1000) --migration-timeout duration defines a timeout for the execution of the migration, set to 1 hour by default (default 1h0m0s) - --otel-endpoint string OpenTelemetry collector endpoint - the endpoint can also be set by using enviroment variables - --otel-insecure connect to the OpenTelemetry collector in plaintext - --otel-provider string OpenTelemetry provider for tracing ("none", "otlphttp", "otlpgrpc") (default "none") - --otel-sample-ratio float ratio of traces that are sampled (default 0.01) - --otel-service-name string service name for trace data (default "spicedb") - --otel-trace-propagator string OpenTelemetry trace propagation format ("b3", "w3c", "ottrace"). Add multiple propagators separated by comma. (default "w3c") --pprof-block-profile-rate int sets the block profile sampling rate (between 0 and 1) --pprof-mutex-profile-rate int sets the mutex profile sampling rate (between 0 and 1) --termination-log-path string local file path for Kubernetes terminationMessagePath; written with a JSON exit reason on TerminationError; disabled when empty @@ -286,12 +268,6 @@ spicedb datastore repair [flags] --datastore-watch-buffer-write-timeout duration how long the watch buffer should queue before forcefully disconnecting the reader (default 1s) --datastore-watch-change-buffer-maximum-size string how much memory to reserve for the watch change buffer, either as a quantity of bytes (e.g. 5Gi) or a percentage of available memory (e.g. 50%). if this value is exceeded, the watch will error and must be restarted. 
(default "15%") --datastore-watch-connect-timeout duration how long the watch connection to the underlying datastore should wait before timing out (CockroachDB driver only) (default 1s) - --otel-endpoint string OpenTelemetry collector endpoint - the endpoint can also be set by using enviroment variables - --otel-insecure connect to the OpenTelemetry collector in plaintext - --otel-provider string OpenTelemetry provider for tracing ("none", "otlphttp", "otlpgrpc") (default "none") - --otel-sample-ratio float ratio of traces that are sampled (default 0.01) - --otel-service-name string service name for trace data (default "spicedb") - --otel-trace-propagator string OpenTelemetry trace propagation format ("b3", "w3c", "ottrace"). Add multiple propagators separated by comma. (default "w3c") --pprof-block-profile-rate int sets the block profile sampling rate (between 0 and 1) --pprof-mutex-profile-rate int sets the mutex profile sampling rate (between 0 and 1) --termination-log-path string local file path for Kubernetes terminationMessagePath; written with a JSON exit reason on TerminationError; disabled when empty @@ -603,12 +579,6 @@ spicedb serve-testing [flags] --max-lookup-resources-limit uint32 maximum number of resources that can be looked up in a single request (default 1000) --max-read-relationships-limit uint32 maximum number of relationships that can be read in a single request (default 1000) --max-relationship-context-size int maximum allowed size of the context to be stored in a relationship (default 25000) - --otel-endpoint string OpenTelemetry collector endpoint - the endpoint can also be set by using enviroment variables - --otel-insecure connect to the OpenTelemetry collector in plaintext - --otel-provider string OpenTelemetry provider for tracing ("none", "otlphttp", "otlpgrpc") (default "none") - --otel-sample-ratio float ratio of traces that are sampled (default 0.01) - --otel-service-name string service name for trace data (default "spicedb") - 
--otel-trace-propagator string OpenTelemetry trace propagation format ("b3", "w3c", "ottrace"). Add multiple propagators separated by comma. (default "w3c") --pprof-block-profile-rate int sets the block profile sampling rate (between 0 and 1) --pprof-mutex-profile-rate int sets the mutex profile sampling rate (between 0 and 1) --readonly-grpc-addr string address to listen on to serve read-only gRPC (default ":50052") diff --git a/go.mod b/go.mod index e47763174..7a5ca3166 100644 --- a/go.mod +++ b/go.mod @@ -97,7 +97,11 @@ require ( github.com/wasilibs/go-pgquery v0.0.0-20250409022910-10ac41983c07 go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.68.0 go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0 + go.opentelemetry.io/contrib/propagators/b3 v1.20.0 + go.opentelemetry.io/contrib/propagators/ot v1.20.0 go.opentelemetry.io/otel v1.43.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 go.opentelemetry.io/otel/exporters/prometheus v0.62.0 go.opentelemetry.io/otel/sdk v1.43.0 go.opentelemetry.io/otel/sdk/metric v1.43.0 @@ -247,11 +251,7 @@ require ( go.opencensus.io v0.24.0 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect go.opentelemetry.io/contrib/detectors/gcp v1.43.0 // indirect - go.opentelemetry.io/contrib/propagators/b3 v1.20.0 // indirect - go.opentelemetry.io/contrib/propagators/ot v1.20.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 // indirect go.opentelemetry.io/otel/metric v1.43.0 // indirect go.opentelemetry.io/proto/otlp v1.10.0 // indirect go.uber.org/automaxprocs v1.6.0 // indirect diff --git a/pkg/cmd/serve.go b/pkg/cmd/serve.go index 846c920dd..237e8541e 100644 --- a/pkg/cmd/serve.go +++ 
b/pkg/cmd/serve.go @@ -9,7 +9,6 @@ import ( "github.com/fatih/color" "github.com/jzelinskie/cobrautil/v2" - "github.com/jzelinskie/cobrautil/v2/cobraotel" "github.com/spf13/cobra" "github.com/authzed/spicedb/internal/telemetry" @@ -214,11 +213,8 @@ func RegisterServeFlags(cmd *cobra.Command, config *server.Config) error { return fmt.Errorf("could not register stored schema cache flags: %w", err) } - tracingFlags := nfs.FlagSet(BoldBlue("Tracing")) // Flags for tracing - // NOTE: cobraotel.New takes service name as an arg rather than command name. - otel := cobraotel.New("spicedb") - otel.RegisterFlags(tracingFlags) + server.RegisterOTelFlags(cmd, &config.OTel) loggingFlagSet := nfs.FlagSet(BoldBlue("Logging")) loggingFlagSet.BoolVar(&config.EnableRequestLogs, "grpc-log-requests-enabled", false, "enable logging of API request payloads") @@ -260,14 +256,14 @@ func NewServeCommand(programName string, config *server.Config) *cobra.Command { Long: "start a SpiceDB server", PreRunE: server.DefaultPreRunE(programName), RunE: termination.PublishError(func(cmd *cobra.Command, args []string) error { - server, err := config.Complete(cmd.Context()) + srv, err := config.Complete(cmd.Context()) if err != nil { return err } signalctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) defer stop() - return server.Run(signalctx) + return srv.Run(signalctx) }), Example: server.ServeExample(programName), } diff --git a/pkg/cmd/server/defaults.go b/pkg/cmd/server/defaults.go index febda4da7..606ac7df0 100644 --- a/pkg/cmd/server/defaults.go +++ b/pkg/cmd/server/defaults.go @@ -19,7 +19,6 @@ import ( grpclog "github.com/grpc-ecosystem/go-grpc-middleware/v2/interceptors/logging" "github.com/grpc-ecosystem/go-grpc-middleware/v2/interceptors/selector" "github.com/jzelinskie/cobrautil/v2" - "github.com/jzelinskie/cobrautil/v2/cobraotel" "github.com/jzelinskie/cobrautil/v2/cobraproclimits" "github.com/jzelinskie/cobrautil/v2/cobrazerolog" 
"github.com/prometheus/client_golang/prometheus" @@ -66,8 +65,8 @@ func ServeExample(programName string) string { ) } -// DefaultPreRunE sets up viper, zerolog, and OpenTelemetry flag handling for a -// command. +// DefaultPreRunE sets up viper dotenv loading, zerolog, memory and process +// limits, release version checks, and runtime instrumentation for a command. func DefaultPreRunE(programName string) cobrautil.CobraRunFunc { return cobrautil.CommandStack( cobrautil.SyncViperDotEnvPreRunE(programName, "spicedb.env", zerologr.New(&logging.Logger)), @@ -82,9 +81,6 @@ func DefaultPreRunE(programName string) cobrautil.CobraRunFunc { // and zero under the same load and 0.9 cobraproclimits.SetMemLimitRunE(memlimit.WithRatio(0.9)), cobraproclimits.SetProcLimitRunE(), - cobraotel.New("spicedb", - cobraotel.WithLogger(zerologr.New(&logging.Logger)), - ).RunE(), releases.CheckAndLogRunE(), runtime.RunE(), )
diff --git a/pkg/cmd/server/otel.go b/pkg/cmd/server/otel.go
new file mode 100644
index 000000000..f2f4f2f46
--- /dev/null
+++ b/pkg/cmd/server/otel.go
@@ -0,0 +1,180 @@
+package server
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/spf13/cobra"
+	"go.opentelemetry.io/contrib/propagators/b3"
+	"go.opentelemetry.io/contrib/propagators/ot"
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
+	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp"
+	"go.opentelemetry.io/otel/propagation"
+	"go.opentelemetry.io/otel/sdk/resource"
+	sdktrace "go.opentelemetry.io/otel/sdk/trace"
+	semconv "go.opentelemetry.io/otel/semconv/v1.7.0"
+
+	log "github.com/authzed/spicedb/internal/logging"
+)
+
+//go:generate go run github.com/ecordell/optgen -output zz_generated.oteloptions.go . OTelConfig
+
+// OTelShutdownTimeout bounds the combined ForceFlush and Shutdown performed by
+// ShutdownOTelProvider.
+const OTelShutdownTimeout = 5 * time.Second
+
+// otelShutdowner is the part of a tracer provider that ShutdownOTelProvider
+// relies on; InitOTelProvider passes its *sdktrace.TracerProvider as one.
+type otelShutdowner interface {
+	Shutdown(ctx context.Context) error
+	ForceFlush(ctx context.Context) error
+}
+
+// OTelConfig holds the tracing settings that RegisterOTelFlags binds to
+// command-line flags and that InitOTelProvider consumes.
+type OTelConfig struct {
+	Provider        string  `debugmap:"visible"`
+	Endpoint        string  `debugmap:"visible"`
+	ServiceName     string  `debugmap:"visible"`
+	TracePropagator string  `debugmap:"visible"`
+	UsePlaintext    bool    `debugmap:"visible"`
+	SampleRatio     float64 `debugmap:"visible"`
+}
+
+// RegisterOTelFlags registers all OpenTelemetry flags on cmd, binding them directly into cfg.
+func RegisterOTelFlags(cmd *cobra.Command, cfg *OTelConfig) {
+	f := cmd.Flags()
+	f.StringVar(&cfg.Provider, "otel-provider", "none", `OpenTelemetry provider for tracing ("none", "otlphttp", "otlpgrpc")`)
+	f.StringVar(&cfg.Endpoint, "otel-endpoint", "", `OpenTelemetry collector endpoint - the endpoint can also be set by using enviroment variables`)
+	f.StringVar(&cfg.ServiceName, "otel-service-name", "spicedb", `service name for trace data`)
+	f.StringVar(&cfg.TracePropagator, "otel-trace-propagator", "w3c", `OpenTelemetry trace propagation format ("b3", "w3c", "ottrace"). Add multiple propagators separated by comma.`)
+	f.BoolVar(&cfg.UsePlaintext, "otel-insecure", false, `connect to the OpenTelemetry collector in plaintext`)
+	f.Float64Var(&cfg.SampleRatio, "otel-sample-ratio", 0.01, `ratio of traces that are sampled`)
+}
+
+// InitOTelProvider builds a TracerProvider from cfg, sets it as the global OTel
+// provider, and returns a shutdown function.
+// When cfg.Provider is "none" or empty, the returned function is a no-op.
+//
+// The provider name is matched case-insensitively after trimming whitespace;
+// any other value is rejected with an error. The returned function does not
+// use ctx: it calls ShutdownOTelProvider with a background context, so it is
+// bounded only by OTelShutdownTimeout.
+func InitOTelProvider(ctx context.Context, cfg OTelConfig) (func() error, error) {
+	providerName := strings.TrimSpace(strings.ToLower(cfg.Provider))
+
+	if providerName == "none" || providerName == "" {
+		return func() error { return nil }, nil
+	}
+
+	res, err := resource.New(ctx,
+		resource.WithAttributes(semconv.ServiceNameKey.String(cfg.ServiceName)),
+		resource.WithProcess(),
+		resource.WithOS(),
+		resource.WithHost(),
+	)
+	if err != nil {
+		// resource.New documents that it may return a partially populated resource
+		// together with an error when one of the detectors fails. Incomplete trace
+		// metadata must not keep the server from starting, so only the absence of
+		// a resource is treated as fatal.
+		if res == nil {
+			return nil, fmt.Errorf("building OTel resource: %w", err)
+		}
+		log.Warn().Err(err).Msg("otel: resource detection incomplete; continuing with partial resource")
+	}
+
+	var exporter sdktrace.SpanExporter
+
+	switch providerName {
+	case "otlpgrpc":
+		var opts []otlptracegrpc.Option
+		if cfg.Endpoint != "" {
+			opts = append(opts, otlptracegrpc.WithEndpoint(cfg.Endpoint))
+		}
+		if cfg.UsePlaintext {
+			opts = append(opts, otlptracegrpc.WithInsecure())
+		}
+		exp, err := otlptracegrpc.New(ctx, opts...)
+		if err != nil {
+			return nil, fmt.Errorf("creating otlpgrpc exporter: %w", err)
+		}
+		exporter = exp
+
+	case "otlphttp":
+		var opts []otlptracehttp.Option
+		if cfg.Endpoint != "" {
+			opts = append(opts, otlptracehttp.WithEndpoint(cfg.Endpoint))
+		}
+		if cfg.UsePlaintext {
+			opts = append(opts, otlptracehttp.WithInsecure())
+		}
+		exp, err := otlptracehttp.New(ctx, opts...)
+		if err != nil {
+			return nil, fmt.Errorf("creating otlphttp exporter: %w", err)
+		}
+		exporter = exp
+
+	default:
+		return nil, fmt.Errorf(
+			"unknown otel-provider %q: must be one of: none, otlpgrpc, otlphttp",
+			providerName,
+		)
+	}
+
+	tp := sdktrace.NewTracerProvider(
+		sdktrace.WithSampler(sdktrace.ParentBased(sdktrace.TraceIDRatioBased(cfg.SampleRatio))),
+		sdktrace.WithBatcher(exporter),
+		sdktrace.WithResource(res),
+	)
+
+	otel.SetTracerProvider(tp)
+	otel.SetTextMapPropagator(buildPropagator(cfg.TracePropagator))
+
+	return func() error { return ShutdownOTelProvider(context.Background(), tp) }, nil
+}
+
+// ShutdownOTelProvider flushes all pending spans then shuts the provider down.
+// ForceFlush is always called before Shutdown. Safe to call with nil (no-op).
+//
+// Both calls share a single deadline of OTelShutdownTimeout derived from ctx.
+// A ForceFlush error is only logged; the error returned is the one from
+// Shutdown. The nil check only catches an untyped nil interface value.
+func ShutdownOTelProvider(ctx context.Context, provider otelShutdowner) error {
+	if provider == nil {
+		return nil
+	}
+	shutCtx, cancel := context.WithTimeout(ctx, OTelShutdownTimeout)
+	defer cancel()
+	log.Info().Msg("shutting down OTel provider")
+	if err := provider.ForceFlush(shutCtx); err != nil {
+		// Log but continue — Shutdown must still be attempted.
+		log.Warn().Err(err).Msg("otel: ForceFlush error during shutdown")
+	}
+	return provider.Shutdown(shutCtx)
+}
+
+// buildPropagator returns a composite TextMapPropagator for the comma-separated
+// propagator names. "b3" and "ottrace" select their respective propagators;
+// "w3c" and any unrecognized (or empty) name select W3C baggage plus trace
+// context.
+func buildPropagator(names string) propagation.TextMapPropagator {
+	var tmPropagators []propagation.TextMapPropagator
+	for _, p := range strings.Split(names, ",") {
+		switch strings.ToLower(strings.TrimSpace(p)) {
+		case "b3":
+			tmPropagators = append(tmPropagators, b3.New())
+		case "ottrace":
+			tmPropagators = append(tmPropagators, ot.OT{})
+		case "w3c":
+			fallthrough
+		default:
+			tmPropagators = append(tmPropagators, propagation.Baggage{})
+			tmPropagators = append(tmPropagators, propagation.TraceContext{})
+		}
+	}
+	return propagation.NewCompositeTextMapPropagator(tmPropagators...)
+}
diff --git a/pkg/cmd/server/otel_test.go b/pkg/cmd/server/otel_test.go
new file mode 100644
index 000000000..7f367c838
--- /dev/null
+++ b/pkg/cmd/server/otel_test.go
@@ -0,0 +1,166 @@
+// pkg/cmd/server/otel_test.go
+package server
+
+import (
+	"context"
+	"fmt"
+	"testing"
+
+	"github.com/spf13/cobra"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// mockShutdowner is a test double that records calls to Shutdown/ForceFlush.
+type mockShutdowner struct {
+	shutdownCalled   bool
+	forceFlushCalled bool
+	shutdownErr      error
+	forceFlushErr    error
+}
+
+func (m *mockShutdowner) Shutdown(_ context.Context) error {
+	m.shutdownCalled = true
+	return m.shutdownErr
+}
+
+func (m *mockShutdowner) ForceFlush(_ context.Context) error {
+	m.forceFlushCalled = true
+	return m.forceFlushErr
+}
+
+// callOrderShutdowner records the order Shutdown/ForceFlush are called.
+type callOrderShutdowner struct {
+	callLog *[]string
+}
+
+func (c *callOrderShutdowner) ForceFlush(_ context.Context) error {
+	*c.callLog = append(*c.callLog, "ForceFlush")
+	return nil
+}
+
+func (c *callOrderShutdowner) Shutdown(_ context.Context) error {
+	*c.callLog = append(*c.callLog, "Shutdown")
+	return nil
+}
+
+// makeTestCmd creates a bare cobra.Command for flag-registration tests.
+func makeTestCmd() *cobra.Command {
+	return &cobra.Command{Use: "test"}
+}
+
+// TestRegisterOTelFlags_AllFlagsPresent verifies every flag that
+// RegisterOTelFlags defines is registered under its expected name.
+func TestRegisterOTelFlags_AllFlagsPresent(t *testing.T) {
+	cmd := makeTestCmd()
+	RegisterOTelFlags(cmd, &OTelConfig{})
+
+	for _, name := range []string{
+		"otel-provider",
+		"otel-endpoint",
+		"otel-service-name",
+		"otel-trace-propagator",
+		"otel-insecure",
+		"otel-sample-ratio",
+	} {
+		assert.NotNil(t, cmd.Flags().Lookup(name), "expected flag %q to be registered", name)
+	}
+}
+
+// TestRegisterOTelFlags_ProviderDefault verifies otel-provider defaults to "none".
+func TestRegisterOTelFlags_ProviderDefault(t *testing.T) {
+	cmd := makeTestCmd()
+	cfg := &OTelConfig{}
+	RegisterOTelFlags(cmd, cfg)
+	assert.Equal(t, "none", cfg.Provider)
+}
+
+// TestInitOTelProvider_NoneSkipsInit verifies provider=none returns a no-op
+// shutdown closure without attempting any network connection.
+func TestInitOTelProvider_NoneSkipsInit(t *testing.T) {
+	cfg := OTelConfig{Provider: "none"}
+	shutdown, err := InitOTelProvider(t.Context(), cfg)
+	require.NoError(t, err)
+	require.NotNil(t, shutdown)
+	assert.NoError(t, shutdown())
+}
+
+// TestInitOTelProvider_UnknownProviderReturnsError verifies an unrecognized
+// provider string returns a non-nil error containing the bad value.
+func TestInitOTelProvider_UnknownProviderReturnsError(t *testing.T) {
+	cfg := OTelConfig{Provider: "bogusprovider", ServiceName: "test", TracePropagator: "w3c"}
+	_, err := InitOTelProvider(t.Context(), cfg)
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "bogusprovider")
+}
+
+// TestInitOTelProvider_OtlpGrpc_ValidEndpoint verifies otlpgrpc initializes
+// without error. No live collector required — connection errors surface only
+// on first export, not at initialization.
+func TestInitOTelProvider_OtlpGrpc_ValidEndpoint(t *testing.T) {
+	cfg := OTelConfig{
+		Provider:        "otlpgrpc",
+		Endpoint:        "localhost:4317",
+		ServiceName:     "spicedb-test",
+		TracePropagator: "w3c",
+		UsePlaintext:    true,
+		SampleRatio:     0.01,
+	}
+	shutdown, err := InitOTelProvider(t.Context(), cfg)
+	require.NoError(t, err)
+	require.NotNil(t, shutdown)
+	require.NoError(t, shutdown())
+}
+
+// TestInitOTelProvider_OtlpHttp_ValidEndpoint verifies otlphttp initializes
+// without error. No live collector required.
+func TestInitOTelProvider_OtlpHttp_ValidEndpoint(t *testing.T) {
+	cfg := OTelConfig{
+		Provider:        "otlphttp",
+		Endpoint:        "localhost:4318",
+		ServiceName:     "spicedb-test",
+		TracePropagator: "w3c",
+		UsePlaintext:    true,
+		SampleRatio:     0.01,
+	}
+	shutdown, err := InitOTelProvider(t.Context(), cfg)
+	require.NoError(t, err)
+	require.NotNil(t, shutdown)
+	t.Cleanup(func() { _ = shutdown() })
+}
+
+// TestShutdownOTelProvider_NilProvider_NoError verifies nil provider is safe.
+func TestShutdownOTelProvider_NilProvider_NoError(t *testing.T) {
+	err := ShutdownOTelProvider(t.Context(), nil)
+	assert.NoError(t, err)
+}
+
+// TestShutdownOTelProvider_CallsFlushThenShutdown verifies ForceFlush is
+// called before Shutdown, and both are called exactly once.
+func TestShutdownOTelProvider_CallsFlushThenShutdown(t *testing.T) {
+	var callOrder []string
+	provider := &callOrderShutdowner{callLog: &callOrder}
+
+	err := ShutdownOTelProvider(t.Context(), provider)
+	require.NoError(t, err)
+	require.Len(t, callOrder, 2)
+	assert.Equal(t, "ForceFlush", callOrder[0], "ForceFlush must be called before Shutdown")
+	assert.Equal(t, "Shutdown", callOrder[1])
+}
+
+// TestShutdownOTelProvider_ShutdownErrorPropagated verifies that an error
+// from Shutdown is returned to the caller.
+func TestShutdownOTelProvider_ShutdownErrorPropagated(t *testing.T) {
+	mock := &mockShutdowner{shutdownErr: fmt.Errorf("shutdown failed")}
+	err := ShutdownOTelProvider(t.Context(), mock)
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "shutdown failed")
+}
+
+// TestShutdownOTelProvider_ForceFlushErrorContinuesToShutdown verifies that
+// a ForceFlush error does not prevent Shutdown from being called.
+func TestShutdownOTelProvider_ForceFlushErrorContinuesToShutdown(t *testing.T) {
+	mock := &mockShutdowner{forceFlushErr: fmt.Errorf("flush failed")}
+	_ = ShutdownOTelProvider(t.Context(), mock)
+	assert.True(t, mock.shutdownCalled, "Shutdown must be called even when ForceFlush errors")
+}
diff --git a/pkg/cmd/server/server.go b/pkg/cmd/server/server.go index ea102870d..edad98446 100644 --- a/pkg/cmd/server/server.go +++ b/pkg/cmd/server/server.go @@ -163,6 +163,9 @@ type Config struct { TelemetryEndpoint string `debugmap:"visible"` TelemetryInterval time.Duration `debugmap:"visible"` + // OpenTelemetry tracing + OTel OTelConfig `debugmap:"visible"` + // Logs EnableRequestLogs bool `debugmap:"visible"` EnableResponseLogs bool `debugmap:"visible"` @@ -538,6 +541,12 @@ func (c *Config) Complete(ctx context.Context) (RunnableServer, error) { } closeables.AddWithoutError(metricsServer.Close) + otelShutdown, err := InitOTelProvider(ctx, c.OTel) + if err != nil { + return nil, fmt.Errorf("initializing OTel provider: %w", err) + } + closeables.AddWithError(otelShutdown) + log.Ctx(ctx).Info().Fields(c.FlatDebugMap()).Msg("configuration") return &completedServerConfig{ diff --git a/pkg/cmd/server/zz_generated.options.go b/pkg/cmd/server/zz_generated.options.go index 4b0cab41a..2ec081be2 100644 --- a/pkg/cmd/server/zz_generated.options.go +++ b/pkg/cmd/server/zz_generated.options.go @@ -111,6 +111,7 @@ func (c *Config) ToOption() ConfigOption { to.TelemetryCAOverridePath = c.TelemetryCAOverridePath to.TelemetryEndpoint = c.TelemetryEndpoint to.TelemetryInterval = c.TelemetryInterval + to.OTel = c.OTel to.EnableRequestLogs = c.EnableRequestLogs to.EnableResponseLogs = c.EnableResponseLogs to.DisableGRPCLatencyHistogram = c.DisableGRPCLatencyHistogram @@ -382,6 +383,13 @@ func (c *Config) DebugMap() map[string]any { } else { debugMap["TelemetryInterval"] = c.TelemetryInterval } + if dm, ok := any(&c.OTel).(interface { + DebugMap() map[string]any + }); ok { + debugMap["OTel"] 
= dm.DebugMap() + } else { + debugMap["OTel"] = c.OTel + } debugMap["EnableRequestLogs"] = c.EnableRequestLogs debugMap["EnableResponseLogs"] = c.EnableResponseLogs debugMap["DisableGRPCLatencyHistogram"] = c.DisableGRPCLatencyHistogram @@ -999,6 +1007,13 @@ func WithTelemetryInterval(telemetryInterval time.Duration) ConfigOption { } } +// WithOTel returns an option that can set OTel on a Config +func WithOTel(oTel OTelConfig) ConfigOption { + return func(c *Config) { + c.OTel = oTel + } +} + // WithEnableRequestLogs returns an option that can set EnableRequestLogs on a Config func WithEnableRequestLogs(enableRequestLogs bool) ConfigOption { return func(c *Config) { diff --git a/pkg/cmd/server/zz_generated.oteloptions.go b/pkg/cmd/server/zz_generated.oteloptions.go new file mode 100644 index 000000000..e6220a910 --- /dev/null +++ b/pkg/cmd/server/zz_generated.oteloptions.go @@ -0,0 +1,144 @@ +// Code generated by github.com/ecordell/optgen. DO NOT EDIT. +package server + +import defaults "github.com/creasty/defaults" + +type OTelConfigOption func(o *OTelConfig) + +// NewOTelConfigWithOptions creates a new OTelConfig with the passed in options set +func NewOTelConfigWithOptions(opts ...OTelConfigOption) *OTelConfig { + o := &OTelConfig{} + for _, opt := range opts { + opt(o) + } + return o +} + +// NewOTelConfigWithOptionsAndDefaults creates a new OTelConfig with the passed in options set starting from the defaults +func NewOTelConfigWithOptionsAndDefaults(opts ...OTelConfigOption) *OTelConfig { + o := &OTelConfig{} + defaults.MustSet(o) + for _, opt := range opts { + opt(o) + } + return o +} + +// ToOption returns a new OTelConfigOption that sets the values from the passed in OTelConfig +func (o *OTelConfig) ToOption() OTelConfigOption { + return func(to *OTelConfig) { + to.Provider = o.Provider + to.Endpoint = o.Endpoint + to.ServiceName = o.ServiceName + to.TracePropagator = o.TracePropagator + to.UsePlaintext = o.UsePlaintext + to.SampleRatio = o.SampleRatio + } 
+} + +// DebugMap returns a map form of OTelConfig for debugging +func (o *OTelConfig) DebugMap() map[string]any { + debugMap := map[string]any{} + if o.Provider == "" { + debugMap["Provider"] = "(empty)" + } else { + debugMap["Provider"] = o.Provider + } + if o.Endpoint == "" { + debugMap["Endpoint"] = "(empty)" + } else { + debugMap["Endpoint"] = o.Endpoint + } + if o.ServiceName == "" { + debugMap["ServiceName"] = "(empty)" + } else { + debugMap["ServiceName"] = o.ServiceName + } + if o.TracePropagator == "" { + debugMap["TracePropagator"] = "(empty)" + } else { + debugMap["TracePropagator"] = o.TracePropagator + } + debugMap["UsePlaintext"] = o.UsePlaintext + debugMap["SampleRatio"] = o.SampleRatio + return debugMap +} + +// FlatDebugMap returns a flattened map form of OTelConfig for debugging +// Nested maps are flattened using dot notation (e.g., "parent.child.field") +func (o *OTelConfig) FlatDebugMap() map[string]any { + var flatten func(m map[string]any) map[string]any + flatten = func(m map[string]any) map[string]any { + result := make(map[string]any, len(m)) + for key, value := range m { + childMap, ok := value.(map[string]any) + if ok { + for childKey, childValue := range flatten(childMap) { + result[key+"."+childKey] = childValue + } + continue + } + result[key] = value + } + return result + } + return flatten(o.DebugMap()) +} + +// OTelConfigWithOptions configures an existing OTelConfig with the passed in options set +func OTelConfigWithOptions(o *OTelConfig, opts ...OTelConfigOption) *OTelConfig { + for _, opt := range opts { + opt(o) + } + return o +} + +// WithOptions configures the receiver OTelConfig with the passed in options set +func (o *OTelConfig) WithOptions(opts ...OTelConfigOption) *OTelConfig { + for _, opt := range opts { + opt(o) + } + return o +} + +// WithProvider returns an option that can set Provider on a OTelConfig +func WithProvider(provider string) OTelConfigOption { + return func(o *OTelConfig) { + o.Provider = provider + } +} 
+ +// WithEndpoint returns an option that can set Endpoint on a OTelConfig +func WithEndpoint(endpoint string) OTelConfigOption { + return func(o *OTelConfig) { + o.Endpoint = endpoint + } +} + +// WithServiceName returns an option that can set ServiceName on a OTelConfig +func WithServiceName(serviceName string) OTelConfigOption { + return func(o *OTelConfig) { + o.ServiceName = serviceName + } +} + +// WithTracePropagator returns an option that can set TracePropagator on a OTelConfig +func WithTracePropagator(tracePropagator string) OTelConfigOption { + return func(o *OTelConfig) { + o.TracePropagator = tracePropagator + } +} + +// WithUsePlaintext returns an option that can set UsePlaintext on a OTelConfig +func WithUsePlaintext(usePlaintext bool) OTelConfigOption { + return func(o *OTelConfig) { + o.UsePlaintext = usePlaintext + } +} + +// WithSampleRatio returns an option that can set SampleRatio on a OTelConfig +func WithSampleRatio(sampleRatio float64) OTelConfigOption { + return func(o *OTelConfig) { + o.SampleRatio = sampleRatio + } +} diff --git a/pkg/cmd/util/util.go b/pkg/cmd/util/util.go index f7c605846..6d0f9a620 100644 --- a/pkg/cmd/util/util.go +++ b/pkg/cmd/util/util.go @@ -13,7 +13,6 @@ import ( "net/http" "time" - "github.com/jzelinskie/cobrautil/v2/cobraotel" _ "github.com/mostynb/go-grpc-compression/experimental/s2" // Register Snappy S2 compression "github.com/rs/zerolog" "github.com/spf13/cobra" @@ -439,8 +438,6 @@ func (d *disabledHTTPServer) Close() {} // so that they were shared across all commands, but this // made it difficult to organize the flags, so we lifted them here. func RegisterCommonFlags(cmd *cobra.Command) { - otel := cobraotel.New("spicedb") - otel.RegisterFlags(cmd.Flags()) termination.RegisterFlags(cmd.Flags()) runtime.RegisterFlags(cmd.Flags()) }