Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
The diff you're trying to view is too large. We only load the first 3000 changed files.
8 changes: 4 additions & 4 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,11 @@ require (
github.com/wasilibs/go-pgquery v0.0.0-20250409022910-10ac41983c07
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.68.0
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0
go.opentelemetry.io/contrib/propagators/b3 v1.20.0
go.opentelemetry.io/contrib/propagators/ot v1.20.0
go.opentelemetry.io/otel v1.43.0
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0
go.opentelemetry.io/otel/exporters/prometheus v0.62.0
go.opentelemetry.io/otel/sdk v1.43.0
go.opentelemetry.io/otel/sdk/metric v1.43.0
Expand Down Expand Up @@ -252,11 +256,7 @@ require (
go.opencensus.io v0.24.0 // indirect
go.opentelemetry.io/auto/sdk v1.2.1 // indirect
go.opentelemetry.io/contrib/detectors/gcp v1.43.0 // indirect
go.opentelemetry.io/contrib/propagators/b3 v1.20.0 // indirect
go.opentelemetry.io/contrib/propagators/ot v1.20.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 // indirect
go.opentelemetry.io/otel/metric v1.43.0 // indirect
go.opentelemetry.io/proto/otlp v1.10.0 // indirect
go.uber.org/automaxprocs v1.6.0 // indirect
Expand Down
20 changes: 13 additions & 7 deletions pkg/cmd/serve.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ import (

"github.com/fatih/color"
"github.com/jzelinskie/cobrautil/v2"
"github.com/jzelinskie/cobrautil/v2/cobraotel"
"github.com/spf13/cobra"

log "github.com/authzed/spicedb/internal/logging"
"github.com/authzed/spicedb/internal/telemetry"
"github.com/authzed/spicedb/pkg/cmd/datastore"
"github.com/authzed/spicedb/pkg/cmd/server"
Expand Down Expand Up @@ -219,11 +219,8 @@ func RegisterServeFlags(cmd *cobra.Command, config *server.Config) error {
return fmt.Errorf("could not register stored schema cache flags: %w", err)
}

tracingFlags := nfs.FlagSet(BoldBlue("Tracing"))
// Flags for tracing
// NOTE: cobraotel.New takes service name as an arg rather than command name.
otel := cobraotel.New("spicedb")
otel.RegisterFlags(tracingFlags)
server.RegisterOTelFlags(cmd)

loggingFlagSet := nfs.FlagSet(BoldBlue("Logging"))
loggingFlagSet.BoolVar(&config.EnableRequestLogs, "grpc-log-requests-enabled", false, "enable logging of API request payloads")
Expand Down Expand Up @@ -265,14 +262,23 @@ func NewServeCommand(programName string, config *server.Config) *cobra.Command {
Long: "start a SpiceDB server",
PreRunE: server.DefaultPreRunE(programName),
RunE: termination.PublishError(func(cmd *cobra.Command, args []string) error {
server, err := config.Complete(cmd.Context())
srv, err := config.Complete(cmd.Context())
if err != nil {
return err
}
signalctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
defer stop()

return server.Run(signalctx)
defer func() {
// Shutdown OTel provider to ensure all traces are flushed
if provider := server.OTelProviderFromContext(cmd.Context()); provider != nil {
if err := server.ShutdownOTelProvider(context.Background(), provider); err != nil {
log.Warn().Err(err).Msg("failed to cleanly shutdown OpenTelemetry provider")
}
}
}()

return srv.Run(signalctx)
}),
Example: server.ServeExample(programName),
}
Expand Down
5 changes: 1 addition & 4 deletions pkg/cmd/server/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ import (
grpclog "github.com/grpc-ecosystem/go-grpc-middleware/v2/interceptors/logging"
"github.com/grpc-ecosystem/go-grpc-middleware/v2/interceptors/selector"
"github.com/jzelinskie/cobrautil/v2"
"github.com/jzelinskie/cobrautil/v2/cobraotel"
"github.com/jzelinskie/cobrautil/v2/cobraproclimits"
"github.com/jzelinskie/cobrautil/v2/cobrazerolog"
"github.com/prometheus/client_golang/prometheus"
Expand Down Expand Up @@ -82,9 +81,7 @@ func DefaultPreRunE(programName string) cobrautil.CobraRunFunc {
// and zero under the same load and 0.9
cobraproclimits.SetMemLimitRunE(memlimit.WithRatio(0.9)),
cobraproclimits.SetProcLimitRunE(),
cobraotel.New("spicedb",
cobraotel.WithLogger(zerologr.New(&logging.Logger)),
).RunE(),
OTelPreRunE,
releases.CheckAndLogRunE(),
runtime.RunE(),
)
Expand Down
252 changes: 252 additions & 0 deletions pkg/cmd/server/otel.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
// pkg/cmd/server/otel.go
//
// This file replicates the OpenTelemetry provider initialization previously
// provided by github.com/jzelinskie/cobrautil/v2/cobraotel.
//
// Motivation:
// - Issue #712: otel-provider defaults to "none" so OTEL_* env vars alone
// cannot activate tracing. Native ownership lets us address this properly.
// - Issue #3095: cobraotel owns the TracerProvider with no way for the
// signal handler to call Shutdown/ForceFlush. Traces are dropped on
// SIGTERM. Native ownership closes this lifecycle gap.

package server

import (
"context"
"fmt"
"strings"
"time"

"github.com/spf13/cobra"
"go.opentelemetry.io/contrib/propagators/b3"
"go.opentelemetry.io/contrib/propagators/ot"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp"
"go.opentelemetry.io/otel/propagation"
"go.opentelemetry.io/otel/sdk/resource"
sdktrace "go.opentelemetry.io/otel/sdk/trace"
semconv "go.opentelemetry.io/otel/semconv/v1.7.0"
)

// otelProviderContextKey is the unexported context key for the TracerProvider.
type otelProviderContextKey struct{}

// OTelShutdownTimeout is the maximum time for flushing spans on shutdown.
const OTelShutdownTimeout = 30 * time.Second

// otelShutdowner is the interface satisfied by *sdktrace.TracerProvider.
// Using a local interface instead of the concrete type makes
// ShutdownOTelProvider testable with a mock without importing sdktrace.
type otelShutdowner interface {
Shutdown(ctx context.Context) error
ForceFlush(ctx context.Context) error
}

// RegisterOTelFlags registers all OpenTelemetry flags on cmd.
// The flags registered here are identical in name and default value to those
// previously registered by cobraotel.RegisterFlags.
func RegisterOTelFlags(cmd *cobra.Command) {
f := cmd.Flags()
f.String("otel-provider", "none",
`OpenTelemetry provider for tracing ("none", "otlpgrpc", "otlphttp")`)
f.String("otel-endpoint", "",
`OpenTelemetry collector endpoint - the endpoint can also be set by using enviroment variables`)
f.String("otel-service-name", "spicedb",
`service name for trace data`)
f.String("otel-trace-propagator", "w3c",
`OpenTelemetry trace propagation format ("b3", "w3c", "ottrace"). Add multiple propagators separated by comma.`)
f.Bool("otel-insecure", false,
`connect to the OpenTelemetry collector in plaintext`)
f.StringToString("otel-headers", nil,
`key=value pairs sent as headers to the OTel provider`)
f.Float64("otel-sample-ratio", 0.01,
`ratio of traces that are sampled`)

// Legacy flags
f.String("otel-jaeger-endpoint", "", "OpenTelemetry collector endpoint - the endpoint can also be set by using enviroment variables")
_ = f.MarkHidden("otel-jaeger-endpoint")
f.String("otel-jaeger-service-name", "spicedb", "service name for trace data")
_ = f.MarkHidden("otel-jaeger-service-name")
}

// InitOTelProvider reads otel-* flags from cmd, builds a TracerProvider, sets
// it as the global OTel provider, and returns it for lifecycle management.
//
// Returns (nil, nil) when otel-provider is "none". Callers must handle nil.
func InitOTelProvider(cmd *cobra.Command) (otelShutdowner, error) {
flags := cmd.Flags()

// cobra commands may have no context when called outside Execute()
ctx := cmd.Context()
if ctx == nil {
ctx = context.Background()
}

providerName, err := flags.GetString("otel-provider")
if err != nil {
return nil, fmt.Errorf("reading otel-provider: %w", err)
}
providerName = strings.TrimSpace(strings.ToLower(providerName))

if providerName == "none" || providerName == "" {
return nil, nil
}

endpoint, err := flags.GetString("otel-endpoint")
if err != nil {
return nil, fmt.Errorf("reading otel-endpoint: %w", err)
}

serviceName, err := flags.GetString("otel-service-name")
if err != nil {
return nil, fmt.Errorf("reading otel-service-name: %w", err)
}

propagatorName, err := flags.GetString("otel-trace-propagator")
if err != nil {
return nil, fmt.Errorf("reading otel-trace-propagator: %w", err)
}

insecureConn, err := flags.GetBool("otel-insecure")
if err != nil {
return nil, fmt.Errorf("reading otel-insecure: %w", err)
}

headers, err := flags.GetStringToString("otel-headers")
if err != nil {
return nil, fmt.Errorf("reading otel-headers: %w", err)
}

sampleRatio, err := flags.GetFloat64("otel-sample-ratio")
if err != nil {
return nil, fmt.Errorf("reading otel-sample-ratio: %w", err)
}

res, err := resource.New(ctx,
resource.WithAttributes(semconv.ServiceNameKey.String(serviceName)),
resource.WithProcess(),
resource.WithOS(),
resource.WithHost(),
)
if err != nil {
return nil, fmt.Errorf("building OTel resource: %w", err)
}

var exporter sdktrace.SpanExporter

switch providerName {
case "otlpgrpc":
opts := []otlptracegrpc.Option{}
if endpoint != "" {
opts = append(opts, otlptracegrpc.WithEndpoint(endpoint))
}
if insecureConn {
opts = append(opts, otlptracegrpc.WithInsecure())
}
if len(headers) > 0 {
opts = append(opts, otlptracegrpc.WithHeaders(headers))
}
exp, err := otlptracegrpc.New(ctx, opts...)
if err != nil {
return nil, fmt.Errorf("creating otlpgrpc exporter: %w", err)
}
exporter = exp

case "otlphttp":
opts := []otlptracehttp.Option{}
if endpoint != "" {
opts = append(opts, otlptracehttp.WithEndpoint(endpoint))
}
if insecureConn {
opts = append(opts, otlptracehttp.WithInsecure())
}
if len(headers) > 0 {
opts = append(opts, otlptracehttp.WithHeaders(headers))
}
exp, err := otlptracehttp.New(ctx, opts...)
if err != nil {
return nil, fmt.Errorf("creating otlphttp exporter: %w", err)
}
exporter = exp

default:
return nil, fmt.Errorf(
"unknown otel-provider %q: must be one of: none, otlpgrpc, otlphttp",
providerName,
)
}

tp := sdktrace.NewTracerProvider(
sdktrace.WithSampler(sdktrace.ParentBased(sdktrace.TraceIDRatioBased(sampleRatio))),
sdktrace.WithBatcher(exporter),
sdktrace.WithResource(res),
)

otel.SetTracerProvider(tp)
otel.SetTextMapPropagator(buildPropagator(propagatorName))

return tp, nil
}

// ShutdownOTelProvider flushes all pending spans then shuts the provider down.
// ForceFlush is always called before Shutdown. Safe to call with nil (no-op).
func ShutdownOTelProvider(ctx context.Context, provider otelShutdowner) error {
if provider == nil {
return nil
}

flushCtx, flushCancel := context.WithTimeout(ctx, OTelShutdownTimeout)
defer flushCancel()
if err := provider.ForceFlush(flushCtx); err != nil {
// Log but continue — Shutdown must still be attempted.
fmt.Printf("otel: ForceFlush error (continuing to Shutdown): %v\n", err)
}

shutCtx, shutCancel := context.WithTimeout(ctx, OTelShutdownTimeout)
defer shutCancel()
return provider.Shutdown(shutCtx)
}

// OTelPreRunE is a cobra.PreRunE function that initializes the OTel provider
// and stores it on the command context so the shutdown handler can retrieve it.
func OTelPreRunE(cmd *cobra.Command, _ []string) error {
provider, err := InitOTelProvider(cmd)
if err != nil {
return fmt.Errorf("initializing OTel provider: %w", err)
}
parent := cmd.Context()
if parent == nil {
parent = context.Background()
}
ctx := context.WithValue(parent, otelProviderContextKey{}, provider)
cmd.SetContext(ctx)
return nil
}

// OTelProviderFromContext retrieves the otelShutdowner stored by OTelPreRunE.
// Returns nil if no provider was initialized (e.g. otel-provider was "none").
func OTelProviderFromContext(ctx context.Context) otelShutdowner {
v, _ := ctx.Value(otelProviderContextKey{}).(otelShutdowner)
return v
}

// buildPropagator returns the TextMapPropagator for the given name.
func buildPropagator(names string) propagation.TextMapPropagator {
var tmPropagators []propagation.TextMapPropagator
for _, p := range strings.Split(names, ",") {
switch strings.ToLower(strings.TrimSpace(p)) {
case "b3":
tmPropagators = append(tmPropagators, b3.New())
case "ottrace":
tmPropagators = append(tmPropagators, ot.OT{})
case "w3c":
fallthrough
default:
tmPropagators = append(tmPropagators, propagation.Baggage{})
tmPropagators = append(tmPropagators, propagation.TraceContext{})
}
}
return propagation.NewCompositeTextMapPropagator(tmPropagators...)
}
Loading
Loading