From aad1fee1626c20e928c053d4ae06985b5848f1df Mon Sep 17 00:00:00 2001 From: Bogdan Stancu Date: Fri, 15 Aug 2025 15:03:50 +0300 Subject: [PATCH 01/11] rename overrides to overridesConfig Signed-off-by: Bogdan Stancu --- pkg/cortex/cortex.go | 2 +- pkg/cortex/modules.go | 92 +++++++++++++++++++++---------------------- 2 files changed, 47 insertions(+), 47 deletions(-) diff --git a/pkg/cortex/cortex.go b/pkg/cortex/cortex.go index f50fcf26e17..0c6fd01baa1 100644 --- a/pkg/cortex/cortex.go +++ b/pkg/cortex/cortex.go @@ -313,7 +313,7 @@ type Cortex struct { Server *server.Server Ring *ring.Ring TenantLimits validation.TenantLimits - Overrides *validation.Overrides + OverridesConfig *validation.Overrides Distributor *distributor.Distributor Ingester *ingester.Ingester Flusher *flusher.Flusher diff --git a/pkg/cortex/modules.go b/pkg/cortex/modules.go index a47888b8267..46a13cd42cd 100644 --- a/pkg/cortex/modules.go +++ b/pkg/cortex/modules.go @@ -67,7 +67,7 @@ const ( API string = "api" Ring string = "ring" RuntimeConfig string = "runtime-config" - Overrides string = "overrides" + OverridesConfig string = "overrides-config" OverridesExporter string = "overrides-exporter" Server string = "server" Distributor string = "distributor" @@ -200,8 +200,8 @@ func (t *Cortex) initRuntimeConfig() (services.Service, error) { return serv, err } -func (t *Cortex) initOverrides() (services.Service, error) { - t.Overrides = validation.NewOverrides(t.Cfg.LimitsConfig, t.TenantLimits) +func (t *Cortex) initOverridesConfig() (services.Service, error) { + t.OverridesConfig = validation.NewOverrides(t.Cfg.LimitsConfig, t.TenantLimits) // overrides don't have operational state, nor do they need to do anything more in starting/stopping phase, // so there is no need to return any service. 
return nil, nil @@ -233,7 +233,7 @@ func (t *Cortex) initDistributorService() (serv services.Service, err error) { // ruler's dependency) canJoinDistributorsRing := t.Cfg.isModuleEnabled(Distributor) || t.Cfg.isModuleEnabled(All) - t.Distributor, err = distributor.New(t.Cfg.Distributor, t.Cfg.IngesterClient, t.Overrides, t.Ring, canJoinDistributorsRing, prometheus.DefaultRegisterer, util_log.Logger) + t.Distributor, err = distributor.New(t.Cfg.Distributor, t.Cfg.IngesterClient, t.OverridesConfig, t.Ring, canJoinDistributorsRing, prometheus.DefaultRegisterer, util_log.Logger) if err != nil { return } @@ -255,7 +255,7 @@ func (t *Cortex) initGrpcClientServices() (serv services.Service, err error) { } func (t *Cortex) initDistributor() (serv services.Service, err error) { - t.API.RegisterDistributor(t.Distributor, t.Cfg.Distributor, t.Overrides) + t.API.RegisterDistributor(t.Distributor, t.Cfg.Distributor, t.OverridesConfig) return nil, nil } @@ -266,7 +266,7 @@ func (t *Cortex) initQueryable() (serv services.Service, err error) { querierRegisterer := prometheus.WrapRegistererWith(prometheus.Labels{"engine": "querier"}, prometheus.DefaultRegisterer) // Create a querier queryable and PromQL engine - t.QuerierQueryable, t.ExemplarQueryable, t.QuerierEngine = querier.New(t.Cfg.Querier, t.Overrides, t.Distributor, t.StoreQueryables, querierRegisterer, util_log.Logger, t.Overrides.QueryPartialData) + t.QuerierQueryable, t.ExemplarQueryable, t.QuerierEngine = querier.New(t.Cfg.Querier, t.OverridesConfig, t.Distributor, t.StoreQueryables, querierRegisterer, util_log.Logger, t.OverridesConfig.QueryPartialData) // Use distributor as default MetadataQuerier t.MetadataQuerier = t.Distributor @@ -422,12 +422,12 @@ func (t *Cortex) initStoreQueryables() (services.Service, error) { //nolint:revive // I prefer this form over removing 'else', because it allows q to have smaller scope. 
var queriable prom_storage.Queryable - if q, err := initBlockStoreQueryable(t.Cfg, t.Overrides, prometheus.DefaultRegisterer); err != nil { + if q, err := initBlockStoreQueryable(t.Cfg, t.OverridesConfig, prometheus.DefaultRegisterer); err != nil { return nil, fmt.Errorf("failed to initialize querier: %v", err) } else { queriable = q if t.Cfg.Querier.EnableParquetQueryable { - pq, err := querier.NewParquetQueryable(t.Cfg.Querier, t.Cfg.BlocksStorage, t.Overrides, q, util_log.Logger, prometheus.DefaultRegisterer) + pq, err := querier.NewParquetQueryable(t.Cfg.Querier, t.Cfg.BlocksStorage, t.OverridesConfig, q, util_log.Logger, prometheus.DefaultRegisterer) if err != nil { return nil, fmt.Errorf("failed to initialize parquet querier: %v", err) } @@ -477,7 +477,7 @@ func (t *Cortex) initIngesterService() (serv services.Service, err error) { t.Cfg.Ingester.QueryIngestersWithin = t.Cfg.Querier.QueryIngestersWithin t.tsdbIngesterConfig() - t.Ingester, err = ingester.New(t.Cfg.Ingester, t.Overrides, prometheus.DefaultRegisterer, util_log.Logger, t.ResourceMonitor) + t.Ingester, err = ingester.New(t.Cfg.Ingester, t.OverridesConfig, prometheus.DefaultRegisterer, util_log.Logger, t.ResourceMonitor) if err != nil { return } @@ -497,7 +497,7 @@ func (t *Cortex) initFlusher() (serv services.Service, err error) { t.Flusher, err = flusher.New( t.Cfg.Flusher, t.Cfg.Ingester, - t.Overrides, + t.OverridesConfig, prometheus.DefaultRegisterer, util_log.Logger, ) @@ -532,7 +532,7 @@ func (t *Cortex) initQueryFrontendTripperware() (serv services.Service, err erro queryRangeMiddlewares, cache, err := queryrange.Middlewares( t.Cfg.QueryRange, util_log.Logger, - t.Overrides, + t.OverridesConfig, queryrange.PrometheusResponseExtractor{}, prometheus.DefaultRegisterer, queryAnalyzer, @@ -548,7 +548,7 @@ func (t *Cortex) initQueryFrontendTripperware() (serv services.Service, err erro instantQueryMiddlewares, err := instantquery.Middlewares( util_log.Logger, - t.Overrides, + t.OverridesConfig, 
instantQueryCodec, queryAnalyzer, t.Cfg.Querier.LookbackDelta, @@ -565,7 +565,7 @@ func (t *Cortex) initQueryFrontendTripperware() (serv services.Service, err erro instantQueryMiddlewares, prometheusCodec, instantQueryCodec, - t.Overrides, + t.OverridesConfig, queryAnalyzer, t.Cfg.Querier.DefaultEvaluationInterval, t.Cfg.Querier.MaxSubQuerySteps, @@ -583,7 +583,7 @@ func (t *Cortex) initQueryFrontendTripperware() (serv services.Service, err erro func (t *Cortex) initQueryFrontend() (serv services.Service, err error) { retry := transport.NewRetry(t.Cfg.QueryRange.MaxRetries, prometheus.DefaultRegisterer) - roundTripper, frontendV1, frontendV2, err := frontend.InitFrontend(t.Cfg.Frontend, t.Overrides, t.Cfg.Server.GRPCListenPort, util_log.Logger, prometheus.DefaultRegisterer, retry) + roundTripper, frontendV1, frontendV2, err := frontend.InitFrontend(t.Cfg.Frontend, t.OverridesConfig, t.Cfg.Server.GRPCListenPort, util_log.Logger, prometheus.DefaultRegisterer, retry) if err != nil { return nil, err } @@ -618,7 +618,7 @@ func (t *Cortex) initRulerStorage() (serv services.Service, err error) { return } - t.RulerStorage, err = ruler.NewRuleStore(context.Background(), t.Cfg.RulerStorage, t.Overrides, rules.FileLoader{}, util_log.Logger, prometheus.DefaultRegisterer) + t.RulerStorage, err = ruler.NewRuleStore(context.Background(), t.Cfg.RulerStorage, t.OverridesConfig, rules.FileLoader{}, util_log.Logger, prometheus.DefaultRegisterer) return } @@ -664,15 +664,15 @@ func (t *Cortex) initRuler() (serv services.Service, err error) { } queryEngine := engine.New(opts, t.Cfg.Ruler.ThanosEngine, rulerRegisterer) - managerFactory := ruler.DefaultTenantManagerFactory(t.Cfg.Ruler, t.Cfg.ExternalPusher, t.Cfg.ExternalQueryable, queryEngine, t.Overrides, metrics, prometheus.DefaultRegisterer) - manager, err = ruler.NewDefaultMultiTenantManager(t.Cfg.Ruler, t.Overrides, managerFactory, metrics, prometheus.DefaultRegisterer, util_log.Logger) + managerFactory := 
ruler.DefaultTenantManagerFactory(t.Cfg.Ruler, t.Cfg.ExternalPusher, t.Cfg.ExternalQueryable, queryEngine, t.OverridesConfig, metrics, prometheus.DefaultRegisterer) + manager, err = ruler.NewDefaultMultiTenantManager(t.Cfg.Ruler, t.OverridesConfig, managerFactory, metrics, prometheus.DefaultRegisterer, util_log.Logger) } else { rulerRegisterer := prometheus.WrapRegistererWith(prometheus.Labels{"engine": "ruler"}, prometheus.DefaultRegisterer) // TODO: Consider wrapping logger to differentiate from querier module logger - queryable, _, engine := querier.New(t.Cfg.Querier, t.Overrides, t.Distributor, t.StoreQueryables, rulerRegisterer, util_log.Logger, t.Overrides.RulesPartialData) + queryable, _, engine := querier.New(t.Cfg.Querier, t.OverridesConfig, t.Distributor, t.StoreQueryables, rulerRegisterer, util_log.Logger, t.OverridesConfig.RulesPartialData) - managerFactory := ruler.DefaultTenantManagerFactory(t.Cfg.Ruler, t.Distributor, queryable, engine, t.Overrides, metrics, prometheus.DefaultRegisterer) - manager, err = ruler.NewDefaultMultiTenantManager(t.Cfg.Ruler, t.Overrides, managerFactory, metrics, prometheus.DefaultRegisterer, util_log.Logger) + managerFactory := ruler.DefaultTenantManagerFactory(t.Cfg.Ruler, t.Distributor, queryable, engine, t.OverridesConfig, metrics, prometheus.DefaultRegisterer) + manager, err = ruler.NewDefaultMultiTenantManager(t.Cfg.Ruler, t.OverridesConfig, managerFactory, metrics, prometheus.DefaultRegisterer, util_log.Logger) } if err != nil { @@ -685,7 +685,7 @@ func (t *Cortex) initRuler() (serv services.Service, err error) { prometheus.DefaultRegisterer, util_log.Logger, t.RulerStorage, - t.Overrides, + t.OverridesConfig, ) if err != nil { return @@ -720,12 +720,12 @@ func (t *Cortex) initAlertManager() (serv services.Service, err error) { t.Cfg.Alertmanager.ShardingRing.ListenPort = t.Cfg.Server.GRPCListenPort // Initialise the store. 
- store, err := alertstore.NewAlertStore(context.Background(), t.Cfg.AlertmanagerStorage, t.Overrides, util_log.Logger, prometheus.DefaultRegisterer) + store, err := alertstore.NewAlertStore(context.Background(), t.Cfg.AlertmanagerStorage, t.OverridesConfig, util_log.Logger, prometheus.DefaultRegisterer) if err != nil { return } - t.Alertmanager, err = alertmanager.NewMultitenantAlertmanager(&t.Cfg.Alertmanager, store, t.Overrides, util_log.Logger, prometheus.DefaultRegisterer) + t.Alertmanager, err = alertmanager.NewMultitenantAlertmanager(&t.Cfg.Alertmanager, store, t.OverridesConfig, util_log.Logger, prometheus.DefaultRegisterer) if err != nil { return } @@ -736,14 +736,14 @@ func (t *Cortex) initAlertManager() (serv services.Service, err error) { func (t *Cortex) initParquetConverter() (serv services.Service, err error) { t.Cfg.ParquetConverter.Ring.ListenPort = t.Cfg.Server.GRPCListenPort - return parquetconverter.NewConverter(t.Cfg.ParquetConverter, t.Cfg.BlocksStorage, t.Cfg.Compactor.BlockRanges.ToMilliseconds(), util_log.Logger, prometheus.DefaultRegisterer, t.Overrides) + return parquetconverter.NewConverter(t.Cfg.ParquetConverter, t.Cfg.BlocksStorage, t.Cfg.Compactor.BlockRanges.ToMilliseconds(), util_log.Logger, prometheus.DefaultRegisterer, t.OverridesConfig) } func (t *Cortex) initCompactor() (serv services.Service, err error) { t.Cfg.Compactor.ShardingRing.ListenPort = t.Cfg.Server.GRPCListenPort ingestionReplicationFactor := t.Cfg.Ingester.LifecyclerConfig.RingConfig.ReplicationFactor - t.Compactor, err = compactor.NewCompactor(t.Cfg.Compactor, t.Cfg.BlocksStorage, util_log.Logger, prometheus.DefaultRegisterer, t.Overrides, ingestionReplicationFactor) + t.Compactor, err = compactor.NewCompactor(t.Cfg.Compactor, t.Cfg.BlocksStorage, util_log.Logger, prometheus.DefaultRegisterer, t.OverridesConfig, ingestionReplicationFactor) if err != nil { return } @@ -756,7 +756,7 @@ func (t *Cortex) initCompactor() (serv services.Service, err error) { func (t 
*Cortex) initStoreGateway() (serv services.Service, err error) { t.Cfg.StoreGateway.ShardingRing.ListenPort = t.Cfg.Server.GRPCListenPort - t.StoreGateway, err = storegateway.NewStoreGateway(t.Cfg.StoreGateway, t.Cfg.BlocksStorage, t.Overrides, t.Cfg.Server.LogLevel, util_log.Logger, prometheus.DefaultRegisterer, t.ResourceMonitor) + t.StoreGateway, err = storegateway.NewStoreGateway(t.Cfg.StoreGateway, t.Cfg.BlocksStorage, t.OverridesConfig, t.Cfg.Server.LogLevel, util_log.Logger, prometheus.DefaultRegisterer, t.ResourceMonitor) if err != nil { return nil, err } @@ -798,7 +798,7 @@ func (t *Cortex) initMemberlistKV() (services.Service, error) { func (t *Cortex) initTenantDeletionAPI() (services.Service, error) { // t.RulerStorage can be nil when running in single-binary mode, and rule storage is not configured. - tenantDeletionAPI, err := purger.NewTenantDeletionAPI(t.Cfg.BlocksStorage, t.Overrides, util_log.Logger, prometheus.DefaultRegisterer) + tenantDeletionAPI, err := purger.NewTenantDeletionAPI(t.Cfg.BlocksStorage, t.OverridesConfig, util_log.Logger, prometheus.DefaultRegisterer) if err != nil { return nil, err } @@ -813,7 +813,7 @@ func (t *Cortex) initQueryScheduler() (services.Service, error) { tenant.WithDefaultResolver(tenantfederation.NewRegexValidator()) } - s, err := scheduler.NewScheduler(t.Cfg.QueryScheduler, t.Overrides, util_log.Logger, prometheus.DefaultRegisterer) + s, err := scheduler.NewScheduler(t.Cfg.QueryScheduler, t.OverridesConfig, util_log.Logger, prometheus.DefaultRegisterer) if err != nil { return nil, errors.Wrap(err, "query-scheduler init") } @@ -857,7 +857,7 @@ func (t *Cortex) setupModuleManager() error { mm.RegisterModule(RuntimeConfig, t.initRuntimeConfig, modules.UserInvisibleModule) mm.RegisterModule(MemberlistKV, t.initMemberlistKV, modules.UserInvisibleModule) mm.RegisterModule(Ring, t.initRing, modules.UserInvisibleModule) - mm.RegisterModule(Overrides, t.initOverrides, modules.UserInvisibleModule) + 
mm.RegisterModule(OverridesConfig, t.initOverridesConfig, modules.UserInvisibleModule) mm.RegisterModule(OverridesExporter, t.initOverridesExporter) mm.RegisterModule(Distributor, t.initDistributor) mm.RegisterModule(DistributorService, t.initDistributorService, modules.UserInvisibleModule) @@ -889,33 +889,33 @@ func (t *Cortex) setupModuleManager() error { MemberlistKV: {API}, RuntimeConfig: {API}, Ring: {API, RuntimeConfig, MemberlistKV}, - Overrides: {RuntimeConfig}, + OverridesConfig: {RuntimeConfig}, OverridesExporter: {RuntimeConfig}, Distributor: {DistributorService, API, GrpcClientService}, - DistributorService: {Ring, Overrides}, - Ingester: {IngesterService, Overrides, API}, - IngesterService: {Overrides, RuntimeConfig, MemberlistKV, ResourceMonitor}, - Flusher: {Overrides, API}, - Queryable: {Overrides, DistributorService, Overrides, Ring, API, StoreQueryable, MemberlistKV}, + DistributorService: {Ring, OverridesConfig}, + Ingester: {IngesterService, OverridesConfig, API}, + IngesterService: {OverridesConfig, RuntimeConfig, MemberlistKV, ResourceMonitor}, + Flusher: {OverridesConfig, API}, + Queryable: {OverridesConfig, DistributorService, OverridesConfig, Ring, API, StoreQueryable, MemberlistKV}, Querier: {TenantFederation}, - StoreQueryable: {Overrides, Overrides, MemberlistKV, GrpcClientService}, - QueryFrontendTripperware: {API, Overrides}, + StoreQueryable: {OverridesConfig, OverridesConfig, MemberlistKV, GrpcClientService}, + QueryFrontendTripperware: {API, OverridesConfig}, QueryFrontend: {QueryFrontendTripperware}, - QueryScheduler: {API, Overrides}, - Ruler: {DistributorService, Overrides, StoreQueryable, RulerStorage}, - RulerStorage: {Overrides}, + QueryScheduler: {API, OverridesConfig}, + Ruler: {DistributorService, OverridesConfig, StoreQueryable, RulerStorage}, + RulerStorage: {OverridesConfig}, Configs: {API}, - AlertManager: {API, MemberlistKV, Overrides}, - Compactor: {API, MemberlistKV, Overrides}, - ParquetConverter: {API, 
MemberlistKV, Overrides}, - StoreGateway: {API, Overrides, MemberlistKV, ResourceMonitor}, - TenantDeletion: {API, Overrides}, + AlertManager: {API, MemberlistKV, OverridesConfig}, + Compactor: {API, MemberlistKV, OverridesConfig}, + ParquetConverter: {API, MemberlistKV, OverridesConfig}, + StoreGateway: {API, OverridesConfig, MemberlistKV, ResourceMonitor}, + TenantDeletion: {API, OverridesConfig}, Purger: {TenantDeletion}, TenantFederation: {Queryable}, All: {QueryFrontend, Querier, Ingester, Distributor, Purger, StoreGateway, Ruler, Compactor, AlertManager}, } if t.Cfg.ExternalPusher != nil && t.Cfg.ExternalQueryable != nil { - deps[Ruler] = []string{Overrides, RulerStorage} + deps[Ruler] = []string{OverridesConfig, RulerStorage} } for mod, targets := range deps { if err := mm.AddDependency(mod, targets...); err != nil { From 60d09fbab68667b665aead274be003fb93088745 Mon Sep 17 00:00:00 2001 From: Bogdan Stancu Date: Fri, 15 Aug 2025 21:06:14 +0300 Subject: [PATCH 02/11] initial overrides api draft Signed-off-by: Bogdan Stancu --- CHANGELOG.md | 1 + docs/api/_index.md | 61 +++ docs/configuration/config-file-reference.md | 251 +++++++++ integration/overrides_test.go | 323 ++++++++++++ pkg/api/api.go | 12 + pkg/cortex/cortex.go | 4 + pkg/cortex/modules.go | 18 +- pkg/overrides/api.go | 179 +++++++ pkg/overrides/limits.go | 132 +++++ pkg/overrides/overrides.go | 204 +++++++ pkg/overrides/overrides_test.go | 556 ++++++++++++++++++++ 11 files changed, 1740 insertions(+), 1 deletion(-) create mode 100644 integration/overrides_test.go create mode 100644 pkg/overrides/api.go create mode 100644 pkg/overrides/limits.go create mode 100644 pkg/overrides/overrides.go create mode 100644 pkg/overrides/overrides_test.go diff --git a/CHANGELOG.md b/CHANGELOG.md index f05aaf715f2..c155b01edf8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ * [CHANGE] StoreGateway/Alertmanager: Add default 5s connection timeout on client. 
#6603 * [CHANGE] Ingester: Remove EnableNativeHistograms config flag and instead gate keep through new per-tenant limit at ingestion. #6718 * [CHANGE] Validate a tenantID when to use a single tenant resolver. #6727 +* [FEATURE] Add new Overrides API module. Rename the old `overrides` module to `overrides-config`. * [FEATURE] Distributor: Add an experimental `-distributor.otlp.allow-delta-temporality` flag to ingest delta temporality otlp metrics. #6934 * [FEATURE] Query Frontend: Add dynamic interval size for query splitting. This is enabled by configuring experimental flags `querier.max-shards-per-query` and/or `querier.max-fetched-data-duration-per-query`. The split interval size is dynamically increased to maintain a number of shards and total duration fetched below the configured values. #6458 * [FEATURE] Querier/Ruler: Add `query_partial_data` and `rules_partial_data` limits to allow queries/rules to be evaluated with data from a single zone, if other zones are not available. #6526 diff --git a/docs/api/_index.md b/docs/api/_index.md index 64a6aab3f0c..5e04828f52e 100644 --- a/docs/api/_index.md +++ b/docs/api/_index.md @@ -66,6 +66,9 @@ For the sake of clarity, in this document we have grouped API endpoints by servi | [Delete Alertmanager configuration](#delete-alertmanager-configuration) | Alertmanager || `DELETE /api/v1/alerts` | | [Tenant delete request](#tenant-delete-request) | Purger || `POST /purger/delete_tenant` | | [Tenant delete status](#tenant-delete-status) | Purger || `GET /purger/delete_tenant_status` | +| [Get user overrides](#get-user-overrides) | Overrides || `GET /api/v1/user-overrides` | +| [Set user overrides](#set-user-overrides) | Overrides || `PUT /api/v1/user-overrides` | +| [Delete user overrides](#delete-user-overrides) | Overrides || `DELETE /api/v1/user-overrides` | | [Store-gateway ring status](#store-gateway-ring-status) | Store-gateway || `GET /store-gateway/ring` | | [Compactor ring status](#compactor-ring-status) | Compactor || `GET
/compactor/ring` | | [Get rule files](#get-rule-files) | Configs API (deprecated) || `GET /api/prom/configs/rules` | @@ -872,6 +875,64 @@ Returns status of tenant deletion. Output format to be defined. Experimental. _Requires [authentication](#authentication)._ +## Overrides + +The Overrides service provides an API for managing user overrides. + +### Get user overrides + +``` +GET /api/v1/user-overrides +``` + +Get the current overrides for the authenticated tenant. Returns the overrides in JSON format. + +_Requires [authentication](#authentication)._ + +### Set user overrides + +``` +PUT /api/v1/user-overrides +``` + +Set or update overrides for the authenticated tenant. The request body should contain a JSON object with the override values. + +_Requires [authentication](#authentication)._ + +### Delete user overrides + +``` +DELETE /api/v1/user-overrides +``` + +Delete all overrides for the authenticated tenant. This will revert the tenant to using default values. + +_Requires [authentication](#authentication)._ + +#### Example request body for PUT + +```json +{ + "ingestion_rate": 50000, + "max_global_series_per_user": 1000000, + "ruler_max_rules_per_rule_group": 100 +} +``` + +#### Supported limits + +The following limits can be modified via the API: +- `max_global_series_per_user` +- `max_global_series_per_metric` +- `ingestion_rate` +- `ingestion_burst_size` +- `ruler_max_rules_per_rule_group` +- `ruler_max_rule_groups_per_tenant` + +#### Hard limits + +Overrides are validated against hard limits defined in the runtime configuration file. If a requested override exceeds the hard limit for the tenant, the request will be rejected with a 400 status code. 
+ ## Store-gateway ### Store-gateway ring status diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index c4ef8305ed6..ed68e4ea184 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -5465,6 +5465,257 @@ thanos_engine: [optimizers: | default = "default"] ``` +### `overrides` + +The `overrides` configures the Cortex overrides API for managing user overrides. + +```yaml +# Enable the overrides module. +# CLI flag: -overrides.enabled +[enabled: | default = false] + +# Path to the runtime configuration file. +# CLI flag: -overrides.runtime-config-file +[runtime_config_file: | default = "runtime.yaml"] + +# Backend storage to use. Supported backends are: s3, gcs, azure, swift. +# CLI flag: -overrides.backend +[backend: | default = "s3"] + +s3: + # The S3 bucket endpoint. It could be an AWS S3 endpoint listed at + # https://docs.aws.amazon.com/general/latest/gr/s3.html or the address of an + # S3-compatible service in hostname:port format. + # CLI flag: -overrides.s3.endpoint + [endpoint: | default = ""] + + # S3 region. If unset, the client will issue a S3 GetBucketLocation API call + # to autodetect it. + # CLI flag: -overrides.s3.region + [region: | default = ""] + + # S3 bucket name + # CLI flag: -overrides.s3.bucket-name + [bucket_name: | default = ""] + + # S3 secret access key + # CLI flag: -overrides.s3.secret-access-key + [secret_access_key: | default = ""] + + # S3 access key ID + # CLI flag: -overrides.s3.access-key-id + [access_key_id: | default = ""] + + # If enabled, use http:// for the S3 endpoint instead of https://. This could + # be useful in local dev/test environments while using an S3-compatible + # backend storage, like Minio. + # CLI flag: -overrides.s3.insecure + [insecure: | default = false] + + # The signature version to use for authenticating against S3. Supported values + # are: v4, v2. 
+ # CLI flag: -overrides.s3.signature-version + [signature_version: | default = "v4"] + + # The s3 bucket lookup style. Supported values are: auto, virtual-hosted, + # path. + # CLI flag: -overrides.s3.bucket-lookup-type + [bucket_lookup_type: | default = "auto"] + + # If true, attach MD5 checksum when upload objects and S3 uses MD5 checksum + # algorithm to verify the provided digest. If false, use CRC32C algorithm + # instead. + # CLI flag: -overrides.s3.send-content-md5 + [send_content_md5: | default = true] + + http: + # The time an idle connection will remain idle before closing. + # CLI flag: -overrides.s3.http.idle-conn-timeout + [idle_conn_timeout: | default = 1m30s] + + # The amount of time the client will wait for a servers response headers. + # CLI flag: -overrides.s3.http.response-header-timeout + [response_header_timeout: | default = 2m] + + # Maximum time to wait for a TLS handshake. 0 means no limit. + # CLI flag: -overrides.s3.tls-handshake-timeout + [tls_handshake_timeout: | default = 10s] + + # The time to wait for a server's first response headers after fully writing + # the request headers if the request has an Expect header. 0 to send the + # request body immediately. + # CLI flag: -overrides.s3.expect-continue-timeout + [expect_continue_timeout: | default = 1s] + + # Maximum number of idle connections across all hosts. 0 means no limit. + # CLI flag: -overrides.s3.max-idle-conns + [max_idle_conns: | default = 100] + + # Maximum number of idle connections per host. 0 means no limit. + # CLI flag: -overrides.s3.max-idle-conns-per-host + [max_idle_conns_per_host: | default = 100] + +gcs: + # GCS bucket name + # CLI flag: -overrides.gcs.bucket-name + [bucket_name: | default = ""] + + # JSON either from a file or inline. 
+ # CLI flag: -overrides.gcs.service-account + [service_account: | default = ""] + +azure: + # Azure storage account name + # CLI flag: -overrides.azure.account-name + [account_name: | default = ""] + + # Azure storage account key + # CLI flag: -overrides.azure.account-key + [account_key: | default = ""] + + # Azure storage container name + # CLI flag: -overrides.azure.container-name + [container_name: | default = ""] + + # Azure storage endpoint suffix without schema. The account name will be + # prefixed to this value to create the FQDN. If set to empty string, default + # endpoint suffix will be used. + # CLI flag: -overrides.azure.endpoint-suffix + [endpoint_suffix: | default = ""] + + # Azure storage max retry attempts + # CLI flag: -overrides.azure.max-retries + [max_retries: | default = 20] + + # Azure storage user domain + # CLI flag: -overrides.azure.user-domain + [user_domain: | default = ""] + + # Azure storage tenant ID + # CLI flag: -overrides.azure.tenant-id + [tenant_id: | default = ""] + + # Azure storage client ID + # CLI flag: -overrides.azure.client-id + [client_id: | default = ""] + + # Azure storage client secret + # CLI flag: -overrides.azure.client-secret + [client_secret: | default = ""] + + # Azure storage subscription ID + # CLI flag: -overrides.azure.subscription-id + [subscription_id: | default = ""] + + # Azure storage environment + # CLI flag: -overrides.azure.environment + [environment: | default = "AzurePublicCloud"] + + # Azure storage max retry attempts + # CLI flag: -overrides.azure.max-retries + [max_retries: | default = 20] + + # The time an idle connection will remain idle before closing. + # CLI flag: -overrides.azure.idle-conn-timeout + [idle_conn_timeout: | default = 1m30s] + + # The amount of time the client will wait for a servers response headers. + # CLI flag: -overrides.azure.response-header-timeout + [response_header_timeout: | default = 2m] + + # Maximum time to wait for a TLS handshake. 0 means no limit. 
+ # CLI flag: -overrides.azure.tls-handshake-timeout + [tls_handshake_timeout: | default = 10s] + + # The time to wait for a server's first response headers after fully writing + # the request headers if the request has an Expect header. 0 to send the + # request body immediately. + # CLI flag: -overrides.azure.expect-continue-timeout + [expect_continue_timeout: | default = 1s] + + # Maximum number of idle connections across all hosts. 0 means no limit. + # CLI flag: -overrides.azure.max-idle-conns + [max_idle_conns: | default = 100] + + # Maximum number of idle connections per host. 0 means no limit. + # CLI flag: -overrides.azure.max-idle-conns-per-host + [max_idle_conns_per_host: | default = 100] + +swift: + # OpenStack Swift authentication API version. 0 to autodetect. + # CLI flag: -overrides.swift.auth-version + [auth_version: | default = 0] + + # OpenStack Swift authentication URL + # CLI flag: -overrides.swift.auth-url + [auth_url: | default = ""] + + # OpenStack Swift username + # CLI flag: -overrides.swift.username + [username: | default = ""] + + # OpenStack Swift user's domain name + # CLI flag: -overrides.swift.user-domain-name + [user_domain_name: | default = ""] + + # OpenStack Swift user's domain ID + # CLI flag: -overrides.swift.user-domain-id + [user_domain_id: | default = ""] + + # OpenStack Swift user ID + # CLI flag: -overrides.swift.user-id + [user_id: | default = ""] + + # OpenStack Swift user's password + # CLI flag: -overrides.swift.password + [password: | default = ""] + + # OpenStack Swift user's domain ID + # CLI flag: -overrides.swift.domain-id + [domain_id: | default = ""] + + # OpenStack Swift domain name + # CLI flag: -overrides.swift.domain-name + [domain_name: | default = ""] + + # OpenStack Swift project ID + # CLI flag: -overrides.swift.project-id + [project_id: | default = ""] + + # OpenStack Swift project name + # CLI flag: -overrides.swift.project-name + [project_name: | default = ""] + + # OpenStack Swift project domain ID + 
# CLI flag: -overrides.swift.project-domain-id + [project_domain_id: | default = ""] + + # OpenStack Swift project domain name + # CLI flag: -overrides.swift.project-domain-name + [project_domain_name: | default = ""] + + # OpenStack Swift region name + # CLI flag: -overrides.swift.region-name + [region_name: | default = ""] + + # OpenStack Swift container name + # CLI flag: -overrides.swift.container-name + [container_name: | default = ""] + + # OpenStack Swift max retry attempts + # CLI flag: -overrides.swift.max-retries + [max_retries: | default = 3] + + # OpenStack Swift connect timeout + # CLI flag: -overrides.swift.connect-timeout + [connect_timeout: | default = 10s] + + # OpenStack Swift request timeout + # CLI flag: -overrides.swift.request-timeout + [request_timeout: | default = 5s] +``` +``` + ### `ruler_storage_config` The `ruler_storage_config` configures the Cortex ruler storage backend. diff --git a/integration/overrides_test.go b/integration/overrides_test.go new file mode 100644 index 00000000000..4c68d9b8f42 --- /dev/null +++ b/integration/overrides_test.go @@ -0,0 +1,323 @@ +//go:build integration +// +build integration + +package integration + +import ( + "bytes" + "context" + "encoding/json" + "net/http" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/thanos-io/objstore/providers/s3" + "gopkg.in/yaml.v3" + + "github.com/cortexproject/cortex/integration/e2e" + e2edb "github.com/cortexproject/cortex/integration/e2e/db" + "github.com/cortexproject/cortex/integration/e2ecortex" +) + +func TestOverridesAPIWithRunningCortex(t *testing.T) { + s, err := e2e.NewScenario(networkName) + require.NoError(t, err) + defer s.Close() + + consul := e2edb.NewConsulWithName("consul") + require.NoError(t, s.StartAndWaitReady(consul)) + + minio := e2edb.NewMinio(9000, "cortex") + require.NoError(t, s.StartAndWaitReady(minio)) + + runtimeConfig := map[string]interface{}{ + "overrides": map[string]interface{}{ 
+ "user1": map[string]interface{}{ + "ingestion_rate": 5000, + }, + }, + } + runtimeConfigData, err := yaml.Marshal(runtimeConfig) + require.NoError(t, err) + + s3Client, err := s3.NewBucketWithConfig(nil, s3.Config{ + Endpoint: minio.HTTPEndpoint(), + Insecure: true, + Bucket: "cortex", + AccessKey: e2edb.MinioAccessKey, + SecretKey: e2edb.MinioSecretKey, + }, "overrides-test", nil) + require.NoError(t, err) + + require.NoError(t, s3Client.Upload(context.Background(), "runtime.yaml", bytes.NewReader(runtimeConfigData))) + + baseFlags := mergeFlags(AlertmanagerLocalFlags(), BlocksStorageFlags()) + flags := mergeFlags( + baseFlags, + map[string]string{ + "-target": "overrides", + "-overrides.enabled": "true", + "-overrides.runtime-config-file": "runtime.yaml", + "-overrides.backend": "s3", + "-overrides.s3.access-key-id": e2edb.MinioAccessKey, + "-overrides.s3.secret-access-key": e2edb.MinioSecretKey, + "-overrides.s3.bucket-name": "cortex", + "-overrides.s3.endpoint": minio.NetworkHTTPEndpoint(), + "-overrides.s3.insecure": "true", + "-ring.store": "consul", + "-consul.hostname": consul.NetworkHTTPEndpoint(), + }, + ) + + cortexSvc := e2ecortex.NewSingleBinary("cortex-overrides", flags, "") + require.NoError(t, s.StartAndWaitReady(cortexSvc)) + + t.Run("GET overrides for existing user", func(t *testing.T) { + req, err := http.NewRequest("GET", "http://"+cortexSvc.HTTPEndpoint()+"/api/v1/user-overrides", nil) + require.NoError(t, err) + req.Header.Set("X-Scope-OrgID", "user1") + + resp, err := http.DefaultClient.Do(req) + require.NoError(t, err) + defer resp.Body.Close() + + assert.Equal(t, http.StatusOK, resp.StatusCode) + + var overrides map[string]interface{} + err = json.NewDecoder(resp.Body).Decode(&overrides) + require.NoError(t, err) + + assert.Equal(t, float64(5000), overrides["ingestion_rate"]) + }) + + t.Run("GET overrides for non-existing user", func(t *testing.T) { + req, err := http.NewRequest("GET", 
"http://"+cortexSvc.HTTPEndpoint()+"/api/v1/user-overrides", nil) + require.NoError(t, err) + req.Header.Set("X-Scope-OrgID", "user2") + + resp, err := http.DefaultClient.Do(req) + require.NoError(t, err) + defer resp.Body.Close() + + assert.Equal(t, http.StatusOK, resp.StatusCode) + + var overrides map[string]interface{} + err = json.NewDecoder(resp.Body).Decode(&overrides) + require.NoError(t, err) + + assert.Empty(t, overrides) + }) + + t.Run("PUT overrides for new user", func(t *testing.T) { + newOverrides := map[string]interface{}{ + "ingestion_rate": 6000, + "ingestion_burst_size": 7000, + } + requestBody, err := json.Marshal(newOverrides) + require.NoError(t, err) + + req, err := http.NewRequest("PUT", "http://"+cortexSvc.HTTPEndpoint()+"/api/v1/user-overrides", bytes.NewReader(requestBody)) + require.NoError(t, err) + req.Header.Set("X-Scope-OrgID", "user3") + req.Header.Set("Content-Type", "application/json") + + resp, err := http.DefaultClient.Do(req) + require.NoError(t, err) + defer resp.Body.Close() + + assert.Equal(t, http.StatusOK, resp.StatusCode) + + req, err = http.NewRequest("GET", "http://"+cortexSvc.HTTPEndpoint()+"/api/v1/user-overrides", nil) + require.NoError(t, err) + req.Header.Set("X-Scope-OrgID", "user3") + + resp, err = http.DefaultClient.Do(req) + require.NoError(t, err) + defer resp.Body.Close() + + assert.Equal(t, http.StatusOK, resp.StatusCode) + + var savedOverrides map[string]interface{} + err = json.NewDecoder(resp.Body).Decode(&savedOverrides) + require.NoError(t, err) + + assert.Equal(t, float64(6000), savedOverrides["ingestion_rate"]) + assert.Equal(t, float64(7000), savedOverrides["ingestion_burst_size"]) + }) + + t.Run("PUT overrides with invalid limit", func(t *testing.T) { + invalidOverrides := map[string]interface{}{ + "invalid_limit": 5000, + } + requestBody, err := json.Marshal(invalidOverrides) + require.NoError(t, err) + + req, err := http.NewRequest("PUT", "http://"+cortexSvc.HTTPEndpoint()+"/api/v1/user-overrides", 
bytes.NewReader(requestBody)) + require.NoError(t, err) + req.Header.Set("X-Scope-OrgID", "user4") + req.Header.Set("Content-Type", "application/json") + + resp, err := http.DefaultClient.Do(req) + require.NoError(t, err) + defer resp.Body.Close() + + assert.Equal(t, http.StatusBadRequest, resp.StatusCode) + }) + + t.Run("PUT overrides with invalid JSON", func(t *testing.T) { + req, err := http.NewRequest("PUT", "http://"+cortexSvc.HTTPEndpoint()+"/api/v1/user-overrides", bytes.NewReader([]byte("invalid json"))) + require.NoError(t, err) + req.Header.Set("X-Scope-OrgID", "user5") + req.Header.Set("Content-Type", "application/json") + + resp, err := http.DefaultClient.Do(req) + require.NoError(t, err) + defer resp.Body.Close() + + assert.Equal(t, http.StatusBadRequest, resp.StatusCode) + }) + + t.Run("DELETE overrides", func(t *testing.T) { + req, err := http.NewRequest("DELETE", "http://"+cortexSvc.HTTPEndpoint()+"/api/v1/user-overrides", nil) + require.NoError(t, err) + req.Header.Set("X-Scope-OrgID", "user1") + + resp, err := http.DefaultClient.Do(req) + require.NoError(t, err) + defer resp.Body.Close() + + assert.Equal(t, http.StatusOK, resp.StatusCode) + + req, err = http.NewRequest("GET", "http://"+cortexSvc.HTTPEndpoint()+"/api/v1/user-overrides", nil) + require.NoError(t, err) + req.Header.Set("X-Scope-OrgID", "user1") + + resp, err = http.DefaultClient.Do(req) + require.NoError(t, err) + defer resp.Body.Close() + + assert.Equal(t, http.StatusOK, resp.StatusCode) + + var overrides map[string]interface{} + err = json.NewDecoder(resp.Body).Decode(&overrides) + require.NoError(t, err) + + assert.Empty(t, overrides) + }) + + require.NoError(t, s.Stop(cortexSvc)) +} + +func TestOverridesAPITenantExtraction(t *testing.T) { + s, err := e2e.NewScenario(networkName) + require.NoError(t, err) + defer s.Close() + + consul := e2edb.NewConsulWithName("consul-tenant") + require.NoError(t, s.StartAndWaitReady(consul)) + + minio := e2edb.NewMinio(9010, "cortex") + 
require.NoError(t, s.StartAndWaitReady(minio)) + + baseFlags := mergeFlags(AlertmanagerLocalFlags(), BlocksStorageFlags()) + flags := mergeFlags( + baseFlags, + map[string]string{ + "-target": "overrides", + "-overrides.enabled": "true", + "-overrides.runtime-config-file": "runtime.yaml", + "-overrides.backend": "s3", + "-overrides.s3.access-key-id": e2edb.MinioAccessKey, + "-overrides.s3.secret-access-key": e2edb.MinioSecretKey, + "-overrides.s3.bucket-name": "cortex", + "-overrides.s3.endpoint": minio.NetworkHTTPEndpoint(), + "-overrides.s3.insecure": "true", + "-ring.store": "consul", + "-consul.hostname": consul.NetworkHTTPEndpoint(), + }, + ) + + cortexSvc := e2ecortex.NewSingleBinary("cortex-overrides-tenant", flags, "") + require.NoError(t, s.StartAndWaitReady(cortexSvc)) + + t.Run("no tenant header", func(t *testing.T) { + req, err := http.NewRequest("GET", "http://"+cortexSvc.HTTPEndpoint()+"/api/v1/user-overrides", nil) + require.NoError(t, err) + + resp, err := http.DefaultClient.Do(req) + require.NoError(t, err) + defer resp.Body.Close() + + assert.Equal(t, http.StatusUnauthorized, resp.StatusCode) + }) + + t.Run("empty tenant header", func(t *testing.T) { + req, err := http.NewRequest("GET", "http://"+cortexSvc.HTTPEndpoint()+"/api/v1/user-overrides", nil) + require.NoError(t, err) + req.Header.Set("X-Scope-OrgID", "") + + resp, err := http.DefaultClient.Do(req) + require.NoError(t, err) + defer resp.Body.Close() + + assert.Equal(t, http.StatusUnauthorized, resp.StatusCode) + }) + + require.NoError(t, s.Stop(cortexSvc)) +} + +func TestOverridesAPIFilesystemBackendRejected(t *testing.T) { + s, err := e2e.NewScenario(networkName) + require.NoError(t, err) + defer s.Close() + + t.Run("filesystem backend should be rejected", func(t *testing.T) { + baseFlags := mergeFlags(AlertmanagerLocalFlags(), BlocksStorageFlags()) + flags := mergeFlags( + baseFlags, + map[string]string{ + "-target": "overrides", + "-overrides.enabled": "true", + 
"-overrides.runtime-config-file": "runtime.yaml", + "-overrides.backend": "filesystem", + "-ring.store": "consul", + "-consul.hostname": "localhost:8500", + }, + ) + + cortexSvc := e2ecortex.NewSingleBinary("cortex-overrides-filesystem", flags, "") + + err = s.StartAndWaitReady(cortexSvc) + if err == nil { + t.Error("Expected Cortex to fail to start with filesystem backend, but it started successfully") + require.NoError(t, s.Stop(cortexSvc)) + } else { + t.Logf("Expected failure with filesystem backend: %v", err) + } + }) + + t.Run("no backend specified should be rejected", func(t *testing.T) { + baseFlags := mergeFlags(AlertmanagerLocalFlags(), BlocksStorageFlags()) + flags := mergeFlags( + baseFlags, + map[string]string{ + "-target": "overrides", + "-overrides.enabled": "true", + "-overrides.runtime-config-file": "runtime.yaml", + "-ring.store": "consul", + "-consul.hostname": "localhost:8500", + }, + ) + + cortexSvc := e2ecortex.NewSingleBinary("cortex-overrides-no-backend", flags, "") + + err = s.StartAndWaitReady(cortexSvc) + if err == nil { + t.Error("Expected Cortex to fail to start with no backend specified, but it started successfully") + require.NoError(t, s.Stop(cortexSvc)) + } else { + t.Logf("Expected failure with no backend specified: %v", err) + } + }) +} diff --git a/pkg/api/api.go b/pkg/api/api.go index 1339caeff28..e0887d6b22e 100644 --- a/pkg/api/api.go +++ b/pkg/api/api.go @@ -30,6 +30,7 @@ import ( frontendv2 "github.com/cortexproject/cortex/pkg/frontend/v2" "github.com/cortexproject/cortex/pkg/frontend/v2/frontendv2pb" "github.com/cortexproject/cortex/pkg/ingester/client" + "github.com/cortexproject/cortex/pkg/overrides" "github.com/cortexproject/cortex/pkg/purger" "github.com/cortexproject/cortex/pkg/querier" "github.com/cortexproject/cortex/pkg/ring" @@ -385,6 +386,17 @@ func (a *API) RegisterRulerAPI(r *ruler.API) { a.RegisterRoute(path.Join(a.cfg.LegacyHTTPPrefix, "/rules/{namespace}"), http.HandlerFunc(r.DeleteNamespace), true, "DELETE") 
} +// RegisterOverrides registers routes associated with the Overrides API +func (a *API) RegisterOverrides(o *overrides.API) { + // Register individual overrides API routes with the main API + a.RegisterRoute("/api/v1/user-overrides", http.HandlerFunc(o.GetOverrides), true, "GET") + a.RegisterRoute("/api/v1/user-overrides", http.HandlerFunc(o.SetOverrides), true, "PUT") + a.RegisterRoute("/api/v1/user-overrides", http.HandlerFunc(o.DeleteOverrides), true, "DELETE") + + // Add link to the index page + a.indexPage.AddLink(SectionAdminEndpoints, "/api/v1/user-overrides", "User Overrides API") +} + // RegisterRing registers the ring UI page associated with the distributor for writes. func (a *API) RegisterRing(r *ring.Ring) { a.indexPage.AddLink(SectionAdminEndpoints, "/ingester/ring", "Ingester Ring Status") diff --git a/pkg/cortex/cortex.go b/pkg/cortex/cortex.go index 0c6fd01baa1..6de1e11cbc9 100644 --- a/pkg/cortex/cortex.go +++ b/pkg/cortex/cortex.go @@ -40,6 +40,7 @@ import ( frontendv1 "github.com/cortexproject/cortex/pkg/frontend/v1" "github.com/cortexproject/cortex/pkg/ingester" "github.com/cortexproject/cortex/pkg/ingester/client" + "github.com/cortexproject/cortex/pkg/overrides" "github.com/cortexproject/cortex/pkg/parquetconverter" "github.com/cortexproject/cortex/pkg/querier" "github.com/cortexproject/cortex/pkg/querier/tenantfederation" @@ -126,6 +127,7 @@ type Config struct { RuntimeConfig runtimeconfig.Config `yaml:"runtime_config"` MemberlistKV memberlist.KVConfig `yaml:"memberlist"` QueryScheduler scheduler.Config `yaml:"query_scheduler"` + Overrides overrides.Config `yaml:"overrides"` Tracing tracing.Config `yaml:"tracing"` } @@ -175,6 +177,7 @@ func (c *Config) RegisterFlags(f *flag.FlagSet) { c.RuntimeConfig.RegisterFlags(f) c.MemberlistKV.RegisterFlags(f) c.QueryScheduler.RegisterFlags(f) + c.Overrides.RegisterFlags(f) c.Tracing.RegisterFlags(f) } @@ -314,6 +317,7 @@ type Cortex struct { Ring *ring.Ring TenantLimits validation.TenantLimits 
OverridesConfig *validation.Overrides + Overrides *overrides.API Distributor *distributor.Distributor Ingester *ingester.Ingester Flusher *flusher.Flusher diff --git a/pkg/cortex/modules.go b/pkg/cortex/modules.go index 46a13cd42cd..d86e61efa5f 100644 --- a/pkg/cortex/modules.go +++ b/pkg/cortex/modules.go @@ -36,6 +36,7 @@ import ( "github.com/cortexproject/cortex/pkg/frontend" "github.com/cortexproject/cortex/pkg/frontend/transport" "github.com/cortexproject/cortex/pkg/ingester" + "github.com/cortexproject/cortex/pkg/overrides" "github.com/cortexproject/cortex/pkg/parquetconverter" "github.com/cortexproject/cortex/pkg/purger" "github.com/cortexproject/cortex/pkg/querier" @@ -68,6 +69,7 @@ const ( Ring string = "ring" RuntimeConfig string = "runtime-config" OverridesConfig string = "overrides-config" + Overrides string = "overrides" OverridesExporter string = "overrides-exporter" Server string = "server" Distributor string = "distributor" @@ -207,6 +209,18 @@ func (t *Cortex) initOverridesConfig() (services.Service, error) { return nil, nil } +func (t *Cortex) initOverrides() (services.Service, error) { + overridesAPI, err := overrides.New(t.Cfg.Overrides, util_log.Logger, prometheus.DefaultRegisterer) + if err != nil { + return nil, fmt.Errorf("failed to create overrides API: %w", err) + } + t.Overrides = overridesAPI + + t.API.RegisterOverrides(overridesAPI) + + return overridesAPI, nil +} + func (t *Cortex) initOverridesExporter() (services.Service, error) { if t.Cfg.isModuleEnabled(OverridesExporter) && t.TenantLimits == nil { // This target isn't enabled by default ("all") and requires per-tenant limits to @@ -858,6 +872,7 @@ func (t *Cortex) setupModuleManager() error { mm.RegisterModule(MemberlistKV, t.initMemberlistKV, modules.UserInvisibleModule) mm.RegisterModule(Ring, t.initRing, modules.UserInvisibleModule) mm.RegisterModule(OverridesConfig, t.initOverridesConfig, modules.UserInvisibleModule) + mm.RegisterModule(Overrides, t.initOverrides) 
// RuntimeConfigFile is the on-disk shape of the runtime configuration document
// that the overrides API reads, edits and writes back.
//
// The API rewrites the whole document on every change, so any top-level key
// this struct does not model would be dropped on the next write. Extra
// captures those keys so they survive the round trip unchanged in content
// (formatting and YAML comments are not preserved).
type RuntimeConfigFile struct {
	// Overrides maps tenant ID -> limit name -> value.
	Overrides map[string]map[string]interface{} `yaml:"overrides"`
	// HardOverrides maps tenant ID -> limit name -> maximum value the tenant
	// may set through the API.
	HardOverrides map[string]map[string]interface{} `yaml:"hard_overrides"`
	// Extra holds every other top-level key of the document.
	Extra map[string]interface{} `yaml:",inline"`
}
*http.Request) { + userID, _, err := tenant.ExtractTenantIDFromHTTPRequest(r) + if err != nil { + http.Error(w, err.Error(), StatusUnauthorized) + return + } + + // Read overrides from bucket storage + overrides, err := a.getOverridesFromBucket(r.Context(), userID) + if err != nil { + http.Error(w, err.Error(), StatusInternalServerError) + return + } + + w.Header().Set("Content-Type", "application/json") + if err := json.NewEncoder(w).Encode(overrides); err != nil { + level.Error(a.logger).Log("msg", "failed to encode overrides response", "err", err) + http.Error(w, "Internal server error", http.StatusInternalServerError) + return + } +} + +// SetOverrides updates overrides for a specific tenant +func (a *API) SetOverrides(w http.ResponseWriter, r *http.Request) { + userID, _, err := tenant.ExtractTenantIDFromHTTPRequest(r) + if err != nil { + http.Error(w, err.Error(), StatusUnauthorized) + return + } + + var overrides map[string]interface{} + if err := json.NewDecoder(r.Body).Decode(&overrides); err != nil { + http.Error(w, ErrInvalidJSON, StatusBadRequest) + return + } + + // Validate that only allowed limits are being changed + if err := ValidateOverrides(overrides); err != nil { + http.Error(w, err.Error(), StatusBadRequest) + return + } + + // Validate that values don't exceed hard limits from runtime config + if err := a.validateHardLimits(overrides, userID); err != nil { + http.Error(w, err.Error(), StatusBadRequest) + return + } + + // Write overrides to bucket storage + if err := a.setOverridesToBucket(r.Context(), userID, overrides); err != nil { + http.Error(w, err.Error(), StatusInternalServerError) + return + } + + w.WriteHeader(StatusOK) +} + +// DeleteOverrides removes tenant-specific overrides +func (a *API) DeleteOverrides(w http.ResponseWriter, r *http.Request) { + userID, _, err := tenant.ExtractTenantIDFromHTTPRequest(r) + if err != nil { + http.Error(w, err.Error(), StatusUnauthorized) + return + } + + if err := 
a.deleteOverridesFromBucket(r.Context(), userID); err != nil { + http.Error(w, err.Error(), StatusInternalServerError) + return + } + + w.WriteHeader(StatusOK) +} + +// getOverridesFromBucket reads overrides for a specific tenant from the runtime config file +func (a *API) getOverridesFromBucket(ctx context.Context, userID string) (map[string]interface{}, error) { + reader, err := a.bucketClient.Get(ctx, a.runtimeConfigPath) + if err != nil { + return map[string]interface{}{}, nil + } + defer reader.Close() + + var config RuntimeConfigFile + if err := yaml.NewDecoder(reader).Decode(&config); err != nil { + return nil, fmt.Errorf("%s: %w", ErrRuntimeConfig, err) + } + + if config.Overrides != nil { + if tenantOverrides, exists := config.Overrides[userID]; exists { + return tenantOverrides, nil + } + } + + return map[string]interface{}{}, nil +} + +// setOverridesToBucket writes overrides for a specific tenant to the runtime config file +func (a *API) setOverridesToBucket(ctx context.Context, userID string, overrides map[string]interface{}) error { + var config RuntimeConfigFile + reader, err := a.bucketClient.Get(ctx, a.runtimeConfigPath) + if err == nil { + defer reader.Close() + if err := yaml.NewDecoder(reader).Decode(&config); err != nil { + return fmt.Errorf("%s: %w", ErrRuntimeConfig, err) + } + } + + if config.Overrides == nil { + config.Overrides = make(map[string]map[string]interface{}) + } + + config.Overrides[userID] = overrides + + data, err := yaml.Marshal(config) + if err != nil { + return fmt.Errorf("%s: %w", ErrRuntimeConfig, err) + } + + return a.bucketClient.Upload(ctx, a.runtimeConfigPath, bytes.NewReader(data)) +} + +// deleteOverridesFromBucket removes overrides for a specific tenant from the runtime config file +func (a *API) deleteOverridesFromBucket(ctx context.Context, userID string) error { + reader, err := a.bucketClient.Get(ctx, a.runtimeConfigPath) + if err != nil { + return nil + } + defer reader.Close() + + var config RuntimeConfigFile 
const (
	// ErrInvalidLimits is the message prefix returned when a request names
	// limits that are not in AllowedLimits.
	ErrInvalidLimits = "the following limits cannot be modified via the overrides API"
)

// AllowedLimits defines the limits that can be modified via the overrides API.
var AllowedLimits = []string{
	"max_global_series_per_user",
	"max_global_series_per_metric",
	"ingestion_rate",
	"ingestion_burst_size",
	"ruler_max_rules_per_rule_group",
	"ruler_max_rule_groups_per_tenant",
}

// ValidateOverrides reports an error naming every key of overrides that is not
// an allowed limit, or nil when all keys are allowed. Values are not inspected.
// The names are listed in map iteration order, which is not stable.
func ValidateOverrides(overrides map[string]interface{}) error {
	var rejected []string
	for limitName := range overrides {
		if IsLimitAllowed(limitName) {
			continue
		}
		rejected = append(rejected, limitName)
	}

	if len(rejected) == 0 {
		return nil
	}
	return fmt.Errorf("%s: %s", ErrInvalidLimits, strings.Join(rejected, ", "))
}

// GetAllowedLimits returns the names of all limits that may be modified.
//
// The result is a copy: callers may sort or edit it without changing which
// limits the API accepts.
func GetAllowedLimits() []string {
	return append([]string(nil), AllowedLimits...)
}

// IsLimitAllowed reports whether limitName may be modified via the API.
func IsLimitAllowed(limitName string) bool {
	for _, allowed := range AllowedLimits {
		if allowed == limitName {
			return true
		}
	}
	return false
}
// convertToFloat64 converts a decoded YAML/JSON scalar to float64 so that two
// limits can be compared numerically.
//
// All built-in integer and floating-point kinds are accepted, as are strings
// that strconv.ParseFloat can parse. Anything else (nil, bool, maps, ...)
// yields an error. Callers in this package treat an error as "cannot compare"
// and skip the check, so every numeric kind a decoder may produce is handled
// here rather than silently bypassing the hard-limit comparison.
func convertToFloat64(v interface{}) (float64, error) {
	switch val := v.(type) {
	case float64:
		return val, nil
	case float32:
		return float64(val), nil
	case int:
		return float64(val), nil
	case int8:
		return float64(val), nil
	case int16:
		return float64(val), nil
	case int32:
		return float64(val), nil
	case int64:
		return float64(val), nil
	case uint:
		return float64(val), nil
	case uint8:
		return float64(val), nil
	case uint16:
		return float64(val), nil
	case uint32:
		return float64(val), nil
	case uint64:
		return float64(val), nil
	case string:
		return strconv.ParseFloat(val, 64)
	default:
		return 0, fmt.Errorf("unsupported type: %T", v)
	}
}
via the overrides API") + + c.RegisterFlagsWithPrefix("overrides.", f) +} + +// Validate validates the configuration and returns an error if validation fails +func (c *Config) Validate() error { + if !c.Enabled { + return nil + } + + if c.RuntimeConfigFile == "" { + c.RuntimeConfigFile = "runtime.yaml" + } + + if c.Backend == "" { + c.Backend = bucket.S3 + } + + if c.Backend == bucket.Filesystem { + return errors.New(ErrFilesystemBackendNotSupported) + } + + if c.Backend == bucket.S3 { + if c.S3.SignatureVersion == "" { + c.S3.SignatureVersion = "v4" + } + if c.S3.BucketLookupType == "" { + c.S3.BucketLookupType = "auto" + } + if !c.S3.SendContentMd5 { + c.S3.SendContentMd5 = true + } + if c.S3.HTTP.IdleConnTimeout == 0 { + c.S3.HTTP.IdleConnTimeout = 90 * time.Second + } + if c.S3.HTTP.ResponseHeaderTimeout == 0 { + c.S3.HTTP.ResponseHeaderTimeout = 2 * time.Minute + } + if c.S3.HTTP.TLSHandshakeTimeout == 0 { + c.S3.HTTP.TLSHandshakeTimeout = 10 * time.Second + } + if c.S3.HTTP.ExpectContinueTimeout == 0 { + c.S3.HTTP.ExpectContinueTimeout = 1 * time.Second + } + if c.S3.HTTP.MaxIdleConns == 0 { + c.S3.HTTP.MaxIdleConns = 100 + } + if c.S3.HTTP.MaxIdleConnsPerHost == 0 { + c.S3.HTTP.MaxIdleConnsPerHost = 100 + } + } + + if c.Backend == bucket.Azure { + if c.Azure.MaxRetries == 0 { + c.Azure.MaxRetries = 20 + } + if c.Azure.IdleConnTimeout == 0 { + c.Azure.IdleConnTimeout = 90 * time.Second + } + if c.Azure.ResponseHeaderTimeout == 0 { + c.Azure.ResponseHeaderTimeout = 2 * time.Minute + } + if c.Azure.TLSHandshakeTimeout == 0 { + c.Azure.TLSHandshakeTimeout = 10 * time.Second + } + if c.Azure.ExpectContinueTimeout == 0 { + c.Azure.ExpectContinueTimeout = 1 * time.Second + } + if c.Azure.MaxIdleConns == 0 { + c.Azure.MaxIdleConns = 100 + } + if c.Azure.MaxIdleConnsPerHost == 0 { + c.Azure.MaxIdleConnsPerHost = 100 + } + } + + if c.Backend == bucket.Swift { + if c.Swift.AuthVersion == 0 { + c.Swift.AuthVersion = 0 + } + if c.Swift.MaxRetries == 0 { + 
c.Swift.MaxRetries = 3 + } + if c.Swift.ConnectTimeout == 0 { + c.Swift.ConnectTimeout = 10 * time.Second + } + if c.Swift.RequestTimeout == 0 { + c.Swift.RequestTimeout = 5 * time.Second + } + } + + if err := c.Config.Validate(); err != nil { + return fmt.Errorf("%s: %w", ErrInvalidBucketConfiguration, err) + } + + return nil +} + +// API represents the overrides API module +type API struct { + services.Service + cfg Config + logger log.Logger + registerer prometheus.Registerer + bucketClient objstore.Bucket + runtimeConfigPath string +} + +// New creates a new overrides API instance +func New(cfg Config, logger log.Logger, registerer prometheus.Registerer) (*API, error) { + if err := cfg.Validate(); err != nil { + return nil, fmt.Errorf("%s: %w", ErrInvalidOverridesConfiguration, err) + } + + api := &API{ + cfg: cfg, + logger: logger, + registerer: registerer, + } + api.Service = services.NewBasicService(api.starting, api.running, api.stopping) + return api, nil +} + +func (a *API) starting(ctx context.Context) error { + level.Info(a.logger).Log("msg", "overrides API starting", "runtime_config_file", a.cfg.RuntimeConfigFile, "backend", a.cfg.Backend) + + bucketClient, err := bucket.NewClient(ctx, a.cfg.Config, nil, "overrides", a.logger, a.registerer) + if err != nil { + level.Error(a.logger).Log("msg", ErrFailedToCreateBucketClient, "err", err) + return fmt.Errorf("%s: %w", ErrFailedToCreateBucketClient, err) + } + a.bucketClient = bucketClient + + a.runtimeConfigPath = a.cfg.RuntimeConfigFile + + level.Info(a.logger).Log("msg", "overrides API started successfully", "backend", a.cfg.Backend) + return nil +} + +func (a *API) running(ctx context.Context) error { + level.Info(a.logger).Log("msg", "overrides API is now running and ready to handle requests") + + <-ctx.Done() + + level.Info(a.logger).Log("msg", "overrides API received shutdown signal") + return nil +} + +func (a *API) stopping(err error) error { + if err != nil { + level.Error(a.logger).Log("msg", 
"overrides API stopping due to error", "err", err) + } else { + level.Info(a.logger).Log("msg", "overrides API stopping gracefully") + } + + level.Info(a.logger).Log("msg", "overrides API stopped") + return nil +} diff --git a/pkg/overrides/overrides_test.go b/pkg/overrides/overrides_test.go new file mode 100644 index 00000000000..f7942d5a715 --- /dev/null +++ b/pkg/overrides/overrides_test.go @@ -0,0 +1,556 @@ +package overrides + +import ( + "bytes" + "encoding/json" + "flag" + "fmt" + "net/http" + "net/http/httptest" + "testing" + + "github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/cortexproject/cortex/pkg/storage/bucket" + "github.com/cortexproject/cortex/pkg/storage/bucket/s3" + "github.com/cortexproject/cortex/pkg/util/flagext" + "github.com/cortexproject/cortex/pkg/util/log" + "github.com/cortexproject/cortex/pkg/util/services" +) + +func TestConfig_Validate(t *testing.T) { + t.Parallel() + tests := map[string]struct { + initConfig func(*Config) + expected error + }{ + "default config should pass": { + initConfig: func(_ *Config) {}, + expected: nil, + }, + "disabled config should pass": { + initConfig: func(cfg *Config) { + cfg.Enabled = false + }, + expected: nil, + }, + "enabled config should pass": { + initConfig: func(cfg *Config) { + cfg.Enabled = true + cfg.Config = bucket.Config{ + Backend: bucket.S3, + S3: s3.Config{ + AccessKeyID: "test-access-key", + SecretAccessKey: flagext.Secret{Value: "test-secret-key"}, + BucketName: "test-bucket", + Endpoint: "localhost:9000", + Insecure: true, + }, + } + }, + expected: nil, + }, + } + + for testName, testData := range tests { + t.Run(testName, func(t *testing.T) { + t.Parallel() + cfg := Config{} + + testData.initConfig(&cfg) + + if testData.expected == nil { + assert.NoError(t, cfg.Validate()) + } else { + assert.ErrorIs(t, cfg.Validate(), testData.expected) + } + }) + } +} + +func TestConfig_RegisterFlags(t 
*testing.T) { + cfg := Config{} + + // Test that flags are registered without panicking + require.NotPanics(t, func() { + flagSet := flag.NewFlagSet("test", flag.PanicOnError) + cfg.RegisterFlags(flagSet) + }) +} + +func TestNew(t *testing.T) { + tests := map[string]struct { + cfg Config + expectError bool + }{ + "valid config should create API": { + cfg: Config{ + Enabled: true, + Config: bucket.Config{ + Backend: bucket.S3, + S3: s3.Config{ + AccessKeyID: "test-access-key", + SecretAccessKey: flagext.Secret{Value: "test-secret-key"}, + BucketName: "test-bucket", + Endpoint: "localhost:9000", + Insecure: true, + }, + }, + }, + expectError: false, + }, + } + + for testName, testData := range tests { + t.Run(testName, func(t *testing.T) { + api, err := New(testData.cfg, log.Logger, prometheus.DefaultRegisterer) + + if testData.expectError { + assert.Error(t, err) + assert.Nil(t, api) + } else { + assert.NoError(t, err) + assert.NotNil(t, api) + // Don't compare the entire config since defaults may modify it + // Just verify the API was created successfully + } + }) + } +} + +func TestOverridesModuleServiceInterface(t *testing.T) { + // Create the API instance with proper configuration + cfg := Config{ + Enabled: true, + Config: bucket.Config{ + Backend: bucket.S3, + S3: s3.Config{ + AccessKeyID: "test-access-key", + SecretAccessKey: flagext.Secret{Value: "test-secret-key"}, + BucketName: "test-bucket", + Endpoint: "localhost:9000", + Insecure: true, + }, + }, + } + api, err := New(cfg, log.Logger, prometheus.DefaultRegisterer) + require.NoError(t, err) + require.NotNil(t, api) + + // Verify it implements the Service interface + require.Implements(t, (*services.Service)(nil), api) + + // Verify initial state + assert.Equal(t, services.New, api.State()) + + // Verify the service has the expected methods + // This is a basic check that the service was properly constructed + assert.NotNil(t, api.Service) +} + +// TestAPIEndpoints tests the actual HTTP API endpoints 
+func TestAPIEndpoints(t *testing.T) { + tests := []struct { + name string + method string + path string + tenantID string + requestBody interface{} + expectedStatus int + setupMock func(*bucket.ClientMock) + validateResponse func(*testing.T, *httptest.ResponseRecorder) + }{ + { + name: "GET overrides - no tenant ID", + method: "GET", + path: "/api/v1/user-overrides", + tenantID: "", + expectedStatus: http.StatusUnauthorized, + }, + { + name: "GET overrides - valid tenant ID, no overrides", + method: "GET", + path: "/api/v1/user-overrides", + tenantID: "user123", + expectedStatus: http.StatusOK, + setupMock: func(mock *bucket.ClientMock) { + // Mock that no overrides exist by passing empty content + mock.MockGet("runtime.yaml", "overrides:\n", nil) + }, + validateResponse: func(t *testing.T, recorder *httptest.ResponseRecorder) { + var response map[string]interface{} + err := json.Unmarshal(recorder.Body.Bytes(), &response) + require.NoError(t, err) + assert.Empty(t, response) + }, + }, + { + name: "GET overrides - valid tenant ID, with overrides", + method: "GET", + path: "/api/v1/user-overrides", + tenantID: "user456", + expectedStatus: http.StatusOK, + setupMock: func(mock *bucket.ClientMock) { + overridesData := `overrides: + user456: + ingestion_rate: 5000 + max_global_series_per_user: 100000` + mock.MockGet("runtime.yaml", overridesData, nil) + }, + validateResponse: func(t *testing.T, recorder *httptest.ResponseRecorder) { + var response map[string]interface{} + err := json.Unmarshal(recorder.Body.Bytes(), &response) + require.NoError(t, err) + assert.Equal(t, float64(5000), response["ingestion_rate"]) + assert.Equal(t, float64(100000), response["max_global_series_per_user"]) + }, + }, + { + name: "PUT overrides - no tenant ID", + method: "PUT", + path: "/api/v1/user-overrides", + tenantID: "", + requestBody: map[string]interface{}{"ingestion_rate": 5000}, + expectedStatus: http.StatusUnauthorized, + }, + { + name: "PUT overrides - valid tenant ID, valid 
overrides", + method: "PUT", + path: "/api/v1/user-overrides", + tenantID: "user789", + requestBody: map[string]interface{}{"ingestion_rate": 5000, "ruler_max_rules_per_rule_group": 10}, + expectedStatus: http.StatusOK, + setupMock: func(mock *bucket.ClientMock) { + // First read succeeds, then upload succeeds + mock.MockGet("runtime.yaml", "overrides:\n user789:\n ingestion_rate: 5000\n ruler_max_rules_per_rule_group: 10\n", nil) + mock.MockUpload("runtime.yaml", nil) + }, + }, + { + name: "PUT overrides - invalid limit name", + method: "PUT", + path: "/api/v1/user-overrides", + tenantID: "user999", + requestBody: map[string]interface{}{"invalid_limit": 5000}, + expectedStatus: http.StatusBadRequest, + }, + + { + name: "PUT overrides - invalid JSON", + method: "PUT", + path: "/api/v1/user-overrides", + tenantID: "user999", + requestBody: "invalid json", + expectedStatus: http.StatusBadRequest, + }, + { + name: "PUT overrides - exceeding hard limit from runtime config", + method: "PUT", + path: "/api/v1/user-overrides", + tenantID: "user999", + requestBody: map[string]interface{}{"ingestion_rate": 1500000}, // Exceeds hard limit of 1000000 + expectedStatus: http.StatusBadRequest, + setupMock: func(mock *bucket.ClientMock) { + // Mock runtime config with per-user hard limits + runtimeConfig := `overrides: + user999: + ingestion_rate: 1000 +hard_overrides: + user999: + ingestion_rate: 1000000 + max_global_series_per_user: 5000000` + // Mock both reads: one for validateHardLimits, one for setOverridesToBucket + mock.MockGet("runtime.yaml", runtimeConfig, nil) + mock.MockGet("runtime.yaml", runtimeConfig, nil) + mock.MockUpload("runtime.yaml", nil) + }, + }, + { + name: "DELETE overrides - no tenant ID", + method: "DELETE", + path: "/api/v1/user-overrides", + tenantID: "", + expectedStatus: http.StatusUnauthorized, + }, + { + name: "DELETE overrides - valid tenant ID", + method: "DELETE", + path: "/api/v1/user-overrides", + tenantID: "user123", + expectedStatus: 
http.StatusOK, + setupMock: func(mock *bucket.ClientMock) { + // First read succeeds, then upload succeeds + mock.MockGet("runtime.yaml", "overrides:\n user123:\n ingestion_rate: 1000", nil) + mock.MockUpload("runtime.yaml", nil) + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Create a mock bucket client + mockBucket := &bucket.ClientMock{} + if tt.setupMock != nil { + tt.setupMock(mockBucket) + } + + // Create the API instance with proper configuration + cfg := Config{ + Enabled: true, + + Config: bucket.Config{ + Backend: bucket.S3, + S3: s3.Config{ + AccessKeyID: "test-access-key", + SecretAccessKey: flagext.Secret{Value: "test-secret-key"}, + BucketName: "test-bucket", + Endpoint: "localhost:9000", + Insecure: true, + }, + }, + } + api, err := New(cfg, log.Logger, prometheus.DefaultRegisterer) + require.NoError(t, err) + require.NotNil(t, api) + + // Manually set the bucket client and runtime config path for testing + api.bucketClient = mockBucket + api.runtimeConfigPath = "runtime.yaml" + + // Create the request + var req *http.Request + if tt.requestBody != nil { + var body []byte + if str, ok := tt.requestBody.(string); ok { + body = []byte(str) + } else { + body, err = json.Marshal(tt.requestBody) + require.NoError(t, err) + } + req = httptest.NewRequest(tt.method, tt.path, bytes.NewReader(body)) + } else { + req = httptest.NewRequest(tt.method, tt.path, nil) + } + + // Add tenant ID header if provided + if tt.tenantID != "" { + req.Header.Set("X-Scope-OrgID", tt.tenantID) + } + + // Create response recorder + recorder := httptest.NewRecorder() + + // Call the appropriate handler based on method + switch tt.method { + case "GET": + api.GetOverrides(recorder, req) + case "PUT": + api.SetOverrides(recorder, req) + case "DELETE": + api.DeleteOverrides(recorder, req) + default: + t.Fatalf("Unsupported method: %s", tt.method) + } + + // Assert status code + assert.Equal(t, tt.expectedStatus, recorder.Code) + + // Validate 
response if validation function provided + if tt.validateResponse != nil { + tt.validateResponse(t, recorder) + } + }) + } +} + +// TestAPITenantExtraction tests tenant ID extraction from various header formats +func TestAPITenantExtraction(t *testing.T) { + tests := []struct { + name string + headers map[string]string + expectedTenant string + expectError bool + setupMock func(*bucket.ClientMock) + }{ + { + name: "X-Scope-OrgID header", + headers: map[string]string{"X-Scope-OrgID": "tenant1"}, + expectedTenant: "tenant1", + expectError: false, + setupMock: func(mock *bucket.ClientMock) { + // Mock successful get with empty overrides + mock.MockGet("runtime.yaml", "overrides:\n", nil) + }, + }, + { + name: "no tenant header", + headers: map[string]string{}, + expectedTenant: "", + expectError: true, + }, + { + name: "empty tenant header", + headers: map[string]string{"X-Scope-OrgID": ""}, + expectedTenant: "", + expectError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Create a mock bucket client + mockBucket := &bucket.ClientMock{} + if tt.setupMock != nil { + tt.setupMock(mockBucket) + } + + // Create the API instance with proper configuration + cfg := Config{ + Enabled: true, + + Config: bucket.Config{ + Backend: bucket.S3, + S3: s3.Config{ + AccessKeyID: "test-access-key", + SecretAccessKey: flagext.Secret{Value: "test-secret-key"}, + BucketName: "test-bucket", + Endpoint: "localhost:9000", + Insecure: true, + }, + }, + } + api, err := New(cfg, log.Logger, prometheus.DefaultRegisterer) + require.NoError(t, err) + require.NotNil(t, api) + + // Manually set the bucket client and runtime config path for testing + api.bucketClient = mockBucket + api.runtimeConfigPath = "runtime.yaml" + + // Create the request + req := httptest.NewRequest("GET", "/api/v1/user-overrides", nil) + for key, value := range tt.headers { + req.Header.Set(key, value) + } + + // Create response recorder + recorder := httptest.NewRecorder() + + // 
Call the handler + api.GetOverrides(recorder, req) + + // Assert based on expected behavior + if tt.expectError { + assert.Equal(t, http.StatusUnauthorized, recorder.Code) + } else { + assert.Equal(t, http.StatusOK, recorder.Code) + } + }) + } +} + +// TestAPIBucketErrors tests how the API handles bucket operation errors +func TestAPIBucketErrors(t *testing.T) { + tests := []struct { + name string + method string + tenantID string + setupMock func(*bucket.ClientMock) + expectedStatus int + }{ + { + name: "GET overrides - bucket error treated as not found", + method: "GET", + tenantID: "user123", + setupMock: func(mock *bucket.ClientMock) { + mock.MockGet("runtime.yaml", "", fmt.Errorf("bucket error")) + }, + expectedStatus: http.StatusOK, // Current implementation treats errors as "not found" + }, + { + name: "PUT overrides - bucket upload error", + method: "PUT", + tenantID: "user456", + setupMock: func(mock *bucket.ClientMock) { + // First read succeeds, then upload fails + mock.MockGet("runtime.yaml", "overrides:\n user456:\n ingestion_rate: 1000", nil) + mock.MockUpload("runtime.yaml", fmt.Errorf("upload error")) + }, + expectedStatus: http.StatusInternalServerError, + }, + { + name: "DELETE overrides - bucket delete error", + method: "DELETE", + tenantID: "user789", + setupMock: func(mock *bucket.ClientMock) { + // First read succeeds, then upload fails + mock.MockGet("runtime.yaml", "overrides:\n user789:\n ingestion_rate: 1000", nil) + mock.MockUpload("runtime.yaml", fmt.Errorf("upload error")) + }, + expectedStatus: http.StatusInternalServerError, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Create a mock bucket client + mockBucket := &bucket.ClientMock{} + tt.setupMock(mockBucket) + + // Create the API instance with proper configuration + cfg := Config{ + Enabled: true, + + Config: bucket.Config{ + Backend: bucket.S3, + S3: s3.Config{ + AccessKeyID: "test-access-key", + SecretAccessKey: flagext.Secret{Value: 
"test-secret-key"}, + BucketName: "test-bucket", + Endpoint: "localhost:9000", + Insecure: true, + }, + }, + } + api, err := New(cfg, log.Logger, prometheus.DefaultRegisterer) + require.NoError(t, err) + require.NotNil(t, api) + + // Manually set the bucket client and runtime config path for testing + api.bucketClient = mockBucket + api.runtimeConfigPath = "runtime.yaml" + + // Create the request + var req *http.Request + if tt.method == "PUT" { + requestBody := map[string]interface{}{"ingestion_rate": 5000} + body, err := json.Marshal(requestBody) + require.NoError(t, err) + req = httptest.NewRequest(tt.method, "/api/v1/user-overrides", bytes.NewReader(body)) + } else { + req = httptest.NewRequest(tt.method, "/api/v1/user-overrides", nil) + } + + // Add tenant ID header + req.Header.Set("X-Scope-OrgID", tt.tenantID) + + // Create response recorder + recorder := httptest.NewRecorder() + + // Call the appropriate handler + switch tt.method { + case "GET": + api.GetOverrides(recorder, req) + case "PUT": + api.SetOverrides(recorder, req) + case "DELETE": + api.DeleteOverrides(recorder, req) + } + + // Assert status code + assert.Equal(t, tt.expectedStatus, recorder.Code) + }) + } +} From f97b3192119750fef5fcf1be96db358d804f4b07 Mon Sep 17 00:00:00 2001 From: Bogdan Stancu Date: Fri, 15 Aug 2025 22:59:32 +0300 Subject: [PATCH 03/11] fix docs and integration tests Signed-off-by: Bogdan Stancu --- docs/configuration/config-file-reference.md | 546 +++++++++++--------- pkg/cortex/modules.go | 4 + 2 files changed, 299 insertions(+), 251 deletions(-) diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index e54fdef0b0f..581e43fe887 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -411,6 +411,300 @@ query_scheduler: # CLI flag: -query-scheduler.grpc-client-config.connect-timeout [connect_timeout: | default = 5s] +overrides: + # Enable the overrides API module + # 
CLI flag: -overrides.enabled + [enabled: | default = false] + + # Path to the runtime configuration file that can be updated via the overrides + # API + # CLI flag: -overrides.runtime-config-file + [runtime_config_file: | default = "runtime.yaml"] + + # Backend storage to use. Supported backends are: s3, gcs, azure, swift, + # filesystem. + # CLI flag: -overrides.backend + [backend: | default = "s3"] + + s3: + # The S3 bucket endpoint. It could be an AWS S3 endpoint listed at + # https://docs.aws.amazon.com/general/latest/gr/s3.html or the address of an + # S3-compatible service in hostname:port format. + # CLI flag: -overrides.s3.endpoint + [endpoint: | default = ""] + + # S3 region. If unset, the client will issue a S3 GetBucketLocation API call + # to autodetect it. + # CLI flag: -overrides.s3.region + [region: | default = ""] + + # S3 bucket name + # CLI flag: -overrides.s3.bucket-name + [bucket_name: | default = ""] + + # If enabled, S3 endpoint will use the non-dualstack variant. + # CLI flag: -overrides.s3.disable-dualstack + [disable_dualstack: | default = false] + + # S3 secret access key + # CLI flag: -overrides.s3.secret-access-key + [secret_access_key: | default = ""] + + # S3 access key ID + # CLI flag: -overrides.s3.access-key-id + [access_key_id: | default = ""] + + # If enabled, use http:// for the S3 endpoint instead of https://. This + # could be useful in local dev/test environments while using an + # S3-compatible backend storage, like Minio. + # CLI flag: -overrides.s3.insecure + [insecure: | default = false] + + # The signature version to use for authenticating against S3. Supported + # values are: v4, v2. + # CLI flag: -overrides.s3.signature-version + [signature_version: | default = "v4"] + + # The s3 bucket lookup style. Supported values are: auto, virtual-hosted, + # path. 
+ # CLI flag: -overrides.s3.bucket-lookup-type + [bucket_lookup_type: | default = "auto"] + + # If true, attach MD5 checksum when upload objects and S3 uses MD5 checksum + # algorithm to verify the provided digest. If false, use CRC32C algorithm + # instead. + # CLI flag: -overrides.s3.send-content-md5 + [send_content_md5: | default = true] + + # The list api version. Supported values are: v1, v2, and ''. + # CLI flag: -overrides.s3.list-objects-version + [list_objects_version: | default = ""] + + # The s3_sse_config configures the S3 server-side encryption. + # The CLI flags prefix for this block config is: overrides + [sse: ] + + http: + # The time an idle connection will remain idle before closing. + # CLI flag: -overrides.s3.http.idle-conn-timeout + [idle_conn_timeout: | default = 1m30s] + + # The amount of time the client will wait for a servers response headers. + # CLI flag: -overrides.s3.http.response-header-timeout + [response_header_timeout: | default = 2m] + + # If the client connects via HTTPS and this option is enabled, the client + # will accept any certificate and hostname. + # CLI flag: -overrides.s3.http.insecure-skip-verify + [insecure_skip_verify: | default = false] + + # Maximum time to wait for a TLS handshake. 0 means no limit. + # CLI flag: -overrides.s3.tls-handshake-timeout + [tls_handshake_timeout: | default = 10s] + + # The time to wait for a server's first response headers after fully + # writing the request headers if the request has an Expect header. 0 to + # send the request body immediately. + # CLI flag: -overrides.s3.expect-continue-timeout + [expect_continue_timeout: | default = 1s] + + # Maximum number of idle (keep-alive) connections across all hosts. 0 + # means no limit. + # CLI flag: -overrides.s3.max-idle-connections + [max_idle_connections: | default = 100] + + # Maximum number of idle (keep-alive) connections to keep per-host. If 0, + # a built-in default value is used. 
+ # CLI flag: -overrides.s3.max-idle-connections-per-host + [max_idle_connections_per_host: | default = 100] + + # Maximum number of connections per host. 0 means no limit. + # CLI flag: -overrides.s3.max-connections-per-host + [max_connections_per_host: | default = 0] + + gcs: + # GCS bucket name + # CLI flag: -overrides.gcs.bucket-name + [bucket_name: | default = ""] + + # JSON representing either a Google Developers Console + # client_credentials.json file or a Google Developers service account key + # file. If empty, fallback to Google default logic. + # CLI flag: -overrides.gcs.service-account + [service_account: | default = ""] + + azure: + # Azure storage account name + # CLI flag: -overrides.azure.account-name + [account_name: | default = ""] + + # Azure storage account key + # CLI flag: -overrides.azure.account-key + [account_key: | default = ""] + + # The values of `account-name` and `endpoint-suffix` values will not be + # ignored if `connection-string` is set. Use this method over `account-key` + # if you need to authenticate via a SAS token or if you use the Azurite + # emulator. + # CLI flag: -overrides.azure.connection-string + [connection_string: | default = ""] + + # Azure storage container name + # CLI flag: -overrides.azure.container-name + [container_name: | default = ""] + + # Azure storage endpoint suffix without schema. The account name will be + # prefixed to this value to create the FQDN + # CLI flag: -overrides.azure.endpoint-suffix + [endpoint_suffix: | default = ""] + + # Number of retries for recoverable errors + # CLI flag: -overrides.azure.max-retries + [max_retries: | default = 20] + + # Deprecated: Azure storage MSI resource. It will be set automatically by + # Azure SDK. + # CLI flag: -overrides.azure.msi-resource + [msi_resource: | default = ""] + + # Azure storage MSI resource managed identity client Id. If not supplied + # default Azure credential will be used. 
Set it to empty if you need to + # authenticate via Azure Workload Identity. + # CLI flag: -overrides.azure.user-assigned-id + [user_assigned_id: | default = ""] + + http: + # The time an idle connection will remain idle before closing. + # CLI flag: -overrides.azure.http.idle-conn-timeout + [idle_conn_timeout: | default = 1m30s] + + # The amount of time the client will wait for a servers response headers. + # CLI flag: -overrides.azure.http.response-header-timeout + [response_header_timeout: | default = 2m] + + # If the client connects via HTTPS and this option is enabled, the client + # will accept any certificate and hostname. + # CLI flag: -overrides.azure.http.insecure-skip-verify + [insecure_skip_verify: | default = false] + + # Maximum time to wait for a TLS handshake. 0 means no limit. + # CLI flag: -overrides.azure.tls-handshake-timeout + [tls_handshake_timeout: | default = 10s] + + # The time to wait for a server's first response headers after fully + # writing the request headers if the request has an Expect header. 0 to + # send the request body immediately. + # CLI flag: -overrides.azure.expect-continue-timeout + [expect_continue_timeout: | default = 1s] + + # Maximum number of idle (keep-alive) connections across all hosts. 0 + # means no limit. + # CLI flag: -overrides.azure.max-idle-connections + [max_idle_connections: | default = 100] + + # Maximum number of idle (keep-alive) connections to keep per-host. If 0, + # a built-in default value is used. + # CLI flag: -overrides.azure.max-idle-connections-per-host + [max_idle_connections_per_host: | default = 100] + + # Maximum number of connections per host. 0 means no limit. + # CLI flag: -overrides.azure.max-connections-per-host + [max_connections_per_host: | default = 0] + + swift: + # OpenStack Swift authentication API version. 0 to autodetect. 
+ # CLI flag: -overrides.swift.auth-version + [auth_version: | default = 0] + + # OpenStack Swift authentication URL + # CLI flag: -overrides.swift.auth-url + [auth_url: | default = ""] + + # OpenStack Swift application credential ID. + # CLI flag: -overrides.swift.application-credential-id + [application_credential_id: | default = ""] + + # OpenStack Swift application credential name. + # CLI flag: -overrides.swift.application-credential-name + [application_credential_name: | default = ""] + + # OpenStack Swift application credential secret. + # CLI flag: -overrides.swift.application-credential-secret + [application_credential_secret: | default = ""] + + # OpenStack Swift username. + # CLI flag: -overrides.swift.username + [username: | default = ""] + + # OpenStack Swift user's domain name. + # CLI flag: -overrides.swift.user-domain-name + [user_domain_name: | default = ""] + + # OpenStack Swift user's domain ID. + # CLI flag: -overrides.swift.user-domain-id + [user_domain_id: | default = ""] + + # OpenStack Swift user ID. + # CLI flag: -overrides.swift.user-id + [user_id: | default = ""] + + # OpenStack Swift API key. + # CLI flag: -overrides.swift.password + [password: | default = ""] + + # OpenStack Swift user's domain ID. + # CLI flag: -overrides.swift.domain-id + [domain_id: | default = ""] + + # OpenStack Swift user's domain name. + # CLI flag: -overrides.swift.domain-name + [domain_name: | default = ""] + + # OpenStack Swift project ID (v2,v3 auth only). + # CLI flag: -overrides.swift.project-id + [project_id: | default = ""] + + # OpenStack Swift project name (v2,v3 auth only). + # CLI flag: -overrides.swift.project-name + [project_name: | default = ""] + + # ID of the OpenStack Swift project's domain (v3 auth only), only needed if + # it differs the from user domain. 
+ # CLI flag: -overrides.swift.project-domain-id + [project_domain_id: | default = ""] + + # Name of the OpenStack Swift project's domain (v3 auth only), only needed + # if it differs from the user domain. + # CLI flag: -overrides.swift.project-domain-name + [project_domain_name: | default = ""] + + # OpenStack Swift Region to use (v2,v3 auth only). + # CLI flag: -overrides.swift.region-name + [region_name: | default = ""] + + # Name of the OpenStack Swift container to put chunks in. + # CLI flag: -overrides.swift.container-name + [container_name: | default = ""] + + # Max retries on requests error. + # CLI flag: -overrides.swift.max-retries + [max_retries: | default = 3] + + # Time after which a connection attempt is aborted. + # CLI flag: -overrides.swift.connect-timeout + [connect_timeout: | default = 10s] + + # Time after which an idle request is aborted. The timeout watchdog is reset + # each time some data is received, so the timeout triggers after X time no + # data is received on a request. + # CLI flag: -overrides.swift.request-timeout + [request_timeout: | default = 5s] + + filesystem: + # Local filesystem storage directory. + # CLI flag: -overrides.filesystem.dir + [dir: | default = ""] + # The tracing_config configures backends cortex uses. [tracing: ] ``` @@ -5475,257 +5769,6 @@ thanos_engine: [optimizers: | default = "default"] ``` -### `overrides` - -The `overrides` configures the Cortex overrides API for managing user overrides. - -```yaml -# Enable the overrides module. -# CLI flag: -overrides.enabled -[enabled: | default = false] - -# Path to the runtime configuration file. -# CLI flag: -overrides.runtime-config-file -[runtime_config_file: | default = "runtime.yaml"] - -# Backend storage to use. Supported backends are: s3, gcs, azure, swift. -# CLI flag: -overrides.backend -[backend: | default = "s3"] - -s3: - # The S3 bucket endpoint. 
It could be an AWS S3 endpoint listed at - # https://docs.aws.amazon.com/general/latest/gr/s3.html or the address of an - # S3-compatible service in hostname:port format. - # CLI flag: -overrides.s3.endpoint - [endpoint: | default = ""] - - # S3 region. If unset, the client will issue a S3 GetBucketLocation API call - # to autodetect it. - # CLI flag: -overrides.s3.region - [region: | default = ""] - - # S3 bucket name - # CLI flag: -overrides.s3.bucket-name - [bucket_name: | default = ""] - - # S3 secret access key - # CLI flag: -overrides.s3.secret-access-key - [secret_access_key: | default = ""] - - # S3 access key ID - # CLI flag: -overrides.s3.access-key-id - [access_key_id: | default = ""] - - # If enabled, use http:// for the S3 endpoint instead of https://. This could - # be useful in local dev/test environments while using an S3-compatible - # backend storage, like Minio. - # CLI flag: -overrides.s3.insecure - [insecure: | default = false] - - # The signature version to use for authenticating against S3. Supported values - # are: v4, v2. - # CLI flag: -overrides.s3.signature-version - [signature_version: | default = "v4"] - - # The s3 bucket lookup style. Supported values are: auto, virtual-hosted, - # path. - # CLI flag: -overrides.s3.bucket-lookup-type - [bucket_lookup_type: | default = "auto"] - - # If true, attach MD5 checksum when upload objects and S3 uses MD5 checksum - # algorithm to verify the provided digest. If false, use CRC32C algorithm - # instead. - # CLI flag: -overrides.s3.send-content-md5 - [send_content_md5: | default = true] - - http: - # The time an idle connection will remain idle before closing. - # CLI flag: -overrides.s3.http.idle-conn-timeout - [idle_conn_timeout: | default = 1m30s] - - # The amount of time the client will wait for a servers response headers. - # CLI flag: -overrides.s3.http.response-header-timeout - [response_header_timeout: | default = 2m] - - # Maximum time to wait for a TLS handshake. 0 means no limit. 
- # CLI flag: -overrides.s3.tls-handshake-timeout - [tls_handshake_timeout: | default = 10s] - - # The time to wait for a server's first response headers after fully writing - # the request headers if the request has an Expect header. 0 to send the - # request body immediately. - # CLI flag: -overrides.s3.expect-continue-timeout - [expect_continue_timeout: | default = 1s] - - # Maximum number of idle connections across all hosts. 0 means no limit. - # CLI flag: -overrides.s3.max-idle-conns - [max_idle_conns: | default = 100] - - # Maximum number of idle connections per host. 0 means no limit. - # CLI flag: -overrides.s3.max-idle-conns-per-host - [max_idle_conns_per_host: | default = 100] - -gcs: - # GCS bucket name - # CLI flag: -overrides.gcs.bucket-name - [bucket_name: | default = ""] - - # JSON either from a file or inline. - # CLI flag: -overrides.gcs.service-account - [service_account: | default = ""] - -azure: - # Azure storage account name - # CLI flag: -overrides.azure.account-name - [account_name: | default = ""] - - # Azure storage account key - # CLI flag: -overrides.azure.account-key - [account_key: | default = ""] - - # Azure storage container name - # CLI flag: -overrides.azure.container-name - [container_name: | default = ""] - - # Azure storage endpoint suffix without schema. The account name will be - # prefixed to this value to create the FQDN. If set to empty string, default - # endpoint suffix will be used. 
- # CLI flag: -overrides.azure.endpoint-suffix - [endpoint_suffix: | default = ""] - - # Azure storage max retry attempts - # CLI flag: -overrides.azure.max-retries - [max_retries: | default = 20] - - # Azure storage user domain - # CLI flag: -overrides.azure.user-domain - [user_domain: | default = ""] - - # Azure storage tenant ID - # CLI flag: -overrides.azure.tenant-id - [tenant_id: | default = ""] - - # Azure storage client ID - # CLI flag: -overrides.azure.client-id - [client_id: | default = ""] - - # Azure storage client secret - # CLI flag: -overrides.azure.client-secret - [client_secret: | default = ""] - - # Azure storage subscription ID - # CLI flag: -overrides.azure.subscription-id - [subscription_id: | default = ""] - - # Azure storage environment - # CLI flag: -overrides.azure.environment - [environment: | default = "AzurePublicCloud"] - - # Azure storage max retry attempts - # CLI flag: -overrides.azure.max-retries - [max_retries: | default = 20] - - # The time an idle connection will remain idle before closing. - # CLI flag: -overrides.azure.idle-conn-timeout - [idle_conn_timeout: | default = 1m30s] - - # The amount of time the client will wait for a servers response headers. - # CLI flag: -overrides.azure.response-header-timeout - [response_header_timeout: | default = 2m] - - # Maximum time to wait for a TLS handshake. 0 means no limit. - # CLI flag: -overrides.azure.tls-handshake-timeout - [tls_handshake_timeout: | default = 10s] - - # The time to wait for a server's first response headers after fully writing - # the request headers if the request has an Expect header. 0 to send the - # request body immediately. - # CLI flag: -overrides.azure.expect-continue-timeout - [expect_continue_timeout: | default = 1s] - - # Maximum number of idle connections across all hosts. 0 means no limit. - # CLI flag: -overrides.azure.max-idle-conns - [max_idle_conns: | default = 100] - - # Maximum number of idle connections per host. 0 means no limit. 
- # CLI flag: -overrides.azure.max-idle-conns-per-host - [max_idle_conns_per_host: | default = 100] - -swift: - # OpenStack Swift authentication API version. 0 to autodetect. - # CLI flag: -overrides.swift.auth-version - [auth_version: | default = 0] - - # OpenStack Swift authentication URL - # CLI flag: -overrides.swift.auth-url - [auth_url: | default = ""] - - # OpenStack Swift username - # CLI flag: -overrides.swift.username - [username: | default = ""] - - # OpenStack Swift user's domain name - # CLI flag: -overrides.swift.user-domain-name - [user_domain_name: | default = ""] - - # OpenStack Swift user's domain ID - # CLI flag: -overrides.swift.user-domain-id - [user_domain_id: | default = ""] - - # OpenStack Swift user ID - # CLI flag: -overrides.swift.user-id - [user_id: | default = ""] - - # OpenStack Swift user's password - # CLI flag: -overrides.swift.password - [password: | default = ""] - - # OpenStack Swift user's domain ID - # CLI flag: -overrides.swift.domain-id - [domain_id: | default = ""] - - # OpenStack Swift domain name - # CLI flag: -overrides.swift.domain-name - [domain_name: | default = ""] - - # OpenStack Swift project ID - # CLI flag: -overrides.swift.project-id - [project_id: | default = ""] - - # OpenStack Swift project name - # CLI flag: -overrides.swift.project-name - [project_name: | default = ""] - - # OpenStack Swift project domain ID - # CLI flag: -overrides.swift.project-domain-id - [project_domain_id: | default = ""] - - # OpenStack Swift project domain name - # CLI flag: -overrides.swift.project-domain-name - [project_domain_name: | default = ""] - - # OpenStack Swift region name - # CLI flag: -overrides.swift.region-name - [region_name: | default = ""] - - # OpenStack Swift container name - # CLI flag: -overrides.swift.container-name - [container_name: | default = ""] - - # OpenStack Swift max retry attempts - # CLI flag: -overrides.swift.max-retries - [max_retries: | default = 3] - - # OpenStack Swift connect timeout - # CLI 
flag: -overrides.swift.connect-timeout - [connect_timeout: | default = 10s] - - # OpenStack Swift request timeout - # CLI flag: -overrides.swift.request-timeout - [request_timeout: | default = 5s] -``` -``` - ### `ruler_storage_config` The `ruler_storage_config` configures the Cortex ruler storage backend. @@ -6328,6 +6371,7 @@ The `s3_sse_config` configures the S3 server-side encryption. The supported CLI - `alertmanager-storage` - `blocks-storage` +- `overrides` - `ruler-storage` - `runtime-config` diff --git a/pkg/cortex/modules.go b/pkg/cortex/modules.go index d86e61efa5f..f0b631df466 100644 --- a/pkg/cortex/modules.go +++ b/pkg/cortex/modules.go @@ -210,6 +210,10 @@ func (t *Cortex) initOverridesConfig() (services.Service, error) { } func (t *Cortex) initOverrides() (services.Service, error) { + if !t.Cfg.Overrides.Enabled { + return nil, nil + } + overridesAPI, err := overrides.New(t.Cfg.Overrides, util_log.Logger, prometheus.DefaultRegisterer) if err != nil { return nil, fmt.Errorf("failed to create overrides API: %w", err) From 608f1ab355c38199167fa10a08a20db14468a376 Mon Sep 17 00:00:00 2001 From: Bogdan Stancu Date: Mon, 18 Aug 2025 20:17:17 +0300 Subject: [PATCH 04/11] remove from all target Signed-off-by: Bogdan Stancu --- docs/configuration/config-file-reference.md | 4 ---- integration/overrides_test.go | 10 ++++------ pkg/cortex/modules.go | 5 +---- pkg/overrides/overrides.go | 7 +------ pkg/overrides/overrides_test.go | 11 +---------- 5 files changed, 7 insertions(+), 30 deletions(-) diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 581e43fe887..67ea1dccd9c 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -412,10 +412,6 @@ query_scheduler: [connect_timeout: | default = 5s] overrides: - # Enable the overrides API module - # CLI flag: -overrides.enabled - [enabled: | default = false] - # Path to the runtime configuration file that can 
be updated via the overrides # API # CLI flag: -overrides.runtime-config-file diff --git a/integration/overrides_test.go b/integration/overrides_test.go index 4c68d9b8f42..038bab9d680 100644 --- a/integration/overrides_test.go +++ b/integration/overrides_test.go @@ -56,8 +56,8 @@ func TestOverridesAPIWithRunningCortex(t *testing.T) { flags := mergeFlags( baseFlags, map[string]string{ - "-target": "overrides", - "-overrides.enabled": "true", + "-target": "overrides", + "-overrides.runtime-config-file": "runtime.yaml", "-overrides.backend": "s3", "-overrides.s3.access-key-id": e2edb.MinioAccessKey, @@ -224,8 +224,8 @@ func TestOverridesAPITenantExtraction(t *testing.T) { flags := mergeFlags( baseFlags, map[string]string{ - "-target": "overrides", - "-overrides.enabled": "true", + "-target": "overrides", + "-overrides.runtime-config-file": "runtime.yaml", "-overrides.backend": "s3", "-overrides.s3.access-key-id": e2edb.MinioAccessKey, @@ -278,7 +278,6 @@ func TestOverridesAPIFilesystemBackendRejected(t *testing.T) { baseFlags, map[string]string{ "-target": "overrides", - "-overrides.enabled": "true", "-overrides.runtime-config-file": "runtime.yaml", "-overrides.backend": "filesystem", "-ring.store": "consul", @@ -303,7 +302,6 @@ func TestOverridesAPIFilesystemBackendRejected(t *testing.T) { baseFlags, map[string]string{ "-target": "overrides", - "-overrides.enabled": "true", "-overrides.runtime-config-file": "runtime.yaml", "-ring.store": "consul", "-consul.hostname": "localhost:8500", diff --git a/pkg/cortex/modules.go b/pkg/cortex/modules.go index f0b631df466..110e9b03125 100644 --- a/pkg/cortex/modules.go +++ b/pkg/cortex/modules.go @@ -210,9 +210,6 @@ func (t *Cortex) initOverridesConfig() (services.Service, error) { } func (t *Cortex) initOverrides() (services.Service, error) { - if !t.Cfg.Overrides.Enabled { - return nil, nil - } overridesAPI, err := overrides.New(t.Cfg.Overrides, util_log.Logger, prometheus.DefaultRegisterer) if err != nil { @@ -932,7 +929,7 @@ 
func (t *Cortex) setupModuleManager() error { TenantDeletion: {API, OverridesConfig}, Purger: {TenantDeletion}, TenantFederation: {Queryable}, - All: {QueryFrontend, Querier, Ingester, Distributor, Purger, StoreGateway, Ruler, Compactor, AlertManager, Overrides}, + All: {QueryFrontend, Querier, Ingester, Distributor, Purger, StoreGateway, Ruler, Compactor, AlertManager}, } if t.Cfg.ExternalPusher != nil && t.Cfg.ExternalQueryable != nil { deps[Ruler] = []string{OverridesConfig, RulerStorage} diff --git a/pkg/overrides/overrides.go b/pkg/overrides/overrides.go index 824a404c830..ce0569b09b7 100644 --- a/pkg/overrides/overrides.go +++ b/pkg/overrides/overrides.go @@ -28,8 +28,6 @@ const ( // Config holds configuration for the overrides module type Config struct { // Enable the overrides API module - // CLI flag: -overrides.enabled - Enabled bool `yaml:"enabled"` // Path to the runtime configuration file that can be updated via the overrides API // CLI flag: -overrides.runtime-config-file @@ -42,7 +40,7 @@ type Config struct { // RegisterFlags registers the overrides module flags func (c *Config) RegisterFlags(f *flag.FlagSet) { - f.BoolVar(&c.Enabled, "overrides.enabled", false, "Enable the overrides API module") + f.StringVar(&c.RuntimeConfigFile, "overrides.runtime-config-file", "runtime.yaml", "Path to the runtime configuration file that can be updated via the overrides API") c.RegisterFlagsWithPrefix("overrides.", f) @@ -50,9 +48,6 @@ func (c *Config) RegisterFlags(f *flag.FlagSet) { // Validate validates the configuration and returns an error if validation fails func (c *Config) Validate() error { - if !c.Enabled { - return nil - } if c.RuntimeConfigFile == "" { c.RuntimeConfigFile = "runtime.yaml" diff --git a/pkg/overrides/overrides_test.go b/pkg/overrides/overrides_test.go index f7942d5a715..28f42a2efc0 100644 --- a/pkg/overrides/overrides_test.go +++ b/pkg/overrides/overrides_test.go @@ -32,13 +32,12 @@ func TestConfig_Validate(t *testing.T) { }, "disabled 
config should pass": { initConfig: func(cfg *Config) { - cfg.Enabled = false + }, expected: nil, }, "enabled config should pass": { initConfig: func(cfg *Config) { - cfg.Enabled = true cfg.Config = bucket.Config{ Backend: bucket.S3, S3: s3.Config{ @@ -87,7 +86,6 @@ func TestNew(t *testing.T) { }{ "valid config should create API": { cfg: Config{ - Enabled: true, Config: bucket.Config{ Backend: bucket.S3, S3: s3.Config{ @@ -123,7 +121,6 @@ func TestNew(t *testing.T) { func TestOverridesModuleServiceInterface(t *testing.T) { // Create the API instance with proper configuration cfg := Config{ - Enabled: true, Config: bucket.Config{ Backend: bucket.S3, S3: s3.Config{ @@ -298,8 +295,6 @@ hard_overrides: // Create the API instance with proper configuration cfg := Config{ - Enabled: true, - Config: bucket.Config{ Backend: bucket.S3, S3: s3.Config{ @@ -408,8 +403,6 @@ func TestAPITenantExtraction(t *testing.T) { // Create the API instance with proper configuration cfg := Config{ - Enabled: true, - Config: bucket.Config{ Backend: bucket.S3, S3: s3.Config{ @@ -501,8 +494,6 @@ func TestAPIBucketErrors(t *testing.T) { // Create the API instance with proper configuration cfg := Config{ - Enabled: true, - Config: bucket.Config{ Backend: bucket.S3, S3: s3.Config{ From 15f9f3cebcd55fad8f4f3758e3ec6744d12654d9 Mon Sep 17 00:00:00 2001 From: Bogdan Stancu Date: Wed, 20 Aug 2025 22:14:47 +0300 Subject: [PATCH 05/11] remove consul and unneeded flags from test Signed-off-by: Bogdan Stancu --- integration/overrides_test.go | 95 ++++++++++++++--------------------- 1 file changed, 37 insertions(+), 58 deletions(-) diff --git a/integration/overrides_test.go b/integration/overrides_test.go index 038bab9d680..7601b12fc9f 100644 --- a/integration/overrides_test.go +++ b/integration/overrides_test.go @@ -25,9 +25,6 @@ func TestOverridesAPIWithRunningCortex(t *testing.T) { require.NoError(t, err) defer s.Close() - consul := e2edb.NewConsulWithName("consul") - require.NoError(t, 
s.StartAndWaitReady(consul)) - minio := e2edb.NewMinio(9000, "cortex") require.NoError(t, s.StartAndWaitReady(minio)) @@ -52,23 +49,17 @@ func TestOverridesAPIWithRunningCortex(t *testing.T) { require.NoError(t, s3Client.Upload(context.Background(), "runtime.yaml", bytes.NewReader(runtimeConfigData))) - baseFlags := mergeFlags(AlertmanagerLocalFlags(), BlocksStorageFlags()) - flags := mergeFlags( - baseFlags, - map[string]string{ - "-target": "overrides", - - "-overrides.runtime-config-file": "runtime.yaml", - "-overrides.backend": "s3", - "-overrides.s3.access-key-id": e2edb.MinioAccessKey, - "-overrides.s3.secret-access-key": e2edb.MinioSecretKey, - "-overrides.s3.bucket-name": "cortex", - "-overrides.s3.endpoint": minio.NetworkHTTPEndpoint(), - "-overrides.s3.insecure": "true", - "-ring.store": "consul", - "-consul.hostname": consul.NetworkHTTPEndpoint(), - }, - ) + flags := map[string]string{ + "-target": "overrides", + + "-overrides.runtime-config-file": "runtime.yaml", + "-overrides.backend": "s3", + "-overrides.s3.access-key-id": e2edb.MinioAccessKey, + "-overrides.s3.secret-access-key": e2edb.MinioSecretKey, + "-overrides.s3.bucket-name": "cortex", + "-overrides.s3.endpoint": minio.NetworkHTTPEndpoint(), + "-overrides.s3.insecure": "true", + } cortexSvc := e2ecortex.NewSingleBinary("cortex-overrides", flags, "") require.NoError(t, s.StartAndWaitReady(cortexSvc)) @@ -220,23 +211,19 @@ func TestOverridesAPITenantExtraction(t *testing.T) { minio := e2edb.NewMinio(9010, "cortex") require.NoError(t, s.StartAndWaitReady(minio)) - baseFlags := mergeFlags(AlertmanagerLocalFlags(), BlocksStorageFlags()) - flags := mergeFlags( - baseFlags, - map[string]string{ - "-target": "overrides", - - "-overrides.runtime-config-file": "runtime.yaml", - "-overrides.backend": "s3", - "-overrides.s3.access-key-id": e2edb.MinioAccessKey, - "-overrides.s3.secret-access-key": e2edb.MinioSecretKey, - "-overrides.s3.bucket-name": "cortex", - "-overrides.s3.endpoint": 
minio.NetworkHTTPEndpoint(), - "-overrides.s3.insecure": "true", - "-ring.store": "consul", - "-consul.hostname": consul.NetworkHTTPEndpoint(), - }, - ) + flags := map[string]string{ + "-target": "overrides", + + "-overrides.runtime-config-file": "runtime.yaml", + "-overrides.backend": "s3", + "-overrides.s3.access-key-id": e2edb.MinioAccessKey, + "-overrides.s3.secret-access-key": e2edb.MinioSecretKey, + "-overrides.s3.bucket-name": "cortex", + "-overrides.s3.endpoint": minio.NetworkHTTPEndpoint(), + "-overrides.s3.insecure": "true", + "-ring.store": "consul", + "-consul.hostname": consul.NetworkHTTPEndpoint(), + } cortexSvc := e2ecortex.NewSingleBinary("cortex-overrides-tenant", flags, "") require.NoError(t, s.StartAndWaitReady(cortexSvc)) @@ -273,17 +260,13 @@ func TestOverridesAPIFilesystemBackendRejected(t *testing.T) { defer s.Close() t.Run("filesystem backend should be rejected", func(t *testing.T) { - baseFlags := mergeFlags(AlertmanagerLocalFlags(), BlocksStorageFlags()) - flags := mergeFlags( - baseFlags, - map[string]string{ - "-target": "overrides", - "-overrides.runtime-config-file": "runtime.yaml", - "-overrides.backend": "filesystem", - "-ring.store": "consul", - "-consul.hostname": "localhost:8500", - }, - ) + flags := map[string]string{ + "-target": "overrides", + "-overrides.runtime-config-file": "runtime.yaml", + "-overrides.backend": "filesystem", + "-ring.store": "consul", + "-consul.hostname": "localhost:8500", + } cortexSvc := e2ecortex.NewSingleBinary("cortex-overrides-filesystem", flags, "") @@ -297,16 +280,12 @@ func TestOverridesAPIFilesystemBackendRejected(t *testing.T) { }) t.Run("no backend specified should be rejected", func(t *testing.T) { - baseFlags := mergeFlags(AlertmanagerLocalFlags(), BlocksStorageFlags()) - flags := mergeFlags( - baseFlags, - map[string]string{ - "-target": "overrides", - "-overrides.runtime-config-file": "runtime.yaml", - "-ring.store": "consul", - "-consul.hostname": "localhost:8500", - }, - ) + flags := 
map[string]string{ + "-target": "overrides", + "-overrides.runtime-config-file": "runtime.yaml", + "-ring.store": "consul", + "-consul.hostname": "localhost:8500", + } cortexSvc := e2ecortex.NewSingleBinary("cortex-overrides-no-backend", flags, "") From a2a7665fde280b46ab98696d136f257d1a7465c2 Mon Sep 17 00:00:00 2001 From: Bogdan Stancu Date: Mon, 25 Aug 2025 15:57:50 +0300 Subject: [PATCH 06/11] missed some Signed-off-by: Bogdan Stancu --- integration/overrides_test.go | 9 --------- 1 file changed, 9 deletions(-) diff --git a/integration/overrides_test.go b/integration/overrides_test.go index 7601b12fc9f..e3e0781fe21 100644 --- a/integration/overrides_test.go +++ b/integration/overrides_test.go @@ -205,9 +205,6 @@ func TestOverridesAPITenantExtraction(t *testing.T) { require.NoError(t, err) defer s.Close() - consul := e2edb.NewConsulWithName("consul-tenant") - require.NoError(t, s.StartAndWaitReady(consul)) - minio := e2edb.NewMinio(9010, "cortex") require.NoError(t, s.StartAndWaitReady(minio)) @@ -221,8 +218,6 @@ func TestOverridesAPITenantExtraction(t *testing.T) { "-overrides.s3.bucket-name": "cortex", "-overrides.s3.endpoint": minio.NetworkHTTPEndpoint(), "-overrides.s3.insecure": "true", - "-ring.store": "consul", - "-consul.hostname": consul.NetworkHTTPEndpoint(), } cortexSvc := e2ecortex.NewSingleBinary("cortex-overrides-tenant", flags, "") @@ -264,8 +259,6 @@ func TestOverridesAPIFilesystemBackendRejected(t *testing.T) { "-target": "overrides", "-overrides.runtime-config-file": "runtime.yaml", "-overrides.backend": "filesystem", - "-ring.store": "consul", - "-consul.hostname": "localhost:8500", } cortexSvc := e2ecortex.NewSingleBinary("cortex-overrides-filesystem", flags, "") @@ -283,8 +276,6 @@ func TestOverridesAPIFilesystemBackendRejected(t *testing.T) { flags := map[string]string{ "-target": "overrides", "-overrides.runtime-config-file": "runtime.yaml", - "-ring.store": "consul", - "-consul.hostname": "localhost:8500", } cortexSvc := 
e2ecortex.NewSingleBinary("cortex-overrides-no-backend", flags, "") From 5e21b2234825b429d535d2b55c0112d2cfbade23 Mon Sep 17 00:00:00 2001 From: Bogdan Stancu Date: Mon, 8 Sep 2025 15:21:26 +0300 Subject: [PATCH 07/11] suggestions Signed-off-by: Bogdan Stancu --- integration/overrides_test.go | 99 +++++++++-------------- pkg/api/api.go | 2 +- pkg/cortex/cortex.go | 2 - pkg/cortex/modules.go | 2 +- pkg/overrides/overrides.go | 136 ++------------------------------ pkg/overrides/overrides_test.go | 113 ++++++++++++++------------ 6 files changed, 110 insertions(+), 244 deletions(-) diff --git a/integration/overrides_test.go b/integration/overrides_test.go index e3e0781fe21..1b4297bec54 100644 --- a/integration/overrides_test.go +++ b/integration/overrides_test.go @@ -52,13 +52,13 @@ func TestOverridesAPIWithRunningCortex(t *testing.T) { flags := map[string]string{ "-target": "overrides", - "-overrides.runtime-config-file": "runtime.yaml", - "-overrides.backend": "s3", - "-overrides.s3.access-key-id": e2edb.MinioAccessKey, - "-overrides.s3.secret-access-key": e2edb.MinioSecretKey, - "-overrides.s3.bucket-name": "cortex", - "-overrides.s3.endpoint": minio.NetworkHTTPEndpoint(), - "-overrides.s3.insecure": "true", + "-runtime-config.file": "runtime.yaml", + "-runtime-config.backend": "s3", + "-runtime-config.s3.access-key-id": e2edb.MinioAccessKey, + "-runtime-config.s3.secret-access-key": e2edb.MinioSecretKey, + "-runtime-config.s3.bucket-name": "cortex", + "-runtime-config.s3.endpoint": minio.NetworkHTTPEndpoint(), + "-runtime-config.s3.insecure": "true", } cortexSvc := e2ecortex.NewSingleBinary("cortex-overrides", flags, "") @@ -100,7 +100,7 @@ func TestOverridesAPIWithRunningCortex(t *testing.T) { assert.Empty(t, overrides) }) - t.Run("PUT overrides for new user", func(t *testing.T) { + t.Run("POST overrides for new user", func(t *testing.T) { newOverrides := map[string]interface{}{ "ingestion_rate": 6000, "ingestion_burst_size": 7000, @@ -108,7 +108,7 @@ func 
TestOverridesAPIWithRunningCortex(t *testing.T) { requestBody, err := json.Marshal(newOverrides) require.NoError(t, err) - req, err := http.NewRequest("PUT", "http://"+cortexSvc.HTTPEndpoint()+"/api/v1/user-overrides", bytes.NewReader(requestBody)) + req, err := http.NewRequest("POST", "http://"+cortexSvc.HTTPEndpoint()+"/api/v1/user-overrides", bytes.NewReader(requestBody)) require.NoError(t, err) req.Header.Set("X-Scope-OrgID", "user3") req.Header.Set("Content-Type", "application/json") @@ -137,14 +137,14 @@ func TestOverridesAPIWithRunningCortex(t *testing.T) { assert.Equal(t, float64(7000), savedOverrides["ingestion_burst_size"]) }) - t.Run("PUT overrides with invalid limit", func(t *testing.T) { + t.Run("POST overrides with invalid limit", func(t *testing.T) { invalidOverrides := map[string]interface{}{ "invalid_limit": 5000, } requestBody, err := json.Marshal(invalidOverrides) require.NoError(t, err) - req, err := http.NewRequest("PUT", "http://"+cortexSvc.HTTPEndpoint()+"/api/v1/user-overrides", bytes.NewReader(requestBody)) + req, err := http.NewRequest("POST", "http://"+cortexSvc.HTTPEndpoint()+"/api/v1/user-overrides", bytes.NewReader(requestBody)) require.NoError(t, err) req.Header.Set("X-Scope-OrgID", "user4") req.Header.Set("Content-Type", "application/json") @@ -156,8 +156,8 @@ func TestOverridesAPIWithRunningCortex(t *testing.T) { assert.Equal(t, http.StatusBadRequest, resp.StatusCode) }) - t.Run("PUT overrides with invalid JSON", func(t *testing.T) { - req, err := http.NewRequest("PUT", "http://"+cortexSvc.HTTPEndpoint()+"/api/v1/user-overrides", bytes.NewReader([]byte("invalid json"))) + t.Run("POST overrides with invalid JSON", func(t *testing.T) { + req, err := http.NewRequest("POST", "http://"+cortexSvc.HTTPEndpoint()+"/api/v1/user-overrides", bytes.NewReader([]byte("invalid json"))) require.NoError(t, err) req.Header.Set("X-Scope-OrgID", "user5") req.Header.Set("Content-Type", "application/json") @@ -208,16 +208,34 @@ func 
TestOverridesAPITenantExtraction(t *testing.T) { minio := e2edb.NewMinio(9010, "cortex") require.NoError(t, s.StartAndWaitReady(minio)) + // Upload an empty runtime config file to S3 + runtimeConfig := map[string]interface{}{ + "overrides": map[string]interface{}{}, + } + runtimeConfigData, err := yaml.Marshal(runtimeConfig) + require.NoError(t, err) + + s3Client, err := s3.NewBucketWithConfig(nil, s3.Config{ + Endpoint: minio.HTTPEndpoint(), + Insecure: true, + Bucket: "cortex", + AccessKey: e2edb.MinioAccessKey, + SecretKey: e2edb.MinioSecretKey, + }, "overrides-test-tenant", nil) + require.NoError(t, err) + + require.NoError(t, s3Client.Upload(context.Background(), "runtime.yaml", bytes.NewReader(runtimeConfigData))) + flags := map[string]string{ "-target": "overrides", - "-overrides.runtime-config-file": "runtime.yaml", - "-overrides.backend": "s3", - "-overrides.s3.access-key-id": e2edb.MinioAccessKey, - "-overrides.s3.secret-access-key": e2edb.MinioSecretKey, - "-overrides.s3.bucket-name": "cortex", - "-overrides.s3.endpoint": minio.NetworkHTTPEndpoint(), - "-overrides.s3.insecure": "true", + "-runtime-config.file": "runtime.yaml", + "-runtime-config.backend": "s3", + "-runtime-config.s3.access-key-id": e2edb.MinioAccessKey, + "-runtime-config.s3.secret-access-key": e2edb.MinioSecretKey, + "-runtime-config.s3.bucket-name": "cortex", + "-runtime-config.s3.endpoint": minio.NetworkHTTPEndpoint(), + "-runtime-config.s3.insecure": "true", } cortexSvc := e2ecortex.NewSingleBinary("cortex-overrides-tenant", flags, "") @@ -248,44 +266,3 @@ func TestOverridesAPITenantExtraction(t *testing.T) { require.NoError(t, s.Stop(cortexSvc)) } - -func TestOverridesAPIFilesystemBackendRejected(t *testing.T) { - s, err := e2e.NewScenario(networkName) - require.NoError(t, err) - defer s.Close() - - t.Run("filesystem backend should be rejected", func(t *testing.T) { - flags := map[string]string{ - "-target": "overrides", - "-overrides.runtime-config-file": "runtime.yaml", - 
"-overrides.backend": "filesystem", - } - - cortexSvc := e2ecortex.NewSingleBinary("cortex-overrides-filesystem", flags, "") - - err = s.StartAndWaitReady(cortexSvc) - if err == nil { - t.Error("Expected Cortex to fail to start with filesystem backend, but it started successfully") - require.NoError(t, s.Stop(cortexSvc)) - } else { - t.Logf("Expected failure with filesystem backend: %v", err) - } - }) - - t.Run("no backend specified should be rejected", func(t *testing.T) { - flags := map[string]string{ - "-target": "overrides", - "-overrides.runtime-config-file": "runtime.yaml", - } - - cortexSvc := e2ecortex.NewSingleBinary("cortex-overrides-no-backend", flags, "") - - err = s.StartAndWaitReady(cortexSvc) - if err == nil { - t.Error("Expected Cortex to fail to start with no backend specified, but it started successfully") - require.NoError(t, s.Stop(cortexSvc)) - } else { - t.Logf("Expected failure with no backend specified: %v", err) - } - }) -} diff --git a/pkg/api/api.go b/pkg/api/api.go index e0887d6b22e..fa27b2bb154 100644 --- a/pkg/api/api.go +++ b/pkg/api/api.go @@ -390,7 +390,7 @@ func (a *API) RegisterRulerAPI(r *ruler.API) { func (a *API) RegisterOverrides(o *overrides.API) { // Register individual overrides API routes with the main API a.RegisterRoute("/api/v1/user-overrides", http.HandlerFunc(o.GetOverrides), true, "GET") - a.RegisterRoute("/api/v1/user-overrides", http.HandlerFunc(o.SetOverrides), true, "PUT") + a.RegisterRoute("/api/v1/user-overrides", http.HandlerFunc(o.SetOverrides), true, "POST") a.RegisterRoute("/api/v1/user-overrides", http.HandlerFunc(o.DeleteOverrides), true, "DELETE") // Add link to the index page diff --git a/pkg/cortex/cortex.go b/pkg/cortex/cortex.go index 6de1e11cbc9..4c7c2f031ce 100644 --- a/pkg/cortex/cortex.go +++ b/pkg/cortex/cortex.go @@ -127,7 +127,6 @@ type Config struct { RuntimeConfig runtimeconfig.Config `yaml:"runtime_config"` MemberlistKV memberlist.KVConfig `yaml:"memberlist"` QueryScheduler scheduler.Config 
`yaml:"query_scheduler"` - Overrides overrides.Config `yaml:"overrides"` Tracing tracing.Config `yaml:"tracing"` } @@ -177,7 +176,6 @@ func (c *Config) RegisterFlags(f *flag.FlagSet) { c.RuntimeConfig.RegisterFlags(f) c.MemberlistKV.RegisterFlags(f) c.QueryScheduler.RegisterFlags(f) - c.Overrides.RegisterFlags(f) c.Tracing.RegisterFlags(f) } diff --git a/pkg/cortex/modules.go b/pkg/cortex/modules.go index 110e9b03125..34e550813c9 100644 --- a/pkg/cortex/modules.go +++ b/pkg/cortex/modules.go @@ -211,7 +211,7 @@ func (t *Cortex) initOverridesConfig() (services.Service, error) { func (t *Cortex) initOverrides() (services.Service, error) { - overridesAPI, err := overrides.New(t.Cfg.Overrides, util_log.Logger, prometheus.DefaultRegisterer) + overridesAPI, err := overrides.New(t.Cfg.RuntimeConfig, util_log.Logger, prometheus.DefaultRegisterer) if err != nil { return nil, fmt.Errorf("failed to create overrides API: %w", err) } diff --git a/pkg/overrides/overrides.go b/pkg/overrides/overrides.go index ce0569b09b7..47f38e90e9c 100644 --- a/pkg/overrides/overrides.go +++ b/pkg/overrides/overrides.go @@ -2,10 +2,7 @@ package overrides import ( "context" - "errors" - "flag" "fmt" - "time" "github.com/go-kit/log" "github.com/go-kit/log/level" @@ -13,143 +10,26 @@ import ( "github.com/thanos-io/objstore" "github.com/cortexproject/cortex/pkg/storage/bucket" + "github.com/cortexproject/cortex/pkg/util/runtimeconfig" "github.com/cortexproject/cortex/pkg/util/services" ) const ( - // Error messages - ErrNoStorageBackendSpecified = "overrides module requires a storage backend to be specified" - ErrFilesystemBackendNotSupported = "filesystem backend is not supported for overrides module; use S3, GCS, Azure, or Swift instead" - ErrInvalidBucketConfiguration = "invalid bucket configuration" ErrInvalidOverridesConfiguration = "invalid overrides configuration" ErrFailedToCreateBucketClient = "failed to create bucket client for overrides" ) -// Config holds configuration for the overrides 
module -type Config struct { - // Enable the overrides API module - - // Path to the runtime configuration file that can be updated via the overrides API - // CLI flag: -overrides.runtime-config-file - RuntimeConfigFile string `yaml:"runtime_config_file"` - - // Storage configuration for the runtime config file - // All bucket backends (S3, GCS, Azure, Swift) are supported, but not filesystem - bucket.Config `yaml:",inline"` -} - -// RegisterFlags registers the overrides module flags -func (c *Config) RegisterFlags(f *flag.FlagSet) { - - f.StringVar(&c.RuntimeConfigFile, "overrides.runtime-config-file", "runtime.yaml", "Path to the runtime configuration file that can be updated via the overrides API") - - c.RegisterFlagsWithPrefix("overrides.", f) -} - -// Validate validates the configuration and returns an error if validation fails -func (c *Config) Validate() error { - - if c.RuntimeConfigFile == "" { - c.RuntimeConfigFile = "runtime.yaml" - } - - if c.Backend == "" { - c.Backend = bucket.S3 - } - - if c.Backend == bucket.Filesystem { - return errors.New(ErrFilesystemBackendNotSupported) - } - - if c.Backend == bucket.S3 { - if c.S3.SignatureVersion == "" { - c.S3.SignatureVersion = "v4" - } - if c.S3.BucketLookupType == "" { - c.S3.BucketLookupType = "auto" - } - if !c.S3.SendContentMd5 { - c.S3.SendContentMd5 = true - } - if c.S3.HTTP.IdleConnTimeout == 0 { - c.S3.HTTP.IdleConnTimeout = 90 * time.Second - } - if c.S3.HTTP.ResponseHeaderTimeout == 0 { - c.S3.HTTP.ResponseHeaderTimeout = 2 * time.Minute - } - if c.S3.HTTP.TLSHandshakeTimeout == 0 { - c.S3.HTTP.TLSHandshakeTimeout = 10 * time.Second - } - if c.S3.HTTP.ExpectContinueTimeout == 0 { - c.S3.HTTP.ExpectContinueTimeout = 1 * time.Second - } - if c.S3.HTTP.MaxIdleConns == 0 { - c.S3.HTTP.MaxIdleConns = 100 - } - if c.S3.HTTP.MaxIdleConnsPerHost == 0 { - c.S3.HTTP.MaxIdleConnsPerHost = 100 - } - } - - if c.Backend == bucket.Azure { - if c.Azure.MaxRetries == 0 { - c.Azure.MaxRetries = 20 - } - if 
c.Azure.IdleConnTimeout == 0 { - c.Azure.IdleConnTimeout = 90 * time.Second - } - if c.Azure.ResponseHeaderTimeout == 0 { - c.Azure.ResponseHeaderTimeout = 2 * time.Minute - } - if c.Azure.TLSHandshakeTimeout == 0 { - c.Azure.TLSHandshakeTimeout = 10 * time.Second - } - if c.Azure.ExpectContinueTimeout == 0 { - c.Azure.ExpectContinueTimeout = 1 * time.Second - } - if c.Azure.MaxIdleConns == 0 { - c.Azure.MaxIdleConns = 100 - } - if c.Azure.MaxIdleConnsPerHost == 0 { - c.Azure.MaxIdleConnsPerHost = 100 - } - } - - if c.Backend == bucket.Swift { - if c.Swift.AuthVersion == 0 { - c.Swift.AuthVersion = 0 - } - if c.Swift.MaxRetries == 0 { - c.Swift.MaxRetries = 3 - } - if c.Swift.ConnectTimeout == 0 { - c.Swift.ConnectTimeout = 10 * time.Second - } - if c.Swift.RequestTimeout == 0 { - c.Swift.RequestTimeout = 5 * time.Second - } - } - - if err := c.Config.Validate(); err != nil { - return fmt.Errorf("%s: %w", ErrInvalidBucketConfiguration, err) - } - - return nil -} - -// API represents the overrides API module type API struct { services.Service - cfg Config + cfg runtimeconfig.Config logger log.Logger registerer prometheus.Registerer bucketClient objstore.Bucket runtimeConfigPath string } -// New creates a new overrides API instance -func New(cfg Config, logger log.Logger, registerer prometheus.Registerer) (*API, error) { - if err := cfg.Validate(); err != nil { +func New(cfg runtimeconfig.Config, logger log.Logger, registerer prometheus.Registerer) (*API, error) { + if err := cfg.StorageConfig.Validate(); err != nil { return nil, fmt.Errorf("%s: %w", ErrInvalidOverridesConfiguration, err) } @@ -163,18 +43,18 @@ func New(cfg Config, logger log.Logger, registerer prometheus.Registerer) (*API, } func (a *API) starting(ctx context.Context) error { - level.Info(a.logger).Log("msg", "overrides API starting", "runtime_config_file", a.cfg.RuntimeConfigFile, "backend", a.cfg.Backend) + level.Info(a.logger).Log("msg", "overrides API starting", "runtime_config_file", 
a.cfg.LoadPath, "backend", a.cfg.StorageConfig.Backend) - bucketClient, err := bucket.NewClient(ctx, a.cfg.Config, nil, "overrides", a.logger, a.registerer) + bucketClient, err := bucket.NewClient(ctx, a.cfg.StorageConfig, nil, "overrides", a.logger, a.registerer) if err != nil { level.Error(a.logger).Log("msg", ErrFailedToCreateBucketClient, "err", err) return fmt.Errorf("%s: %w", ErrFailedToCreateBucketClient, err) } a.bucketClient = bucketClient - a.runtimeConfigPath = a.cfg.RuntimeConfigFile + a.runtimeConfigPath = a.cfg.LoadPath - level.Info(a.logger).Log("msg", "overrides API started successfully", "backend", a.cfg.Backend) + level.Info(a.logger).Log("msg", "overrides API started successfully", "backend", a.cfg.StorageConfig.Backend) return nil } diff --git a/pkg/overrides/overrides_test.go b/pkg/overrides/overrides_test.go index 28f42a2efc0..5b887378820 100644 --- a/pkg/overrides/overrides_test.go +++ b/pkg/overrides/overrides_test.go @@ -17,28 +17,26 @@ import ( "github.com/cortexproject/cortex/pkg/storage/bucket/s3" "github.com/cortexproject/cortex/pkg/util/flagext" "github.com/cortexproject/cortex/pkg/util/log" + "github.com/cortexproject/cortex/pkg/util/runtimeconfig" "github.com/cortexproject/cortex/pkg/util/services" ) func TestConfig_Validate(t *testing.T) { t.Parallel() tests := map[string]struct { - initConfig func(*Config) + initConfig func(*runtimeconfig.Config) expected error }{ "default config should pass": { - initConfig: func(_ *Config) {}, - expected: nil, - }, - "disabled config should pass": { - initConfig: func(cfg *Config) { - + initConfig: func(cfg *runtimeconfig.Config) { + // Set default values for bucket config + flagext.DefaultValues(&cfg.StorageConfig) }, expected: nil, }, - "enabled config should pass": { - initConfig: func(cfg *Config) { - cfg.Config = bucket.Config{ + "s3 config should pass": { + initConfig: func(cfg *runtimeconfig.Config) { + cfg.StorageConfig = bucket.Config{ Backend: bucket.S3, S3: s3.Config{ AccessKeyID: 
"test-access-key", @@ -48,6 +46,8 @@ func TestConfig_Validate(t *testing.T) { Insecure: true, }, } + // Set default values before validation + flagext.DefaultValues(&cfg.StorageConfig.S3) }, expected: nil, }, @@ -56,21 +56,21 @@ func TestConfig_Validate(t *testing.T) { for testName, testData := range tests { t.Run(testName, func(t *testing.T) { t.Parallel() - cfg := Config{} + cfg := runtimeconfig.Config{} testData.initConfig(&cfg) if testData.expected == nil { - assert.NoError(t, cfg.Validate()) + assert.NoError(t, cfg.StorageConfig.Validate()) } else { - assert.ErrorIs(t, cfg.Validate(), testData.expected) + assert.ErrorIs(t, cfg.StorageConfig.Validate(), testData.expected) } }) } } func TestConfig_RegisterFlags(t *testing.T) { - cfg := Config{} + cfg := runtimeconfig.Config{} // Test that flags are registered without panicking require.NotPanics(t, func() { @@ -81,22 +81,27 @@ func TestConfig_RegisterFlags(t *testing.T) { func TestNew(t *testing.T) { tests := map[string]struct { - cfg Config + cfg runtimeconfig.Config expectError bool }{ "valid config should create API": { - cfg: Config{ - Config: bucket.Config{ - Backend: bucket.S3, - S3: s3.Config{ - AccessKeyID: "test-access-key", - SecretAccessKey: flagext.Secret{Value: "test-secret-key"}, - BucketName: "test-bucket", - Endpoint: "localhost:9000", - Insecure: true, + cfg: func() runtimeconfig.Config { + cfg := runtimeconfig.Config{ + StorageConfig: bucket.Config{ + Backend: bucket.S3, + S3: s3.Config{ + AccessKeyID: "test-access-key", + SecretAccessKey: flagext.Secret{Value: "test-secret-key"}, + BucketName: "test-bucket", + Endpoint: "localhost:9000", + Insecure: true, + }, }, - }, - }, + } + // Set default values before validation + flagext.DefaultValues(&cfg.StorageConfig.S3) + return cfg + }(), expectError: false, }, } @@ -111,8 +116,6 @@ func TestNew(t *testing.T) { } else { assert.NoError(t, err) assert.NotNil(t, api) - // Don't compare the entire config since defaults may modify it - // Just verify the 
API was created successfully } }) } @@ -120,8 +123,8 @@ func TestNew(t *testing.T) { func TestOverridesModuleServiceInterface(t *testing.T) { // Create the API instance with proper configuration - cfg := Config{ - Config: bucket.Config{ + cfg := runtimeconfig.Config{ + StorageConfig: bucket.Config{ Backend: bucket.S3, S3: s3.Config{ AccessKeyID: "test-access-key", @@ -132,6 +135,8 @@ func TestOverridesModuleServiceInterface(t *testing.T) { }, }, } + // Set default values before validation + flagext.DefaultValues(&cfg.StorageConfig.S3) api, err := New(cfg, log.Logger, prometheus.DefaultRegisterer) require.NoError(t, err) require.NotNil(t, api) @@ -205,16 +210,16 @@ func TestAPIEndpoints(t *testing.T) { }, }, { - name: "PUT overrides - no tenant ID", - method: "PUT", + name: "POST overrides - no tenant ID", + method: "POST", path: "/api/v1/user-overrides", tenantID: "", requestBody: map[string]interface{}{"ingestion_rate": 5000}, expectedStatus: http.StatusUnauthorized, }, { - name: "PUT overrides - valid tenant ID, valid overrides", - method: "PUT", + name: "POST overrides - valid tenant ID, valid overrides", + method: "POST", path: "/api/v1/user-overrides", tenantID: "user789", requestBody: map[string]interface{}{"ingestion_rate": 5000, "ruler_max_rules_per_rule_group": 10}, @@ -226,8 +231,8 @@ func TestAPIEndpoints(t *testing.T) { }, }, { - name: "PUT overrides - invalid limit name", - method: "PUT", + name: "POST overrides - invalid limit name", + method: "POST", path: "/api/v1/user-overrides", tenantID: "user999", requestBody: map[string]interface{}{"invalid_limit": 5000}, @@ -235,16 +240,16 @@ func TestAPIEndpoints(t *testing.T) { }, { - name: "PUT overrides - invalid JSON", - method: "PUT", + name: "POST overrides - invalid JSON", + method: "POST", path: "/api/v1/user-overrides", tenantID: "user999", requestBody: "invalid json", expectedStatus: http.StatusBadRequest, }, { - name: "PUT overrides - exceeding hard limit from runtime config", - method: "PUT", + 
name: "POST overrides - exceeding hard limit from runtime config", + method: "POST", path: "/api/v1/user-overrides", tenantID: "user999", requestBody: map[string]interface{}{"ingestion_rate": 1500000}, // Exceeds hard limit of 1000000 @@ -294,8 +299,8 @@ hard_overrides: } // Create the API instance with proper configuration - cfg := Config{ - Config: bucket.Config{ + cfg := runtimeconfig.Config{ + StorageConfig: bucket.Config{ Backend: bucket.S3, S3: s3.Config{ AccessKeyID: "test-access-key", @@ -306,6 +311,8 @@ hard_overrides: }, }, } + // Set default values before validation + flagext.DefaultValues(&cfg.StorageConfig.S3) api, err := New(cfg, log.Logger, prometheus.DefaultRegisterer) require.NoError(t, err) require.NotNil(t, api) @@ -341,7 +348,7 @@ hard_overrides: switch tt.method { case "GET": api.GetOverrides(recorder, req) - case "PUT": + case "POST": api.SetOverrides(recorder, req) case "DELETE": api.DeleteOverrides(recorder, req) @@ -402,8 +409,8 @@ func TestAPITenantExtraction(t *testing.T) { } // Create the API instance with proper configuration - cfg := Config{ - Config: bucket.Config{ + cfg := runtimeconfig.Config{ + StorageConfig: bucket.Config{ Backend: bucket.S3, S3: s3.Config{ AccessKeyID: "test-access-key", @@ -414,6 +421,8 @@ func TestAPITenantExtraction(t *testing.T) { }, }, } + // Set default values before validation + flagext.DefaultValues(&cfg.StorageConfig.S3) api, err := New(cfg, log.Logger, prometheus.DefaultRegisterer) require.NoError(t, err) require.NotNil(t, api) @@ -463,8 +472,8 @@ func TestAPIBucketErrors(t *testing.T) { expectedStatus: http.StatusOK, // Current implementation treats errors as "not found" }, { - name: "PUT overrides - bucket upload error", - method: "PUT", + name: "POST overrides - bucket upload error", + method: "POST", tenantID: "user456", setupMock: func(mock *bucket.ClientMock) { // First read succeeds, then upload fails @@ -493,8 +502,8 @@ func TestAPIBucketErrors(t *testing.T) { tt.setupMock(mockBucket) // Create 
the API instance with proper configuration - cfg := Config{ - Config: bucket.Config{ + cfg := runtimeconfig.Config{ + StorageConfig: bucket.Config{ Backend: bucket.S3, S3: s3.Config{ AccessKeyID: "test-access-key", @@ -505,6 +514,8 @@ func TestAPIBucketErrors(t *testing.T) { }, }, } + // Set default values before validation + flagext.DefaultValues(&cfg.StorageConfig.S3) api, err := New(cfg, log.Logger, prometheus.DefaultRegisterer) require.NoError(t, err) require.NotNil(t, api) @@ -515,7 +526,7 @@ func TestAPIBucketErrors(t *testing.T) { // Create the request var req *http.Request - if tt.method == "PUT" { + if tt.method == "POST" { requestBody := map[string]interface{}{"ingestion_rate": 5000} body, err := json.Marshal(requestBody) require.NoError(t, err) @@ -534,7 +545,7 @@ func TestAPIBucketErrors(t *testing.T) { switch tt.method { case "GET": api.GetOverrides(recorder, req) - case "PUT": + case "POST": api.SetOverrides(recorder, req) case "DELETE": api.DeleteOverrides(recorder, req) From dff25e38ae88f657d13a104f8a89e04644b499a1 Mon Sep 17 00:00:00 2001 From: Bogdan Stancu Date: Sat, 20 Sep 2025 22:46:54 +0300 Subject: [PATCH 08/11] sugestions from dsabsay Signed-off-by: Bogdan Stancu --- integration/overrides_test.go | 120 +++++++++++++++++++++++++++++--- pkg/overrides/api.go | 51 ++++++++------ pkg/overrides/limits.go | 36 ++++------ pkg/overrides/overrides_test.go | 18 ++++- 4 files changed, 168 insertions(+), 57 deletions(-) diff --git a/integration/overrides_test.go b/integration/overrides_test.go index 22e00285e44..1b2c2239660 100644 --- a/integration/overrides_test.go +++ b/integration/overrides_test.go @@ -99,13 +99,7 @@ func TestOverridesAPIWithRunningCortex(t *testing.T) { require.NoError(t, err) defer resp.Body.Close() - assert.Equal(t, http.StatusOK, resp.StatusCode) - - var overrides map[string]interface{} - err = json.NewDecoder(resp.Body).Decode(&overrides) - require.NoError(t, err) - - assert.Empty(t, overrides) + assert.Equal(t, 
http.StatusBadRequest, resp.StatusCode) }) t.Run("POST overrides for new user", func(t *testing.T) { @@ -196,13 +190,119 @@ func TestOverridesAPIWithRunningCortex(t *testing.T) { require.NoError(t, err) defer resp.Body.Close() + assert.Equal(t, http.StatusBadRequest, resp.StatusCode) + }) + + require.NoError(t, s.Stop(cortexSvc)) +} + +func TestOverridesAPIHardLimits(t *testing.T) { + s, err := e2e.NewScenario(networkName) + require.NoError(t, err) + defer s.Close() + + minio := e2edb.NewMinio(9001, "cortex") + require.NoError(t, s.StartAndWaitReady(minio)) + + // Runtime config with hard limits + runtimeConfig := map[string]interface{}{ + "overrides": map[string]interface{}{}, + "hard_overrides": map[string]interface{}{ + "user1": map[string]interface{}{ + "ingestion_rate": 10000, + "max_global_series_per_user": 50000, + }, + }, + "api_allowed_limits": []string{ + "ingestion_rate", + "max_global_series_per_user", + }, + } + runtimeConfigData, err := yaml.Marshal(runtimeConfig) + require.NoError(t, err) + + s3Client, err := s3.NewBucketWithConfig(nil, s3.Config{ + Endpoint: minio.HTTPEndpoint(), + Insecure: true, + Bucket: "cortex", + AccessKey: e2edb.MinioAccessKey, + SecretKey: e2edb.MinioSecretKey, + }, "overrides-test-hard-limits", nil) + require.NoError(t, err) + + require.NoError(t, s3Client.Upload(context.Background(), "runtime.yaml", bytes.NewReader(runtimeConfigData))) + + flags := map[string]string{ + "-target": "overrides", + + "-runtime-config.file": "runtime.yaml", + "-runtime-config.backend": "s3", + "-runtime-config.s3.access-key-id": e2edb.MinioAccessKey, + "-runtime-config.s3.secret-access-key": e2edb.MinioSecretKey, + "-runtime-config.s3.bucket-name": "cortex", + "-runtime-config.s3.endpoint": minio.NetworkHTTPEndpoint(), + "-runtime-config.s3.insecure": "true", + } + + cortexSvc := e2ecortex.NewSingleBinary("cortex-overrides-hard-limits", flags, "") + require.NoError(t, s.StartAndWaitReady(cortexSvc)) + + t.Run("POST overrides within hard 
limits", func(t *testing.T) { + overrides := map[string]interface{}{ + "ingestion_rate": 5000, // Within hard limit of 10000 + "max_global_series_per_user": 25000, // Within hard limit of 50000 + } + requestBody, err := json.Marshal(overrides) + require.NoError(t, err) + + req, err := http.NewRequest("POST", "http://"+cortexSvc.HTTPEndpoint()+"/api/v1/user-overrides", bytes.NewReader(requestBody)) + require.NoError(t, err) + req.Header.Set("X-Scope-OrgID", "user1") + req.Header.Set("Content-Type", "application/json") + + resp, err := http.DefaultClient.Do(req) + require.NoError(t, err) + defer resp.Body.Close() + assert.Equal(t, http.StatusOK, resp.StatusCode) + }) - var overrides map[string]interface{} - err = json.NewDecoder(resp.Body).Decode(&overrides) + t.Run("POST overrides exceeding hard limits", func(t *testing.T) { + overrides := map[string]interface{}{ + "ingestion_rate": 15000, // Exceeds hard limit of 10000 + } + requestBody, err := json.Marshal(overrides) + require.NoError(t, err) + + req, err := http.NewRequest("POST", "http://"+cortexSvc.HTTPEndpoint()+"/api/v1/user-overrides", bytes.NewReader(requestBody)) + require.NoError(t, err) + req.Header.Set("X-Scope-OrgID", "user1") + req.Header.Set("Content-Type", "application/json") + + resp, err := http.DefaultClient.Do(req) + require.NoError(t, err) + defer resp.Body.Close() + + assert.Equal(t, http.StatusBadRequest, resp.StatusCode) + }) + + t.Run("POST overrides for user without hard limits", func(t *testing.T) { + overrides := map[string]interface{}{ + "ingestion_rate": 20000, // No hard limits for user2 + } + requestBody, err := json.Marshal(overrides) + require.NoError(t, err) + + req, err := http.NewRequest("POST", "http://"+cortexSvc.HTTPEndpoint()+"/api/v1/user-overrides", bytes.NewReader(requestBody)) + require.NoError(t, err) + req.Header.Set("X-Scope-OrgID", "user2") + req.Header.Set("Content-Type", "application/json") + + resp, err := http.DefaultClient.Do(req) require.NoError(t, err) + defer 
resp.Body.Close() - assert.Empty(t, overrides) + assert.Equal(t, http.StatusOK, resp.StatusCode) }) require.NoError(t, s.Stop(cortexSvc)) diff --git a/pkg/overrides/api.go b/pkg/overrides/api.go index 6b718530dc5..205ccb13801 100644 --- a/pkg/overrides/api.go +++ b/pkg/overrides/api.go @@ -16,14 +16,9 @@ import ( ) const ( - // HTTP status codes - StatusOK = 200 - StatusBadRequest = 400 - StatusUnauthorized = 401 - StatusInternalServerError = 500 - // Error messages - ErrInvalidJSON = "Invalid JSON" + ErrInvalidJSON = "Invalid JSON" + ErrUserNotFound = "User not found" // Runtime config errors ErrRuntimeConfig = "runtime config read error" @@ -39,7 +34,8 @@ func (a *API) getAllowedLimitsFromBucket(ctx context.Context) ([]string, error) var config runtimeconfig.RuntimeConfigValues if err := yaml.NewDecoder(reader).Decode(&config); err != nil { - return []string{}, nil // No allowed limits if config can't be decoded + level.Error(a.logger).Log("msg", "failed to decode runtime config", "err", err) + return []string{}, fmt.Errorf("failed to decode runtime config") } return config.APIAllowedLimits, nil @@ -49,14 +45,19 @@ func (a *API) getAllowedLimitsFromBucket(ctx context.Context) ([]string, error) func (a *API) GetOverrides(w http.ResponseWriter, r *http.Request) { userID, _, err := tenant.ExtractTenantIDFromHTTPRequest(r) if err != nil { - http.Error(w, err.Error(), StatusUnauthorized) + http.Error(w, err.Error(), http.StatusUnauthorized) return } // Read overrides from bucket storage overrides, err := a.getOverridesFromBucket(r.Context(), userID) if err != nil { - http.Error(w, err.Error(), StatusInternalServerError) + if err.Error() == ErrUserNotFound { + http.Error(w, "User not found", http.StatusBadRequest) + } else { + level.Error(a.logger).Log("msg", "failed to get overrides from bucket", "userID", userID, "err", err) + http.Error(w, "Internal server error", http.StatusInternalServerError) + } return } @@ -72,65 +73,70 @@ func (a *API) GetOverrides(w 
http.ResponseWriter, r *http.Request) { func (a *API) SetOverrides(w http.ResponseWriter, r *http.Request) { userID, _, err := tenant.ExtractTenantIDFromHTTPRequest(r) if err != nil { - http.Error(w, err.Error(), StatusUnauthorized) + http.Error(w, err.Error(), http.StatusUnauthorized) return } var overrides map[string]interface{} if err := json.NewDecoder(r.Body).Decode(&overrides); err != nil { - http.Error(w, ErrInvalidJSON, StatusBadRequest) + http.Error(w, ErrInvalidJSON, http.StatusBadRequest) return } // Get allowed limits from runtime config allowedLimits, err := a.getAllowedLimitsFromBucket(r.Context()) if err != nil { - http.Error(w, "Failed to read allowed limits", StatusInternalServerError) + level.Error(a.logger).Log("msg", "failed to get allowed limits from bucket", "userID", userID, "err", err) + http.Error(w, "Internal server error", http.StatusInternalServerError) return } // Validate that only allowed limits are being changed if err := ValidateOverrides(overrides, allowedLimits); err != nil { - http.Error(w, err.Error(), StatusBadRequest) + level.Error(a.logger).Log("msg", "invalid overrides validation", "userID", userID, "err", err) + http.Error(w, "Invalid overrides", http.StatusBadRequest) return } // Validate that values don't exceed hard limits from runtime config if err := a.validateHardLimits(overrides, userID); err != nil { - http.Error(w, err.Error(), StatusBadRequest) + level.Error(a.logger).Log("msg", "hard limits validation failed", "userID", userID, "err", err) + http.Error(w, "Invalid overrides", http.StatusBadRequest) return } // Write overrides to bucket storage if err := a.setOverridesToBucket(r.Context(), userID, overrides); err != nil { - http.Error(w, err.Error(), StatusInternalServerError) + level.Error(a.logger).Log("msg", "failed to set overrides to bucket", "userID", userID, "err", err) + http.Error(w, "Internal server error", http.StatusInternalServerError) return } - w.WriteHeader(StatusOK) + w.WriteHeader(http.StatusOK) 
} // DeleteOverrides removes tenant-specific overrides func (a *API) DeleteOverrides(w http.ResponseWriter, r *http.Request) { userID, _, err := tenant.ExtractTenantIDFromHTTPRequest(r) if err != nil { - http.Error(w, err.Error(), StatusUnauthorized) + http.Error(w, err.Error(), http.StatusUnauthorized) return } if err := a.deleteOverridesFromBucket(r.Context(), userID); err != nil { - http.Error(w, err.Error(), StatusInternalServerError) + level.Error(a.logger).Log("msg", "failed to delete overrides from bucket", "userID", userID, "err", err) + http.Error(w, "Internal server error", http.StatusInternalServerError) return } - w.WriteHeader(StatusOK) + w.WriteHeader(http.StatusOK) } // getOverridesFromBucket reads overrides for a specific tenant from the runtime config file func (a *API) getOverridesFromBucket(ctx context.Context, userID string) (map[string]interface{}, error) { reader, err := a.bucketClient.Get(ctx, a.runtimeConfigPath) if err != nil { - return map[string]interface{}{}, nil + return nil, fmt.Errorf("failed to get runtime config: %w", err) } defer reader.Close() @@ -155,8 +161,11 @@ func (a *API) getOverridesFromBucket(ctx context.Context, userID string) (map[st return result, nil } + // User does not exist in config - return error + return nil, fmt.Errorf(ErrUserNotFound) } + // No tenant limits configured - return empty map (no overrides) return map[string]interface{}{}, nil } diff --git a/pkg/overrides/limits.go b/pkg/overrides/limits.go index 211051a73f7..499109d5104 100644 --- a/pkg/overrides/limits.go +++ b/pkg/overrides/limits.go @@ -3,9 +3,11 @@ package overrides import ( "context" "fmt" + "slices" "strconv" "strings" + "github.com/go-kit/log/level" "gopkg.in/yaml.v3" "github.com/cortexproject/cortex/pkg/util/runtimeconfig" @@ -23,7 +25,7 @@ func ValidateOverrides(overrides map[string]interface{}, allowedLimits []string) var invalidLimits []string for limitName := range overrides { - if !IsLimitAllowed(limitName, allowedLimits) { + if 
!slices.Contains(allowedLimits, limitName) { invalidLimits = append(invalidLimits, limitName) } } @@ -35,39 +37,23 @@ func ValidateOverrides(overrides map[string]interface{}, allowedLimits []string) return nil } -// GetAllowedLimits returns the allowed limits from runtime config -// If no allowed limits are configured, returns empty slice (no limits allowed) -func GetAllowedLimits(allowedLimits []string) []string { - return allowedLimits -} - -// IsLimitAllowed checks if a specific limit can be modified -func IsLimitAllowed(limitName string, allowedLimits []string) bool { - for _, allowed := range allowedLimits { - if allowed == limitName { - return true - } - } - return false -} - // validateHardLimits checks if the provided overrides exceed any hard limits from the runtime config func (a *API) validateHardLimits(overrides map[string]interface{}, userID string) error { // Read the runtime config to get hard limits reader, err := a.bucketClient.Get(context.Background(), a.runtimeConfigPath) if err != nil { - // If we can't read the config, skip hard limit validation - return nil + level.Error(a.logger).Log("msg", "failed to read hard limits configuration", "userID", userID, "err", err) + return fmt.Errorf("failed to validate hard limits") } defer reader.Close() var config runtimeconfig.RuntimeConfigValues if err := yaml.NewDecoder(reader).Decode(&config); err != nil { - // If we can't decode the config, skip hard limit validation - return nil + level.Error(a.logger).Log("msg", "failed to decode hard limits configuration", "userID", userID, "err", err) + return fmt.Errorf("failed to validate hard limits") } - // If no hard overrides are defined, skip validation + // If no hard overrides are defined, allow the request if config.HardTenantLimits == nil { return nil } @@ -80,12 +66,14 @@ func (a *API) validateHardLimits(overrides map[string]interface{}, userID string yamlData, err := yaml.Marshal(userHardLimits) if err != nil { - return nil // Skip validation if we 
can't marshal + level.Error(a.logger).Log("msg", "failed to marshal hard limits", "userID", userID, "err", err) + return fmt.Errorf("failed to validate hard limits") } var hardLimitsMap map[string]interface{} if err := yaml.Unmarshal(yamlData, &hardLimitsMap); err != nil { - return nil // Skip validation if we can't unmarshal + level.Error(a.logger).Log("msg", "failed to unmarshal hard limits", "userID", userID, "err", err) + return fmt.Errorf("failed to validate hard limits") } // Validate each override against the user's hard limits diff --git a/pkg/overrides/overrides_test.go b/pkg/overrides/overrides_test.go index 23dc1e3d952..a6d5d534935 100644 --- a/pkg/overrides/overrides_test.go +++ b/pkg/overrides/overrides_test.go @@ -209,6 +209,20 @@ func TestAPIEndpoints(t *testing.T) { assert.Equal(t, float64(100000), response["max_global_series_per_user"]) }, }, + { + name: "GET overrides - valid tenant ID, user does not exist", + method: "GET", + path: "/api/v1/user-overrides", + tenantID: "nonexistent_user", + expectedStatus: http.StatusBadRequest, + setupMock: func(mock *bucket.ClientMock) { + // Mock runtime config with different user + overridesData := `overrides: + other_user: + ingestion_rate: 5000` + mock.MockGet("runtime.yaml", overridesData, nil) + }, + }, { name: "POST overrides - no tenant ID", method: "POST", @@ -491,13 +505,13 @@ func TestAPIBucketErrors(t *testing.T) { expectedStatus int }{ { - name: "GET overrides - bucket error treated as not found", + name: "GET overrides - bucket error returns internal server error", method: "GET", tenantID: "user123", setupMock: func(mock *bucket.ClientMock) { mock.MockGet("runtime.yaml", "", fmt.Errorf("bucket error")) }, - expectedStatus: http.StatusOK, // Current implementation treats errors as "not found" + expectedStatus: http.StatusInternalServerError, }, { name: "POST overrides - bucket upload error", From 8e50e9a00c17c335483f63eb82ab4b43b641f990 Mon Sep 17 00:00:00 2001 From: Bogdan Stancu Date: Sat, 20 Sep 
2025 23:14:43 +0300 Subject: [PATCH 09/11] lint Signed-off-by: Bogdan Stancu --- pkg/overrides/api.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/overrides/api.go b/pkg/overrides/api.go index 205ccb13801..dadd7c12f4d 100644 --- a/pkg/overrides/api.go +++ b/pkg/overrides/api.go @@ -4,6 +4,7 @@ import ( "bytes" "context" "encoding/json" + "errors" "fmt" "net/http" @@ -162,7 +163,7 @@ func (a *API) getOverridesFromBucket(ctx context.Context, userID string) (map[st return result, nil } // User does not exist in config - return error - return nil, fmt.Errorf(ErrUserNotFound) + return nil, errors.New(ErrUserNotFound) } // No tenant limits configured - return empty map (no overrides) From 06195e0e12ae9ba7d91f22ae557cc99cf1d464aa Mon Sep 17 00:00:00 2001 From: Bogdan Stancu Date: Sat, 20 Sep 2025 23:35:09 +0300 Subject: [PATCH 10/11] lint2 Signed-off-by: Bogdan Stancu --- pkg/overrides/api.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/overrides/api.go b/pkg/overrides/api.go index dadd7c12f4d..25f6504dc47 100644 --- a/pkg/overrides/api.go +++ b/pkg/overrides/api.go @@ -18,8 +18,8 @@ import ( const ( // Error messages - ErrInvalidJSON = "Invalid JSON" - ErrUserNotFound = "User not found" + ErrInvalidJSON = "invalid JSON" + ErrUserNotFound = "user not found" // Runtime config errors ErrRuntimeConfig = "runtime config read error" @@ -54,7 +54,7 @@ func (a *API) GetOverrides(w http.ResponseWriter, r *http.Request) { overrides, err := a.getOverridesFromBucket(r.Context(), userID) if err != nil { if err.Error() == ErrUserNotFound { - http.Error(w, "User not found", http.StatusBadRequest) + http.Error(w, "user not found", http.StatusBadRequest) } else { level.Error(a.logger).Log("msg", "failed to get overrides from bucket", "userID", userID, "err", err) http.Error(w, "Internal server error", http.StatusInternalServerError) From 8ba408bc067b2ab542a8157567393761d72ce6c0 Mon Sep 17 00:00:00 2001 From: Bogdan Stancu Date: 
Sat, 20 Sep 2025 23:49:14 +0300 Subject: [PATCH 11/11] modernize? Signed-off-by: Bogdan Stancu --- pkg/overrides/api.go | 10 +++++----- pkg/overrides/limits.go | 10 +++++----- pkg/overrides/overrides_test.go | 16 ++++++++-------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/pkg/overrides/api.go b/pkg/overrides/api.go index 25f6504dc47..c7b4b6656ae 100644 --- a/pkg/overrides/api.go +++ b/pkg/overrides/api.go @@ -78,7 +78,7 @@ func (a *API) SetOverrides(w http.ResponseWriter, r *http.Request) { return } - var overrides map[string]interface{} + var overrides map[string]any if err := json.NewDecoder(r.Body).Decode(&overrides); err != nil { http.Error(w, ErrInvalidJSON, http.StatusBadRequest) return @@ -134,7 +134,7 @@ func (a *API) DeleteOverrides(w http.ResponseWriter, r *http.Request) { } // getOverridesFromBucket reads overrides for a specific tenant from the runtime config file -func (a *API) getOverridesFromBucket(ctx context.Context, userID string) (map[string]interface{}, error) { +func (a *API) getOverridesFromBucket(ctx context.Context, userID string) (map[string]any, error) { reader, err := a.bucketClient.Get(ctx, a.runtimeConfigPath) if err != nil { return nil, fmt.Errorf("failed to get runtime config: %w", err) @@ -155,7 +155,7 @@ func (a *API) getOverridesFromBucket(ctx context.Context, userID string) (map[st return nil, fmt.Errorf("failed to marshal limits: %w", err) } - var result map[string]interface{} + var result map[string]any if err := yaml.Unmarshal(yamlData, &result); err != nil { return nil, fmt.Errorf("failed to unmarshal limits: %w", err) } @@ -167,11 +167,11 @@ func (a *API) getOverridesFromBucket(ctx context.Context, userID string) (map[st } // No tenant limits configured - return empty map (no overrides) - return map[string]interface{}{}, nil + return map[string]any{}, nil } // setOverridesToBucket writes overrides for a specific tenant to the runtime config file -func (a *API) setOverridesToBucket(ctx context.Context, 
userID string, overrides map[string]interface{}) error { +func (a *API) setOverridesToBucket(ctx context.Context, userID string, overrides map[string]any) error { var config runtimeconfig.RuntimeConfigValues reader, err := a.bucketClient.Get(ctx, a.runtimeConfigPath) if err == nil { diff --git a/pkg/overrides/limits.go b/pkg/overrides/limits.go index 499109d5104..84d3b41c8e5 100644 --- a/pkg/overrides/limits.go +++ b/pkg/overrides/limits.go @@ -21,7 +21,7 @@ const ( // No default allowed limits - these must be configured via runtime config // ValidateOverrides checks if the provided overrides only contain allowed limits -func ValidateOverrides(overrides map[string]interface{}, allowedLimits []string) error { +func ValidateOverrides(overrides map[string]any, allowedLimits []string) error { var invalidLimits []string for limitName := range overrides { @@ -38,7 +38,7 @@ func ValidateOverrides(overrides map[string]interface{}, allowedLimits []string) } // validateHardLimits checks if the provided overrides exceed any hard limits from the runtime config -func (a *API) validateHardLimits(overrides map[string]interface{}, userID string) error { +func (a *API) validateHardLimits(overrides map[string]any, userID string) error { // Read the runtime config to get hard limits reader, err := a.bucketClient.Get(context.Background(), a.runtimeConfigPath) if err != nil { @@ -70,7 +70,7 @@ func (a *API) validateHardLimits(overrides map[string]interface{}, userID string return fmt.Errorf("failed to validate hard limits") } - var hardLimitsMap map[string]interface{} + var hardLimitsMap map[string]any if err := yaml.Unmarshal(yamlData, &hardLimitsMap); err != nil { level.Error(a.logger).Log("msg", "failed to unmarshal hard limits", "userID", userID, "err", err) return fmt.Errorf("failed to validate hard limits") @@ -89,7 +89,7 @@ func (a *API) validateHardLimits(overrides map[string]interface{}, userID string } // validateSingleHardLimit validates a single limit against its hard limit 
-func (a *API) validateSingleHardLimit(limitName string, value, hardLimit interface{}) error { +func (a *API) validateSingleHardLimit(limitName string, value, hardLimit any) error { // Convert both values to float64 for comparison valueFloat, err := convertToFloat64(value) if err != nil { @@ -109,7 +109,7 @@ func (a *API) validateSingleHardLimit(limitName string, value, hardLimit interfa } // convertToFloat64 converts any value to float64 -func convertToFloat64(v interface{}) (float64, error) { +func convertToFloat64(v any) (float64, error) { switch val := v.(type) { case float64: return val, nil diff --git a/pkg/overrides/overrides_test.go b/pkg/overrides/overrides_test.go index a6d5d534935..223fda1eca6 100644 --- a/pkg/overrides/overrides_test.go +++ b/pkg/overrides/overrides_test.go @@ -159,7 +159,7 @@ func TestAPIEndpoints(t *testing.T) { method string path string tenantID string - requestBody interface{} + requestBody any expectedStatus int setupMock func(*bucket.ClientMock) validateResponse func(*testing.T, *httptest.ResponseRecorder) @@ -182,7 +182,7 @@ func TestAPIEndpoints(t *testing.T) { mock.MockGet("runtime.yaml", "overrides:\n", nil) }, validateResponse: func(t *testing.T, recorder *httptest.ResponseRecorder) { - var response map[string]interface{} + var response map[string]any err := json.Unmarshal(recorder.Body.Bytes(), &response) require.NoError(t, err) assert.Empty(t, response) @@ -202,7 +202,7 @@ func TestAPIEndpoints(t *testing.T) { mock.MockGet("runtime.yaml", overridesData, nil) }, validateResponse: func(t *testing.T, recorder *httptest.ResponseRecorder) { - var response map[string]interface{} + var response map[string]any err := json.Unmarshal(recorder.Body.Bytes(), &response) require.NoError(t, err) assert.Equal(t, float64(5000), response["ingestion_rate"]) @@ -228,7 +228,7 @@ func TestAPIEndpoints(t *testing.T) { method: "POST", path: "/api/v1/user-overrides", tenantID: "", - requestBody: map[string]interface{}{"ingestion_rate": 5000}, + 
requestBody: map[string]any{"ingestion_rate": 5000}, expectedStatus: http.StatusUnauthorized, }, { @@ -236,7 +236,7 @@ func TestAPIEndpoints(t *testing.T) { method: "POST", path: "/api/v1/user-overrides", tenantID: "user789", - requestBody: map[string]interface{}{"ingestion_rate": 5000, "ruler_max_rules_per_rule_group": 10}, + requestBody: map[string]any{"ingestion_rate": 5000, "ruler_max_rules_per_rule_group": 10}, expectedStatus: http.StatusOK, setupMock: func(mock *bucket.ClientMock) { // Mock runtime config with allowed limits @@ -262,7 +262,7 @@ api_allowed_limits: method: "POST", path: "/api/v1/user-overrides", tenantID: "user999", - requestBody: map[string]interface{}{"invalid_limit": 5000}, + requestBody: map[string]any{"invalid_limit": 5000}, expectedStatus: http.StatusBadRequest, setupMock: func(mock *bucket.ClientMock) { // Mock runtime config with allowed limits (invalid_limit not included) @@ -290,7 +290,7 @@ api_allowed_limits: method: "POST", path: "/api/v1/user-overrides", tenantID: "user999", - requestBody: map[string]interface{}{"ingestion_rate": 1500000}, // Exceeds hard limit of 1000000 + requestBody: map[string]any{"ingestion_rate": 1500000}, // Exceeds hard limit of 1000000 expectedStatus: http.StatusBadRequest, setupMock: func(mock *bucket.ClientMock) { // Mock runtime config with per-user hard limits and allowed limits @@ -577,7 +577,7 @@ api_allowed_limits: // Create the request var req *http.Request if tt.method == "POST" { - requestBody := map[string]interface{}{"ingestion_rate": 5000} + requestBody := map[string]any{"ingestion_rate": 5000} body, err := json.Marshal(requestBody) require.NoError(t, err) req = httptest.NewRequest(tt.method, "/api/v1/user-overrides", bytes.NewReader(body))