From e0350aa7c32f0e432d3aeb509a49541c28ff5b03 Mon Sep 17 00:00:00 2001 From: Thibault Normand Date: Tue, 17 Feb 2026 16:08:36 +0100 Subject: [PATCH 1/7] feat(disruption): Memory pressure. --- .gitignore | 1 + CLAUDE.md | 86 ++ api/v1beta1/disruption_types.go | 28 +- api/v1beta1/memory_pressure.go | 78 ++ api/v1beta1/memory_pressure_test.go | 100 ++ api/v1beta1/zz_generated.deepcopy.go | 20 + .../chaos.datadoghq.com_disruptioncrons.yaml | 15 + ...haos.datadoghq.com_disruptionrollouts.yaml | 15 + .../chaos.datadoghq.com_disruptions.yaml | 15 + cli/injector/main.go | 2 + cli/injector/memory_pressure.go | 48 + cli/injector/memory_stress.go | 77 ++ docs/README.md | 1 + docs/disruption_catalogue.md | 1223 +++++++++++++++++ docs/memory_disruption.md | 274 ++++ injector/memory_alloc_linux.go | 21 + injector/memory_alloc_other.go | 21 + injector/memory_pressure.go | 110 ++ injector/memory_pressure_test.go | 102 ++ injector/memory_stress.go | 250 ++++ injector/memory_stress_args_builder_mock.go | 90 ++ safemode/safemode.go | 6 + safemode/safemode_memory.go | 22 + types/types.go | 5 + 24 files changed, 2605 insertions(+), 5 deletions(-) create mode 100644 CLAUDE.md create mode 100644 api/v1beta1/memory_pressure.go create mode 100644 api/v1beta1/memory_pressure_test.go create mode 100644 cli/injector/memory_pressure.go create mode 100644 cli/injector/memory_stress.go create mode 100644 docs/disruption_catalogue.md create mode 100644 docs/memory_disruption.md create mode 100644 injector/memory_alloc_linux.go create mode 100644 injector/memory_alloc_other.go create mode 100644 injector/memory_pressure.go create mode 100644 injector/memory_pressure_test.go create mode 100644 injector/memory_stress.go create mode 100644 injector/memory_stress_args_builder_mock.go create mode 100644 safemode/safemode_memory.go diff --git a/.gitignore b/.gitignore index 46d304079..4e9c68175 100644 --- a/.gitignore +++ b/.gitignore @@ -56,3 +56,4 @@ ebpf/builds/ # DS_Store .DS_Store +.claude diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000..3b3da591e --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,86 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +Kubernetes operator for chaos engineering (Datadog). Injects systemic failures (network, CPU, disk, DNS, gRPC, container/node failure) into Kubernetes clusters at scale. Built with Kubebuilder v3 and controller-runtime. + +## Build Commands + +```bash +make docker-build-all # Build all Docker images (manager, injector, handler) +make manifests # Generate CRDs and RBAC manifests +make generate # Generate Go code (deepcopy, etc.) +make generate-mocks # Regenerate mocks (mockery v2.53.5) +make chaosli # Build CLI helper tool +make godeps # go mod tidy + vendor +make header # Check/fix license headers +``` + +## Testing + +```bash +make test # Run all unit tests (Ginkgo v2) +make test TEST_ARGS="injector" # Filter tests by package name +make test TEST_ARGS="--until-it-fails" # Detect flaky tests +make test GINKGO_PROCS=4 # Control parallelism +make e2e-test # End-to-end tests (requires cluster) +``` + +Tests use **Ginkgo v2** (BDD) with **Gomega** matchers. Coverage output: `cover.profile`. + +## Linting and Formatting + +```bash +make lint # golangci-lint (v2.8.0) +make fmt # Format Go code +make vet # Go vet +make spellcheck # Spell check markdown docs +``` + +## Local Development + +```bash +make lima-all # Start local k3s cluster with controller +make lima-redeploy # Rebuild and redeploy to local cluster +make lima-stop # Stop local cluster +make watch # Auto-rebuild on file changes +make debug # Prepare for IDE debugging +make run # Run controller locally +``` + +## Architecture + +Three main components, each with its own Dockerfile in `bin/`: + +- **Manager** (`main.go`, `controllers/`): Long-running controller pod. Watches Disruption CRDs, selects targets via label selectors, creates chaos pods, manages lifecycle with finalizers. Reconciliation flow: add finalizer → compute spec hash → select targets → create chaos pods → track injection status. +- **Injector** (`injector/`, `cli/injector/`): Runs as ephemeral chaos pods on target nodes. Performs actual disruption using Linux primitives (cgroups, tc, iptables, eBPF). One chaos pod per target per disruption kind. +- **Handler** (`webhook/`, `cli/handler/`): Admission webhook for pod initialization-time network disruptions. + +### CRDs (api/v1beta1/) + +- **Disruption**: Main resource defining what failure to inject and targeting criteria +- **DisruptionCron**: Scheduled/recurring disruptions +- **DisruptionRollout**: Progressive disruption rollout + +### Key Packages + +- `targetselector/` — Target selection logic (labels, count, filters, safety nets) +- `safemode/` — Safety mechanisms to prevent dangerous disruptions +- `eventnotifier/` — Notifications (Slack, Datadog, HTTP) +- `o11y/` — Observability (metrics, tracing, profiling for Datadog and Prometheus) +- `cloudservice/` — Cloud provider integrations +- `ebpf/` — eBPF programs for network disruption +- `grpc/disruptionlistener/` — gRPC service for disruption events +- `chart/` — Helm chart for deployment + +### Code Generation + +CRDs are defined in `api/v1beta1/` with kubebuilder markers. After modifying types, run `make manifests generate`. Mocks are generated with mockery into `mocks/`. Protobuf definitions live in `grpc/` and `dogfood/`. + +## Requirements + +- Kubernetes >= 1.16 (not 1.20.0-1.20.4) +- Go 1.25.6 +- Docker with buildx (multi-arch: amd64, arm64) diff --git a/api/v1beta1/disruption_types.go b/api/v1beta1/disruption_types.go index 248f53c24..ac7a6d342 100644 --- a/api/v1beta1/disruption_types.go +++ b/api/v1beta1/disruption_types.go @@ -76,6 +76,8 @@ type DisruptionSpec struct { // +nullable CPUPressure *CPUPressureSpec `json:"cpuPressure,omitempty"` // +nullable + MemoryPressure *MemoryPressureSpec `json:"memoryPressure,omitempty"` + // +nullable DiskPressure *DiskPressureSpec `json:"diskPressure,omitempty"` // +nullable DiskFailure *DiskFailureSpec `json:"diskFailure,omitempty"` @@ -694,25 +696,25 @@ func (s DisruptionSpec) validateGlobalDisruptionScope(requireSelectors bool) (re } // Rule: At least one disruption kind must be applied - if s.CPUPressure == nil && s.DiskPressure == nil && s.DiskFailure == nil && s.Network == nil && s.GRPC == nil && s.DNS == nil && s.ContainerFailure == nil && s.NodeFailure == nil && s.PodReplacement == nil { + if s.CPUPressure == nil && s.MemoryPressure == nil && s.DiskPressure == nil && s.DiskFailure == nil && s.Network == nil && s.GRPC == nil && s.DNS == nil && s.ContainerFailure == nil && s.NodeFailure == nil && s.PodReplacement == nil { retErr = multierror.Append(retErr, errors.New("at least one disruption kind must be specified, please read the docs to see your options")) } // Rule: ContainerFailure, NodeFailure, and PodReplacement disruptions are not compatible with other failure types if s.ContainerFailure != nil { - if s.CPUPressure != nil || s.DiskPressure != nil || s.DiskFailure != nil || s.Network != nil || s.GRPC != nil || s.DNS != nil || s.NodeFailure != nil || s.PodReplacement != nil { + if s.CPUPressure != nil || s.MemoryPressure != nil || s.DiskPressure != nil || s.DiskFailure != nil || s.Network != nil || s.GRPC != nil || s.DNS != nil || s.NodeFailure != nil || s.PodReplacement != nil { retErr = multierror.Append(retErr, errors.New("container failure disruptions are not compatible with other disruption kinds. The container failure will remove the impact of the other disruption types")) } } if s.NodeFailure != nil { - if s.CPUPressure != nil || s.DiskPressure != nil || s.DiskFailure != nil || s.Network != nil || s.GRPC != nil || s.DNS != nil || s.ContainerFailure != nil || s.PodReplacement != nil { + if s.CPUPressure != nil || s.MemoryPressure != nil || s.DiskPressure != nil || s.DiskFailure != nil || s.Network != nil || s.GRPC != nil || s.DNS != nil || s.ContainerFailure != nil || s.PodReplacement != nil { retErr = multierror.Append(retErr, errors.New("node failure disruptions are not compatible with other disruption kinds. The node failure will remove the impact of the other disruption types")) } } if s.PodReplacement != nil { - if s.CPUPressure != nil || s.DiskPressure != nil || s.DiskFailure != nil || s.Network != nil || s.GRPC != nil || s.DNS != nil || s.ContainerFailure != nil || s.NodeFailure != nil { + if s.CPUPressure != nil || s.MemoryPressure != nil || s.DiskPressure != nil || s.DiskFailure != nil || s.Network != nil || s.GRPC != nil || s.DNS != nil || s.ContainerFailure != nil || s.NodeFailure != nil { retErr = multierror.Append(retErr, errors.New("pod replacement disruptions are not compatible with other disruption kinds. The pod replacement will remove the impact of the other disruption types")) } // Rule: container failure not possible if disruption is node-level @@ -724,6 +726,7 @@ func (s DisruptionSpec) validateGlobalDisruptionScope(requireSelectors bool) (re // Rule: on init compatibility if s.OnInit { if s.CPUPressure != nil || + s.MemoryPressure != nil || s.NodeFailure != nil || s.PodReplacement != nil || s.ContainerFailure != nil || @@ -747,6 +750,11 @@ func (s DisruptionSpec) validateGlobalDisruptionScope(requireSelectors bool) (re retErr = multierror.Append(retErr, errors.New("disk pressure disruptions apply to all containers, specifying certain containers does not isolate the disruption")) } + // Rule: No specificity of containers on a memory disruption + if len(s.Containers) != 0 && s.MemoryPressure != nil { + retErr = multierror.Append(retErr, errors.New("memory pressure disruptions apply to all containers, specifying certain containers does not isolate the disruption")) + } + // Rule: DisruptionTrigger if s.Triggers != nil && !s.Triggers.IsZero() { if !s.Triggers.Inject.IsZero() && !s.Triggers.CreatePods.IsZero() { @@ -772,7 +780,7 @@ func (s DisruptionSpec) validateGlobalDisruptionScope(requireSelectors bool) (re if s.Pulse != nil { if s.Pulse.ActiveDuration.Duration() > 0 || s.Pulse.DormantDuration.Duration() > 0 { if s.NodeFailure != nil || s.PodReplacement != nil || s.ContainerFailure != nil { - retErr = multierror.Append(retErr, errors.New("pulse is only compatible with network, cpu pressure, disk pressure, dns, and grpc disruptions")) + retErr = multierror.Append(retErr, errors.New("pulse is only compatible with network, cpu pressure, memory pressure, disk pressure, dns, and grpc disruptions")) } } @@ -824,6 +832,8 @@ func (s DisruptionSpec) DisruptionKindPicker(kind chaostypes.DisruptionKindName) disruptionKind = s.Network case chaostypes.DisruptionKindCPUPressure: disruptionKind = s.CPUPressure + case chaostypes.DisruptionKindMemoryPressure: + disruptionKind = s.MemoryPressure case chaostypes.DisruptionKindDiskPressure: disruptionKind = s.DiskPressure case chaostypes.DisruptionKindGRPCDisruption: @@ -888,6 +898,10 @@ func (s DisruptionSpec) DisruptionCount() int { count++ } + if s.MemoryPressure != nil { + count++ + } + if s.ContainerFailure != nil { count++ } @@ -1060,6 +1074,10 @@ func (s DisruptionSpec) Explain() []string { explanation = append(explanation, s.CPUPressure.Explain()...) } + if s.MemoryPressure != nil { + explanation = append(explanation, s.MemoryPressure.Explain()...) + } + if s.DiskPressure != nil { explanation = append(explanation, s.DiskPressure.Explain()...) } diff --git a/api/v1beta1/memory_pressure.go b/api/v1beta1/memory_pressure.go new file mode 100644 index 000000000..6cc6f7779 --- /dev/null +++ b/api/v1beta1/memory_pressure.go @@ -0,0 +1,78 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026 Datadog, Inc. + +package v1beta1 + +import ( + "fmt" + "strconv" + "strings" + + "github.com/hashicorp/go-multierror" +) + +// MemoryPressureSpec represents a memory pressure disruption +type MemoryPressureSpec struct { + // Target memory utilization as a percentage (e.g., "76%") + // +kubebuilder:validation:Required + TargetPercent string `json:"targetPercent" chaos_validate:"required"` + // Duration over which memory is gradually consumed (e.g., "10m") + // If empty, memory is consumed immediately + RampDuration DisruptionDuration `json:"rampDuration,omitempty"` +} + +// Validate validates args for the given disruption +func (s *MemoryPressureSpec) Validate() (retErr error) { + // Rule: targetPercent must be a valid percentage between 1 and 100 + pct, err := ParseTargetPercent(s.TargetPercent) + if err != nil { + retErr = multierror.Append(retErr, fmt.Errorf("invalid targetPercent %q: %w", s.TargetPercent, err)) + } else if pct < 1 || pct > 100 { + retErr = multierror.Append(retErr, fmt.Errorf("targetPercent must be between 1 and 100, got %d", pct)) + } + + // Rule: rampDuration must be non-negative + if s.RampDuration.Duration() < 0 { + retErr = multierror.Append(retErr, fmt.Errorf("rampDuration must be non-negative, got %s", s.RampDuration)) + } + + return retErr +} + +// GenerateArgs generates injection or cleanup pod arguments for the given spec +func (s *MemoryPressureSpec) GenerateArgs() []string { + args := []string{ + "memory-pressure", + "--target-percent", s.TargetPercent, + } + + if s.RampDuration.Duration() > 0 { + args = append(args, "--ramp-duration", s.RampDuration.Duration().String()) + } + + return args +} + +func (s *MemoryPressureSpec) Explain() []string { + pct, _ := ParseTargetPercent(s.TargetPercent) + + explanation := fmt.Sprintf("spec.memoryPressure will cause memory pressure on the target, by joining its cgroup and allocating memory to reach %d%% of the target's memory limit", pct) + + if s.RampDuration.Duration() > 0 { + explanation += fmt.Sprintf(", ramping up over %s.", s.RampDuration.Duration()) + } else { + explanation += " immediately." + } + + return []string{"", explanation} +} + +// ParseTargetPercent parses a percentage string like "76%" or "76" and returns the integer value +func ParseTargetPercent(s string) (int, error) { + s = strings.TrimSpace(s) + s = strings.TrimSuffix(s, "%") + + return strconv.Atoi(s) +} diff --git a/api/v1beta1/memory_pressure_test.go b/api/v1beta1/memory_pressure_test.go new file mode 100644 index 000000000..8124e7ab1 --- /dev/null +++ b/api/v1beta1/memory_pressure_test.go @@ -0,0 +1,100 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026 Datadog, Inc. + +package v1beta1_test + +import ( + . "github.com/DataDog/chaos-controller/api/v1beta1" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("MemoryPressureSpec", func() { + Describe("Validate", func() { + It("succeeds with valid percentage", func() { + spec := &MemoryPressureSpec{ + TargetPercent: "76%", + } + Expect(spec.Validate()).To(Succeed()) + }) + + It("succeeds with percentage without suffix", func() { + spec := &MemoryPressureSpec{ + TargetPercent: "50", + } + Expect(spec.Validate()).To(Succeed()) + }) + + It("succeeds with ramp duration", func() { + spec := &MemoryPressureSpec{ + TargetPercent: "76%", + RampDuration: DisruptionDuration("10m"), + } + Expect(spec.Validate()).To(Succeed()) + }) + + It("fails with zero percent", func() { + spec := &MemoryPressureSpec{ + TargetPercent: "0%", + } + Expect(spec.Validate()).To(HaveOccurred()) + }) + + It("fails with percent over 100", func() { + spec := &MemoryPressureSpec{ + TargetPercent: "101%", + } + Expect(spec.Validate()).To(HaveOccurred()) + }) + + It("fails with invalid percent string", func() { + spec := &MemoryPressureSpec{ + TargetPercent: "abc", + } + Expect(spec.Validate()).To(HaveOccurred()) + }) + }) + + Describe("GenerateArgs", func() { + It("generates args without ramp duration", func() { + spec := &MemoryPressureSpec{ + TargetPercent: "76%", + } + args := spec.GenerateArgs() + Expect(args).To(Equal([]string{"memory-pressure", "--target-percent", "76%"})) + }) + + It("generates args with ramp duration", func() { + spec := &MemoryPressureSpec{ + TargetPercent: "50%", + RampDuration: DisruptionDuration("10m0s"), + } + args := spec.GenerateArgs() + Expect(args).To(Equal([]string{"memory-pressure", "--target-percent", "50%", "--ramp-duration", "10m0s"})) + }) + }) + + Describe("Explain", func() { + It("explains without ramp", func() { + spec := &MemoryPressureSpec{ + TargetPercent: "76%", + } + explanation := spec.Explain() + Expect(explanation).To(HaveLen(2)) + Expect(explanation[1]).To(ContainSubstring("76%")) + Expect(explanation[1]).To(ContainSubstring("immediately")) + }) + + It("explains with ramp", func() { + spec := &MemoryPressureSpec{ + TargetPercent: "50%", + RampDuration: DisruptionDuration("10m0s"), + } + explanation := spec.Explain() + Expect(explanation).To(HaveLen(2)) + Expect(explanation[1]).To(ContainSubstring("ramping up")) + }) + }) +}) diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index 79caab059..c9271bd20 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -636,6 +636,11 @@ func (in *DisruptionSpec) DeepCopyInto(out *DisruptionSpec) { *out = new(CPUPressureSpec) (*in).DeepCopyInto(*out) } + if in.MemoryPressure != nil { + in, out := &in.MemoryPressure, &out.MemoryPressure + *out = new(MemoryPressureSpec) + **out = **in + } if in.DiskPressure != nil { in, out := &in.DiskPressure, &out.DiskPressure *out = new(DiskPressureSpec) @@ -836,6 +841,21 @@ func (in HTTPPaths) DeepCopy() HTTPPaths { return *out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MemoryPressureSpec) DeepCopyInto(out *MemoryPressureSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MemoryPressureSpec. +func (in *MemoryPressureSpec) DeepCopy() *MemoryPressureSpec { + if in == nil { + return nil + } + out := new(MemoryPressureSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *NetworkDisruptionCloudServiceSpec) DeepCopyInto(out *NetworkDisruptionCloudServiceSpec) { *out = *in diff --git a/chart/templates/generated/chaos.datadoghq.com_disruptioncrons.yaml b/chart/templates/generated/chaos.datadoghq.com_disruptioncrons.yaml index 0dad2365b..14af5850d 100644 --- a/chart/templates/generated/chaos.datadoghq.com_disruptioncrons.yaml +++ b/chart/templates/generated/chaos.datadoghq.com_disruptioncrons.yaml @@ -309,6 +309,21 @@ spec: After this many runs, the disruption will become idle (default: unlimited for continuous disruptions) minimum: 1 type: integer + memoryPressure: + description: MemoryPressureSpec represents a memory pressure disruption + nullable: true + properties: + rampDuration: + description: |- + Duration over which memory is gradually consumed (e.g., "10m") + If empty, memory is consumed immediately + type: string + targetPercent: + description: Target memory utilization as a percentage (e.g., "76%") + type: string + required: + - targetPercent + type: object network: description: NetworkDisruptionSpec represents a network disruption injection nullable: true diff --git a/chart/templates/generated/chaos.datadoghq.com_disruptionrollouts.yaml b/chart/templates/generated/chaos.datadoghq.com_disruptionrollouts.yaml index c8bd85305..e9f636d1f 100644 --- a/chart/templates/generated/chaos.datadoghq.com_disruptionrollouts.yaml +++ b/chart/templates/generated/chaos.datadoghq.com_disruptionrollouts.yaml @@ -310,6 +310,21 @@ spec: After this many runs, the disruption will become idle (default: unlimited for continuous disruptions) minimum: 1 type: integer + memoryPressure: + description: MemoryPressureSpec represents a memory pressure disruption + nullable: true + properties: + rampDuration: + description: |- + Duration over which memory is gradually consumed (e.g., "10m") + If empty, memory is consumed immediately + type: string + targetPercent: + description: Target memory utilization as a percentage (e.g., "76%") + type: string + required: + - targetPercent + type: object network: description: NetworkDisruptionSpec represents a network disruption injection nullable: true diff --git a/chart/templates/generated/chaos.datadoghq.com_disruptions.yaml b/chart/templates/generated/chaos.datadoghq.com_disruptions.yaml index 2c6bf14f5..fa6b4be1e 100644 --- a/chart/templates/generated/chaos.datadoghq.com_disruptions.yaml +++ b/chart/templates/generated/chaos.datadoghq.com_disruptions.yaml @@ -300,6 +300,21 @@ spec: After this many runs, the disruption will become idle (default: unlimited for continuous disruptions) minimum: 1 type: integer + memoryPressure: + description: MemoryPressureSpec represents a memory pressure disruption + nullable: true + properties: + rampDuration: + description: |- + Duration over which memory is gradually consumed (e.g., "10m") + If empty, memory is consumed immediately + type: string + targetPercent: + description: Target memory utilization as a percentage (e.g., "76%") + type: string + required: + - targetPercent + type: object network: description: NetworkDisruptionSpec represents a network disruption injection nullable: true diff --git a/cli/injector/main.go b/cli/injector/main.go index b2177c585..69d91f603 100644 --- a/cli/injector/main.go +++ b/cli/injector/main.go @@ -87,6 +87,8 @@ func init() { rootCmd.AddCommand(containerFailureCmd) rootCmd.AddCommand(cpuPressureCmd) rootCmd.AddCommand(cpuPressureStressCmd) + rootCmd.AddCommand(memoryPressureCmd) + rootCmd.AddCommand(memoryPressureStressCmd) rootCmd.AddCommand(diskFailureCmd) rootCmd.AddCommand(diskPressureCmd) rootCmd.AddCommand(grpcDisruptionCmd) diff --git a/cli/injector/memory_pressure.go b/cli/injector/memory_pressure.go new file mode 100644 index 000000000..25051ae9d --- /dev/null +++ b/cli/injector/memory_pressure.go @@ -0,0 +1,48 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026 Datadog, Inc. + +package main + +import ( + "time" + + "github.com/DataDog/chaos-controller/command" + "github.com/DataDog/chaos-controller/injector" + "github.com/DataDog/chaos-controller/process" + "github.com/spf13/cobra" +) + +var memoryPressureCmd = &cobra.Command{ + Use: "memory-pressure", + Short: "Memory pressure subcommands", + Run: injectAndWait, + PreRun: func(cmd *cobra.Command, args []string) { + targetPercent, _ := cmd.Flags().GetString("target-percent") + rampDuration, _ := cmd.Flags().GetDuration("ramp-duration") + + cmdFactory := command.NewFactory(disruptionArgs.DryRun) + processManager := process.NewManager(disruptionArgs.DryRun) + injectorCmdFactory := injector.NewInjectorCmdFactory(log, processManager, cmdFactory) + memoryStressArgsBuilder := memoryStressArgsBuilder{} + + for _, config := range configs { + injectors = append( + injectors, + injector.NewMemoryPressureInjector( + config, + targetPercent, + rampDuration, + injectorCmdFactory, + memoryStressArgsBuilder, + ), + ) + } + }, +} + +func init() { + memoryPressureCmd.Flags().String("target-percent", "", "target memory utilization percentage (e.g., '76%')") + memoryPressureCmd.Flags().Duration("ramp-duration", time.Duration(0), "duration to ramp up memory usage") +} diff --git a/cli/injector/memory_stress.go b/cli/injector/memory_stress.go new file mode 100644 index 000000000..7bcb1118d --- /dev/null +++ b/cli/injector/memory_stress.go @@ -0,0 +1,77 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026 Datadog, Inc. + +package main + +import ( + "fmt" + "time" + + "github.com/spf13/cobra" + + "github.com/DataDog/chaos-controller/injector" + "github.com/DataDog/chaos-controller/process" +) + +const ( + memoryTargetPercentFlagName = "target-percent" + memoryRampDurationFlagName = "ramp-duration" + memoryStressCommandName = "memory-stress" +) + +var memoryPressureStressCmd = &cobra.Command{ + Use: memoryStressCommandName, + Short: "Memory stress subcommands", + Run: injectAndWait, + PreRunE: func(cmd *cobra.Command, args []string) error { + if len(configs) != 1 { + return fmt.Errorf("%s expects a single target configuration, found %d", memoryStressCommandName, len(configs)) + } + + config := configs[0] + + targetPercent, _ := cmd.Flags().GetInt(memoryTargetPercentFlagName) + rampDuration, _ := cmd.Flags().GetDuration(memoryRampDurationFlagName) + + log.Infow("stressing memory allocated to target", + "targetPercent", targetPercent, + "rampDuration", rampDuration, + "targetName", config.TargetName(), + ) + + processManager := process.NewManager(config.Disruption.DryRun) + + injectors = append( + injectors, + injector.NewMemoryStressInjector( + config, + targetPercent, + rampDuration, + processManager, + )) + + return nil + }, +} + +func init() { + memoryPressureStressCmd.Flags().Int(memoryTargetPercentFlagName, 50, "target memory utilization percentage") + memoryPressureStressCmd.Flags().Duration(memoryRampDurationFlagName, time.Duration(0), "duration to ramp up memory usage") +} + +type memoryStressArgsBuilder struct{} + +func (m memoryStressArgsBuilder) GenerateArgs(targetPercent int, rampDuration time.Duration) []string { + args := []string{ + memoryStressCommandName, + fmt.Sprintf("--%s=%d", memoryTargetPercentFlagName, targetPercent), + } + + if rampDuration > 0 { + args = append(args, fmt.Sprintf("--%s=%s", memoryRampDurationFlagName, rampDuration.String())) + } + + return args +} diff --git a/docs/README.md b/docs/README.md index 1ca4f4857..157cbe7ac 100644 --- a/docs/README.md +++ b/docs/README.md @@ -20,6 +20,7 @@ This folder contains the detailed documentation of most parts of the project. - [Container Failure](container_disruption.md) - [Node Failure](node_disruption.md) - [CPU Pressure](cpu_pressure.md) + - [Memory Pressure](memory_disruption.md) - [Disk Failure](disk_failure.md) - [Disk Pressure](disk_pressure.md) - [DNS Disruption](dns_disruption.md) diff --git a/docs/disruption_catalogue.md b/docs/disruption_catalogue.md new file mode 100644 index 000000000..547fed4c8 --- /dev/null +++ b/docs/disruption_catalogue.md @@ -0,0 +1,1223 @@ +# Chaos Controller — Disruption Catalogue + +> Reference documentation for all fault injection capabilities provided by [Datadog Chaos Controller](https://github.com/DataDog/chaos-controller). +> This catalogue is designed for experiment planning and can be consumed by humans and AI agents alike. + +--- + +## Quick Reference + +| Disruption | What It Does | Level | Reversible | Can Combine | +| ----------------------------------------- | -------------------------------------------------- | --------- | ---------- | -------------- | +| [Network](#1-network-disruption) | Packet loss, delay, corruption, bandwidth limiting | Pod, Node | Yes | Yes | +| [DNS](#2-dns-disruption) | Fake DNS responses, NXDOMAIN, drops, SERVFAIL | Pod | Yes | Yes | +| [gRPC](#3-grpc-disruption) | Return gRPC errors or override responses | Pod | Yes | Yes | +| [CPU Pressure](#4-cpu-pressure) | Consume CPU cycles in target cgroup | Pod, Node | Yes | Yes | +| [Memory Pressure](#5-memory-pressure) | Gradually consume memory in target cgroup | Pod, Node | Yes | Yes | +| [Disk Pressure](#6-disk-pressure) | Throttle read/write I/O throughput | Pod, Node | Yes | Yes | +| [Disk Failure](#7-disk-failure) | Fail file open syscalls via eBPF | Pod, Node | Yes* | Yes | +| [Container Failure](#8-container-failure) | Kill container processes (SIGTERM/SIGKILL) | Pod | No | No (exclusive) | +| [Node Failure](#9-node-failure) | Kernel panic or power-off a node | Node | No | No (exclusive) | +| [Pod Replacement](#10-pod-replacement) | Cordon node, delete pod and optionally PVCs | Pod | No | No (exclusive) | + +\* Disk Failure injection is removed when the injector process exits. + +**Combination rule:** Network, DNS, gRPC, CPU Pressure, Memory Pressure, Disk Pressure, and Disk Failure can all be applied together in a single Disruption resource. Container Failure, Node Failure, and Pod Replacement are mutually exclusive with every other disruption type. + +--- + +## Common Structure + +Every disruption is a Kubernetes custom resource of kind `Disruption` (API group `chaos.datadoghq.com/v1beta1`). The common envelope looks like this: + +```yaml +apiVersion: chaos.datadoghq.com/v1beta1 +kind: Disruption +metadata: + name: + namespace: +spec: + level: pod # "pod" or "node" + selector: # label key-value pairs to match targets +