From b3c82a90082cd46e696b84f7031115d3cc0d5a34 Mon Sep 17 00:00:00 2001
From: Cyclinder Kuo <kuocyclinder@gmail.com>
Date: Wed, 12 Feb 2025 22:33:26 +0800
Subject: [PATCH] kep-4622: promote topologyMnagaer policy:
 max-allowable-numa-nodes to GA

Signed-off-by: Cyclinder Kuo <kuocyclinder@gmail.com>
---
 keps/prod-readiness/sig-node/4622.yaml        |   2 +
 .../README.md                                 | 233 ++++++++++++------
 .../kep.yaml                                  |  23 +-
 3 files changed, 168 insertions(+), 90 deletions(-)

diff --git a/keps/prod-readiness/sig-node/4622.yaml b/keps/prod-readiness/sig-node/4622.yaml
index 7fbe865efff..e3c839966dd 100644
--- a/keps/prod-readiness/sig-node/4622.yaml
+++ b/keps/prod-readiness/sig-node/4622.yaml
@@ -1,3 +1,5 @@
 kep-number: 4622
 beta:
   approver: "@jpbetz"
+stable:
+  approver: "@jpbetz"
diff --git a/keps/sig-node/4622-topologymanager-max-allowable-numa-nodes/README.md b/keps/sig-node/4622-topologymanager-max-allowable-numa-nodes/README.md
index 013fe9364f7..904eb6587c4 100644
--- a/keps/sig-node/4622-topologymanager-max-allowable-numa-nodes/README.md
+++ b/keps/sig-node/4622-topologymanager-max-allowable-numa-nodes/README.md
@@ -19,8 +19,10 @@ tags, and then generate with `hack/update-toc.sh`.
 <!-- toc -->
 - [Release Signoff Checklist](#release-signoff-checklist)
 - [Summary](#summary)
-- [Goals](#goals)
-- [Non-Goals](#non-goals)
+- [Motivation](#motivation)
+  - [Goals](#goals)
+  - [Non-Goals](#non-goals)
+- [Proposal](#proposal)
   - [User Stories (Optional)](#user-stories-optional)
     - [Story 1](#story-1)
     - [Story 2](#story-2)
@@ -69,20 +71,20 @@ checklist items _must_ be updated for the enhancement to be released.
 
 Items marked with (R) are required *prior to targeting to a milestone / release*.
 
-- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR)
-- [ ] (R) KEP approvers have approved the KEP status as `implementable`
-- [ ] (R) Design details are appropriately documented
-- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors)
-  - [ ] e2e Tests for all Beta API Operations (endpoints)
-  - [ ] (R) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) 
-  - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free
-- [ ] (R) Graduation criteria is in place
-  - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) 
-- [ ] (R) Production readiness review completed
-- [ ] (R) Production readiness review approved
-- [ ] "Implementation History" section is up-to-date for milestone
-- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io]
-- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes
+- [X] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR)
+- [X] (R) KEP approvers have approved the KEP status as `implementable`
+- [X] (R) Design details are appropriately documented
+- [X] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors)
+  - [X] e2e Tests for all Beta API Operations (endpoints)
+  - [] (R) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) 
+  - [X] (R) Minimum Two Week Window for GA e2e tests to prove flake free
+- [X] (R) Graduation criteria is in place
+  - [X] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) 
+- [X] (R) Production readiness review completed
+- [X] (R) Production readiness review approved
+- [X] "Implementation History" section is up-to-date for milestone
+- [X] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io]
+- [X] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes
 
 <!--
 **Note:** This checklist is iterative and should be reviewed and updated every time this enhancement is being considered for a milestone.
@@ -97,17 +99,40 @@ Items marked with (R) are required *prior to targeting to a milestone / release*
 
 In this KEP, we propose a new TopologyManager Policy Option called `max-allowable-numa-nodes` to configure the value of maxAllowableNUMANodes in the TopologyManager. The current hard-coded value of 8 was added as a stop-gap 4 years ago to mitigate the state explosion that occurs when trying to enumerate the possible NUMA affinities and generating their hints. By making this setting configurable, we give users the ability to increase this limit when appropriate.
 
-## Goals
+## Motivation
+
+<!--
+This section is for explicitly listing the motivation, goals, and non-goals of
+this KEP.  Describe why the change is important and the benefits to users. The
+motivation section can optionally provide links to [experience reports] to
+demonstrate the interest in a KEP within the wider Kubernetes community.
+
+[experience reports]: https://github.com/golang/go/wiki/ExperienceReports
+-->
+
+### Goals
 
 - This proposal does not aim to modify the existing TopologyManager Policies. It focuses solely on introducing a new policy option to let users configure the maximum supported number of NUMA nodes.
 - Support high-end CPUs with more than 8 NUMA nodes.
 
-## Non-Goals
+### Non-Goals
 
 - It does not address other resource allocation or management aspects within Kubernetes.
 - It does not attempt to remove the state explosion that still exists in the TopologyManager.
 
+## Proposal
+
+<!--
+This is where we get down to the specifics of what the proposal actually is.
+This should have enough detail that reviewers can understand exactly what
+you're proposing, but should not include things like API designs or
+implementation. What is the desired outcome and how do we measure success?.
+The "Design Details" section below is for the real
+nitty-gritty.
+-->
+
 ### User Stories (Optional)
+
 #### Story 1
 
 As a developer in the AI space, I want to use AI accelerators "super chips" which expose ARM cores with more than 8 NUMA nodes.  
@@ -115,7 +140,7 @@ As a developer in the AI space, I want to use AI accelerators "super chips" whic
 #### Story 2
 
 As a user in the high-performance-computing space, I want to enable the sub-NUMA or NUMA-per-socket option of my high-end x86 CPU, which will bring  
-the count of NUMA nodes to exceed 8.  
+the count of NUMA nodes to exceed 8.
 
 #### Story 3
 
@@ -139,33 +164,22 @@ The risk associated with implementing this new proposal is minimal. It pertains
 Users can configure the value of maxAllowableNUMANodes in the TopologyManager when the kubelet starts up, It will fail and abort if the user sets the value is lower than the current hardcoded default (8).
 
 ```go
-		case MaxAllowableNUMANodes:
-			optValue, err := strconv.Atoi(value)
-			if err != nil {
-				return opts, fmt.Errorf("bad value for option %q: %w", name, err)
-			}
-			opts.MaxAllowableNUMANodes = optValue
+  case MaxAllowableNUMANodes:
+   optValue, err := strconv.Atoi(value)
+   if err != nil {
+    return opts, fmt.Errorf("bad value for option %q: %w", name, err)
+   }
+   opts.MaxAllowableNUMANodes = optValue
       ...
 
-	if opts.MaxAllowableNUMANodes < defaultMaxAllowableNUMANodes {
-		return opts, fmt.Errorf("value for option %q is lower than defaultMaxAllowableNUMANodes: %d", MaxAllowableNUMANodes, opts.MaxAllowableNUMANodes)
-	}
+  if opts.MaxAllowableNUMANodes < defaultMaxAllowableNUMANodes {
+    return opts, fmt.Errorf("value for option %q is lower than defaultMaxAllowableNUMANodes: %d", MaxAllowableNUMANodes, opts.MaxAllowableNUMANodes)
+  }
 ```
 
 ### Test Plan
 
-<!--
-**Note:** *Not required until targeted at a release.*
-The goal is to ensure that we don't accept enhancements with inadequate testing.
-
-All code is expected to have adequate tests (eventually with coverage
-expectations). Please adhere to the [Kubernetes testing guidelines][testing-guidelines]
-when drafting this test plan.
-
-[testing-guidelines]: https://git.k8s.io/community/contributors/devel/sig-testing/testing.md
--->
-
-[x] I/we understand the owners of the involved components may require updates to
+[X] I/we understand the owners of the involved components may require updates to
 existing tests to make this code solid enough prior to committing the changes necessary
 to implement this enhancement.
 
@@ -202,18 +216,25 @@ extending the production code to implement this enhancement.
 ##### Integration tests
 
 <!--
-Integration tests are contained in k8s.io/kubernetes/test/integration.
+Integration tests are contained in https://git.k8s.io/kubernetes/test/integration.
 Integration tests allow control of the configuration parameters used to start the binaries under test.
 This is different from e2e tests which do not allow configuration of parameters.
 Doing this allows testing non-default options and multiple different and potentially conflicting command line options.
+For more details, see https://github.com/kubernetes/community/blob/master/contributors/devel/sig-testing/testing-strategy.md
+
+If integration tests are not necessary or useful, explain why.
 -->
 
 <!--
 This question should be filled when targeting a release.
 For Alpha, describe what tests will be added to ensure proper quality of the enhancement.
 
-For Beta and GA, add links to added tests together with links to k8s-triage for those tests:
-https://storage.googleapis.com/k8s-triage/index.html
+For Beta and GA, document that tests have been written,
+have been executed regularly, and have been stable.
+This can be done with:
+- permalinks to the GitHub source code
+- links to the periodic job (typically https://testgrid.k8s.io/sig-release-master-blocking#integration-master), filtered by the test name
+- a search in the Kubernetes bug triage tool (https://storage.googleapis.com/k8s-triage/index.html)
 -->
 
 No new integration tests for kubelet are planned.
@@ -224,20 +245,21 @@ No new integration tests for kubelet are planned.
 This question should be filled when targeting a release.
 For Alpha, describe what tests will be added to ensure proper quality of the enhancement.
 
-For Beta and GA, add links to added tests together with links to k8s-triage for those tests:
-https://storage.googleapis.com/k8s-triage/index.html
+For Beta and GA, document that tests have been written,
+have been executed regularly, and have been stable.
+This can be done with:
+- permalinks to the GitHub source code
+- links to the periodic job (typically a job owned by the SIG responsible for the feature), filtered by the test name
+- a search in the Kubernetes bug triage tool (https://storage.googleapis.com/k8s-triage/index.html)
 
 We expect no non-infra related flakes in the last month as a GA graduation criteria.
+If e2e tests are not necessary or useful, explain why.
 -->
 
 For beta:
 
 - Verify the input validation with the existing e2e tests(e.g. 9 or 10 or something bigger than the current default but not "too big")
 
-For GA:
-
-- degrading the node and checking the node is reported as degraded
-
 ### Graduation Criteria
 
 #### Beta
@@ -249,7 +271,7 @@ For GA:
 
 #### GA
 
-- Add a metrics: `kubelet_topology_manager_admission_time`.
+- An existing metric: `topology_manager_admission_duration_ms` can be used.
 
 ### Upgrade / Downgrade Strategy
 
@@ -269,7 +291,6 @@ We anticipate no repercussions. The new policy option is voluntary and operates
 
 ### Version Skew Strategy
 
-No changes needed.
 <!--
 If applicable, how will the component handle version skew with other
 components? What are the guarantees? Make sure this is in the test plan.
@@ -283,6 +304,8 @@ enhancement:
   CRI or CNI may require updating that component before the kubelet.
 -->
 
+No changes needed.
+
 ## Production Readiness Review Questionnaire
 
 <!--
@@ -312,16 +335,24 @@ you need any help or guidance.
 <!--
 This section must be completed when targeting alpha to a release.
 -->
+
 1.31:
+
 - enable by default
 - allow gate to disable the feature
 - release note
 
-1.32:
+1.34:
+
 - promote to GA
-- cannot be disabled
+- LockToDefault: true (cannot be disabled)
 - release note
 
+1.35:
+
+- feature gate removed
+
+
 ###### How can this feature be enabled / disabled in a live cluster?
 
 <!--
@@ -334,16 +365,18 @@ well as the [existing list] of feature gates.
 [existing list]: https://kubernetes.io/docs/reference/command-line-tools-reference/feature-gates/
 -->
 
-- [x] Feature gate (also fill in values in `kep.yaml`)
-  - Feature gate name: 
-    - `TopologyManagerPolicyBetaOptions`
-    - `TopologyManagerPolicyOptions`
+- [X] Feature gate (also fill in values in `kep.yaml`)
+  - Feature gate name: `TopologyManagerPolicyBetaOptions`
   - Components depending on the feature gate: `kubelet`
-- [x] Change the kubelet configuration to set a TopologyManager policy of static and a TopologyManager policy option of `max-allowable-numa-nodes`
-  - Will enabling / disabling the feature require downtime of the control plane? 
+- [X] Other
+  - Describe the mechanism: Change the kubelet configuration to set a TopologyManager policy of static and a TopologyManager policy option of `max-allowable-numa-nodes`
+  - Will enabling / disabling the feature require downtime of the control
+    plane?
     No
-  - Will enabling / disabling the feature require downtime or reprovisioning of a node? (Do not assume Dynamic Kubelet Config feature is enabled).
-    Yes -- kubelet restart is required.
+  - Will enabling / disabling the feature require downtime or reprovisioning
+    of a node?
+    Yes, Kubelet restart is required.
+
 ###### Does enabling the feature change any default behavior?
 
 <!--
@@ -351,7 +384,7 @@ Any change of default behavior may be surprising to users or break existing
 automations, so be extremely careful here.
 -->
 
-No. 
+No.
 
 ###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)?
 
@@ -374,8 +407,6 @@ Running containers won't be affected by the rollback of the feature, only newly
 
 ###### Are there any tests for feature enablement/disablement?
 
-This new `TopologyManager` policy option will start immediately from beta stage. The unit tests will test whether the configured value of `max-allowable-numa-nodes` is as expected and whether it is the default recommended value when it is not configured.
-
 <!--
 The e2e framework does not currently support enabling or disabling feature
 gates. However, unit tests in each component dealing with managing data, created
@@ -389,13 +420,18 @@ You can take a look at one potential example of such test in:
 https://github.com/kubernetes/kubernetes/pull/97058/files#diff-7826f7adbc1996a05ab52e3f5f02429e94b68ce6bce0dc534d1be636154fded3R246-R282
 -->
 
+This new `TopologyManager` policy option will start immediately from beta stage. The unit tests will test whether the configured value of `max-allowable-numa-nodes` is as expected and whether it is the default recommended value when it is not configured.
+
 ### Rollout, Upgrade and Rollback Planning
 
+<!--
+This section must be completed when targeting beta to a release.
+-->
+
 When feature a is not enabled or configured, its value is the default value. and the feature is fully contained in the kubelet, has no dependencies and rollback and upgrades both will affect only newly created pods.
 
-###### How can a rollout or rollback fail? Can it impact already running workloads?
 
-Rollout or rollout fail do not impact already running workloads, only impact the new workloads.
+###### How can a rollout or rollback fail? Can it impact already running workloads?
 
 <!--
 Try to be as paranoid as possible - e.g., what if some components will restart
@@ -407,6 +443,17 @@ rollout. Similarly, consider large clusters and how enablement/disablement
 will rollout across nodes.
 -->
 
+This feature has specific hardware dependencies that make rollout considerations unique:
+
+1. This feature is only relevant for machines with more than 8 NUMA nodes AND when using a TopologyManager policy other than 'None'.
+
+2. For such hardware configurations, removing this option (rollback) could prevent the kubelet from starting if the system has more NUMA nodes than the default limit allows.
+
+3. For clusters with standard hardware (8 or fewer NUMA nodes), rollout or rollback has no impact as the default behavior remains unchanged.
+
+4. For already running workloads, there is no impact during rollout or rollback - only new workloads will be affected by changes to this setting.
+
+
 ###### What specific metrics should inform a rollback?
 
 <!--
@@ -414,7 +461,8 @@ What signals should users be paying attention to when the feature is young
 that might indicate a serious problem?
 -->
 
-We have a metric which records the topology manager admission time: `kubelet_topology_manager_admission_time`.
+We have an existing metric which records the topology manager admission time: `topology_manager_admission_duration_ms`.
+
 
 ###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested?
 
@@ -431,6 +479,7 @@ Rollout or upgrade do not impact already running workloads. We plan to add an e2
 <!--
 Even if applying deprecation policies, they may still surprise some users.
 -->
+
 No.
 
 ### Monitoring Requirements
@@ -441,7 +490,9 @@ This section must be completed when targeting beta to a release.
 For GA, this section is required: approvers should be able to confirm the
 previous answers based on experience in the field.
 -->
-We add a metric: `kubelet_topology_manager_admission_time` for kubelet, which can be used to check if the setting is causing unacceptable performance drops.
+
+An existing metric: `topology_manager_admission_duration_ms` for kubelet  can be used to check if the setting is causing unacceptable performance drops.
+
 
 ###### How can an operator determine if the feature is in use by workloads?
 
@@ -473,7 +524,7 @@ Recall that end users cannot usually observe component logs or access metrics.
 - [ ] API .status
   - Condition name: 
   - Other field: 
-- [x] Other (treat as last resort)
+- [X] Other (treat as last resort)
   - Details: If their system has more than 8 NUMA nodes, the TopologyManager is turned on and the kubelet is not crashing, then the feature is working.
 
 ###### What are the reasonable SLOs (Service Level Objectives) for the enhancement?
@@ -501,9 +552,12 @@ The value of max-allowable-numa-nodes does not (in and of itself) affect the lat
 Pick one more of these and delete the rest.
 -->
 
-- [ ] Metrics
-  - Metric name: kubelet_topology_manager_admission_time
+- [X] Metrics
+  - Metric name: `topology_manager_admission_duration_ms`
+  - [Optional] Aggregation method:
   - Components exposing the metric: kubelet
+- [ ] Other (treat as last resort)
+  - Details:
 
 ###### Are there any missing metrics that would be useful to have to improve observability of this feature?
 
@@ -524,8 +578,6 @@ kubectl get --raw "/api/v1/nodes/<nodename>/proxy/configz" | jq '.kubeletconfig.
 This section must be completed when targeting beta to a release.
 -->
 
-N/A
-
 ###### Does this feature depend on any specific services running in the cluster?
 
 <!--
@@ -571,8 +623,8 @@ Focusing mostly on:
   - periodic API calls to reconcile state (e.g. periodic fetching state,
     heartbeats, leader election, etc.)
 -->
-No
 
+No.
 ###### Will enabling / using this feature result in introducing new API types?
 
 <!--
@@ -581,7 +633,6 @@ Describe them, providing:
   - Supported number of objects per cluster
   - Supported number of objects per namespace (for namespace-scoped objects)
 -->
-No
 
 ###### Will enabling / using this feature result in any new calls to the cloud provider?
 
@@ -590,7 +641,8 @@ Describe them, providing:
   - Which API(s):
   - Estimated increase:
 -->
-No
+
+No.
 
 ###### Will enabling / using this feature result in increasing size or count of the existing API objects?
 
@@ -600,7 +652,8 @@ Describe them, providing:
   - Estimated increase in size: (e.g., new annotation of size 32B)
   - Estimated amount of new objects: (e.g., new Object X for every existing Pod)
 -->
-No
+
+No.
 
 ###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs?
 
@@ -658,15 +711,29 @@ details). For now, we leave it here.
 -->
 
 ###### How does this feature react if the API server and/or etcd is unavailable?
+
 N/A
 
 ###### What are other known failure modes?
 
-Setting a value lower 8 causes kubelet crash.
+<!--
+For each of them, fill in the following information by copying the below template:
+  - [Failure mode brief description]
+    - Detection: How can it be detected via metrics? Stated another way:
+      how can an operator troubleshoot without logging into a master or worker node?
+    - Mitigations: What can be done to stop the bleeding, especially for already
+      running user workloads?
+    - Diagnostics: What are the useful log messages and their required logging
+      levels that could help debug the issue?
+      Not required until feature graduated to beta.
+    - Testing: Are there any tests for failure mode? If not, describe why.
+-->
+
+Keeping the default value will cause the kubelet to fail to start on machines with 9 or more NUMA cells if any but the `none` topology manager policy is also configured. on machines with 9 or more NUMA cells if any but the `none` topology manager policy is also configured.
 
 ###### What steps should be taken if SLOs are not being met to determine the problem?
 
- As a cluster administrator you should know the number of NUMA nodes on your nodes and adjust the value of the kubelet's topologyManager options or turn it off.
+As a cluster administrator you should know the number of NUMA nodes on your nodes and adjust the value of the kubelet's topologyManager options or turn it off.
 
 ## Implementation History
 
@@ -683,9 +750,14 @@ Major milestones might include:
 
 - 2024-05-08 - initial KEP draft created
 - 2024-06-06 - updates per review feedback
+- 2025-06-08 - promote it to GA
 
 ## Drawbacks
 
+<!--
+Why should this KEP _not_ be implemented?
+-->
+
 - increased kubelet's CPU/Memory usage time
 - increase in pod start time
 
@@ -702,8 +774,9 @@ information to express the idea and why it was not acceptable.
 
 Adding a new kubelet configuration option.
 
-<!--
 ## Infrastructure Needed (Optional)
+
+<!--
 Use this section if you need things from the project/SIG. Examples include a
 new subproject, repos requested, or GitHub details. Listing these here allows a
 SIG to get the process for these resources started right away.
diff --git a/keps/sig-node/4622-topologymanager-max-allowable-numa-nodes/kep.yaml b/keps/sig-node/4622-topologymanager-max-allowable-numa-nodes/kep.yaml
index 39c50f56bac..418b44e8101 100644
--- a/keps/sig-node/4622-topologymanager-max-allowable-numa-nodes/kep.yaml
+++ b/keps/sig-node/4622-topologymanager-max-allowable-numa-nodes/kep.yaml
@@ -5,34 +5,37 @@ authors:
 owning-sig: sig-node
 participating-sigs: []
 status: implementable
-creation-date: "2024-05-08"
+creation-date: "2025-02-15"
 reviewers:
   - "@klueska"
   - "@ffromani"
 approvers:
-  - "@sig-node-tech-leads"
-see-also: []
+  - "@klueska"
+see-also: 
+  - "keps/sig-node/2625-cpumanager-policies-thread-placement/"
+  - "keps/sig-node/2902-cpumanager-distribute-cpus-policy-option/"
+  - "keps/sig-node/3545-improved-multi-numa-alignment/"
+  - "keps/sig-node/4176-cpumanager-spread-cpus-preferred-policy/"
+  - "keps/sig-node/4540-strict-cpu-reservation"
+  - "keps/sig-node/4800-cpumanager-split-uncorecache/"
 replaces: []
 
 # The target maturity stage in the current dev cycle for this KEP.
-stage: beta
+stage: stable
 
 # The most recent milestone for which work toward delivery of this KEP has been
 # done. This can be the current (upcoming) milestone, if it is being actively
 # worked on.
-latest-milestone: "v1.31"
+latest-milestone: "v1.34"
 
 # The milestone at which this feature was, or is targeted to be, at each stage.
 milestone:
   beta: "v1.31"
-  stable: "v1.32"
+  stable: "v1.34"
 
 # The following PRR answers are required at alpha release
 # List the feature gate name and the components for which it must be enabled
 feature-gates:
-  - name: "TopologyManagerPolicyBetaOptions"
-    components:
-      - kubelet
   - name: "TopologyManagerPolicyOptions"
     components:
       - kubelet
@@ -40,4 +43,4 @@ disable-supported: true
 
 # The following PRR answers are required at beta release
 metrics: 
-  - kubelet_topology_manager_admission_time
+  - topology_manager_admission_duration_ms