From 9a336355fdabe1cdab553fe6a92467feecae8b9d Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Sun, 3 Aug 2025 11:21:44 -0700 Subject: [PATCH 1/2] feat: Pathways use single resource group Jobs requesting TPU resources may also request CPU and memory. However, when Pathways is enabled, Kueue is not able to admit such jobs since the accelerator resource flavor has no CPU or memory quota. This fix adds a very high CPU and memory quota to the TPU/GPU resource flavor and merges the Pathways resource group into the accelerator resource group. This also allows us to run AXLearn jobs without having to make changes manually. --- src/xpk/core/kueue.py | 29 ++++++++++++++++++++++++----- src/xpk/core/pathways.py | 13 ------------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py index 257ed2bf1..181e7821e 100644 --- a/src/xpk/core/kueue.py +++ b/src/xpk/core/kueue.py @@ -28,7 +28,7 @@ run_command_with_updates, run_command_with_updates_retry, ) -from .pathways import add_pw_resource_flavors, add_pw_resources_to_kueue +from .pathways import add_pw_resource_flavors from .resources import AutoprovisioningConfig from .scheduling import ( create_accelerator_label, @@ -104,7 +104,6 @@ namespaceSelector: {{}} # match all. 
resourceGroups: {covered_resources_config} - {pw_resources_kueue} {admission_checks} --- apiVersion: kueue.x-k8s.io/v1beta1 @@ -432,6 +431,7 @@ def install_kueue_crs( cluster_hardware_name=cluster_hardware_name, resource_type=resource_type, total_chips=total_chips, + enable_pathways=args.enable_pathways, ) topology_label = '' if system.device_type in [ @@ -456,7 +456,6 @@ def install_kueue_crs( covered_resources_config=covered_resources_config, resource_type=res_type, pw_resource_flavors=add_pw_resource_flavors(args), - pw_resources_kueue=add_pw_resources_to_kueue(args), admission_checks=admission_checks, managed_resource=res_type, cluster_queue_name=CLUSTER_QUEUE_NAME, @@ -480,7 +479,7 @@ def install_kueue_crs( def get_kueue_covered_resources_config( - cluster_hardware_name, resource_type, total_chips + cluster_hardware_name, resource_type, total_chips, enable_pathways ) -> str: """Gets Kueue covered resources configuration. @@ -488,22 +487,42 @@ def get_kueue_covered_resources_config( cluster_hardware_name: cluster hardware name. resource_type: resource type of tpu or gpu. total_chips: total number of chips for the specific resource type. + enable_pathways: if pathways is enabled. Returns: A string of Kueue covered resources configuration. 
""" + pathways_resources = '' + if enable_pathways: + pathways_resources = """ + - name: cpu-user + resources: + - name: "cpu" + nominalQuota: 480 + - name: "memory" + nominalQuota: 2000G + - name: "{resource_type}" + nominalQuota: 0 + """.format(resource_type=resource_type) + config_format = """ - - coveredResources: ["{resource_type}"] + - coveredResources: ["cpu", "memory", "{resource_type}"] flavors: - name: {cluster_hardware_name} resources: + - name: "cpu" + nominalQuota: "9999999999" + - name: "memory" + nominalQuota: "99999999999Gi" - name: "{resource_type}" nominalQuota: {total_chips} +{pathways_resources} """ config_string = config_format.format( cluster_hardware_name=cluster_hardware_name, resource_type=resource_type, total_chips=total_chips, + pathways_resources=pathways_resources, ) return config_string diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py index 81770eb04..771355a04 100644 --- a/src/xpk/core/pathways.py +++ b/src/xpk/core/pathways.py @@ -37,19 +37,6 @@ def add_pw_resource_flavors(args): return '' -def add_pw_resources_to_kueue(args): - """Add resource flavors required for Pathways, to the cluster queue.""" - resources_yaml = """- coveredResources: ["cpu", "memory"] - flavors: - - name: cpu-user - resources: - - name: "cpu" - nominalQuota: 480 - - name: "memory" - nominalQuota: 2000G""" - if args.enable_pathways: - return resources_yaml - return '' def ensure_pathways_workload_prerequisites(args, system) -> bool: From 899a24a46ba7fe1c73334eea6bf20c6be29c1797 Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Mon, 18 Aug 2025 20:09:43 -0700 Subject: [PATCH 2/2] fix the ordering, pathways resources need to come first Otherwise the pathways head pod will not first get assigned to CPU only resource flavors. 
--- src/xpk/core/kueue.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py index 181e7821e..30273d2e4 100644 --- a/src/xpk/core/kueue.py +++ b/src/xpk/core/kueue.py @@ -508,6 +508,7 @@ def get_kueue_covered_resources_config( config_format = """ - coveredResources: ["cpu", "memory", "{resource_type}"] flavors: +{pathways_resources} - name: {cluster_hardware_name} resources: - name: "cpu" @@ -516,7 +517,6 @@ def get_kueue_covered_resources_config( nominalQuota: "99999999999Gi" - name: "{resource_type}" nominalQuota: {total_chips} -{pathways_resources} """ config_string = config_format.format( cluster_hardware_name=cluster_hardware_name,