diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py index 257ed2bf..30273d2e 100644 --- a/src/xpk/core/kueue.py +++ b/src/xpk/core/kueue.py @@ -28,7 +28,7 @@ run_command_with_updates, run_command_with_updates_retry, ) -from .pathways import add_pw_resource_flavors, add_pw_resources_to_kueue +from .pathways import add_pw_resource_flavors from .resources import AutoprovisioningConfig from .scheduling import ( create_accelerator_label, @@ -104,7 +104,6 @@ namespaceSelector: {{}} # match all. resourceGroups: {covered_resources_config} - {pw_resources_kueue} {admission_checks} --- apiVersion: kueue.x-k8s.io/v1beta1 @@ -432,6 +431,7 @@ def install_kueue_crs( cluster_hardware_name=cluster_hardware_name, resource_type=resource_type, total_chips=total_chips, + enable_pathways=args.enable_pathways, ) topology_label = '' if system.device_type in [ @@ -456,7 +456,6 @@ def install_kueue_crs( covered_resources_config=covered_resources_config, resource_type=res_type, pw_resource_flavors=add_pw_resource_flavors(args), - pw_resources_kueue=add_pw_resources_to_kueue(args), admission_checks=admission_checks, managed_resource=res_type, cluster_queue_name=CLUSTER_QUEUE_NAME, @@ -480,7 +479,7 @@ def install_kueue_crs( def get_kueue_covered_resources_config( - cluster_hardware_name, resource_type, total_chips + cluster_hardware_name, resource_type, total_chips, enable_pathways ) -> str: """Gets Kueue covered resources configuration. @@ -488,15 +487,34 @@ def get_kueue_covered_resources_config( cluster_hardware_name: cluster hardware name. resource_type: resource type of tpu or gpu. total_chips: total number of chips for the specific resource type. + enable_pathways: if pathways is enabled. Returns: A string of Kueue covered resources configuration. """ + pathways_resources = '' + if enable_pathways: + pathways_resources = """ + - name: cpu-user + resources: + - name: "cpu" + nominalQuota: 480 + - name: "memory" + nominalQuota: 2000G + - name: "{resource_type}" + nominalQuota: 0 + """.format(resource_type=resource_type) + config_format = """ - - coveredResources: ["{resource_type}"] + - coveredResources: ["cpu", "memory", "{resource_type}"] flavors: +{pathways_resources} - name: {cluster_hardware_name} resources: + - name: "cpu" + nominalQuota: "9999999999" + - name: "memory" + nominalQuota: "99999999999Gi" - name: "{resource_type}" nominalQuota: {total_chips} """ @@ -504,6 +522,7 @@ def get_kueue_covered_resources_config( cluster_hardware_name=cluster_hardware_name, resource_type=resource_type, total_chips=total_chips, + pathways_resources=pathways_resources, ) return config_string diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py index 81770eb0..771355a0 100644 --- a/src/xpk/core/pathways.py +++ b/src/xpk/core/pathways.py @@ -37,19 +37,6 @@ def add_pw_resource_flavors(args): return '' -def add_pw_resources_to_kueue(args): - """Add resource flavors required for Pathways, to the cluster queue.""" - resources_yaml = """- coveredResources: ["cpu", "memory"] - flavors: - - name: cpu-user - resources: - - name: "cpu" - nominalQuota: 480 - - name: "memory" - nominalQuota: 2000G""" - if args.enable_pathways: - return resources_yaml - return '' def ensure_pathways_workload_prerequisites(args, system) -> bool: