diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index dce3194c..f1455b94 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -787,7 +787,7 @@ def scale_down_deployment( if return_code != 0: xpk_print(f'Scale down {deployment_name} error {return_code}') xpk_exit(return_code) - xpk_print(f'\n{deployment_name} has been scaled down.') + xpk_print(f'{deployment_name} has been scaled down.') def scale_up_coredns(args, replicas: int = 15, namespace: str = 'kube-system'): diff --git a/src/xpk/core/cluster.py b/src/xpk/core/cluster.py index 72e830d5..138e9699 100644 --- a/src/xpk/core/cluster.py +++ b/src/xpk/core/cluster.py @@ -27,6 +27,7 @@ run_command_for_value, run_command_with_updates, run_command_with_updates_retry, + run_command_and_capture_output, ) from .gcloud_context import ( add_zone_and_project, @@ -62,8 +63,9 @@ def set_jobset_on_cluster(args) -> int: 0 if successful and 1 otherwise. """ command = ( - 'kubectl apply --server-side --force-conflicts' - f' -f https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml' + 'kubectl apply --server-side -f' + f' https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml' + ' --force-conflicts' ) task = f'Install Jobset on {args.cluster}' return_code = run_command_with_updates_retry(command, task, args) @@ -878,7 +880,54 @@ def update_cluster_with_gcsfuse_driver_if_necessary(args) -> int: return 0 -def get_cluster_credentials(args) -> None: +def test_and_retry_credentials_with_dns_logic(args) -> int: + """Tests kubectl credentials and retries with default settings if a DNS error is found. + + Args: + args: user provided arguments for running the command. + + Returns: + 0 if credentials are valid after retrying, 1 otherwise. + """ + + xpk_print('Testing credentials with kubectl...') + kubectl_command = 'kubectl get pods' + kubectl_output, kubectl_return_code = run_command_and_capture_output( + kubectl_command, 'kubectl get pods', args + ) + xpk_print(kubectl_output) + if kubectl_return_code == 0: + xpk_print('Credentials test succeeded.') + return 0 + + dns_endpoint_error = ( + 'control_plane_endpoints_config.dns_endpoint_config.allow_external_traffic' + ' is disabled' + ) + if dns_endpoint_error not in kubectl_output: + xpk_print(f'kubectl failed with an unhandled error: {kubectl_output}') + xpk_exit(kubectl_return_code) + xpk_print( + 'Detected DNS endpoint-related error. Retrying without --dns-endpoint' + ' flag...' + ) + without_dns_command = ( + 'gcloud container clusters get-credentials' + f' {args.cluster} --region={zone_to_region(args.zone)}' + f' --project={args.project} &&' + ' kubectl config view && kubectl config set-context --current' + ' --namespace=default' + ) + return_code = run_command_with_updates_retry( + without_dns_command, 'get-credentials to cluster', args, verbose=False + ) + if return_code != 0: + xpk_print('Failed to get credentials even without --dns-endpoint. Exiting.') + xpk_exit(return_code) + return 0 + + +def get_cluster_credentials(args) -> int: """Run cluster configuration command to set the kubectl config. Args: @@ -890,14 +939,17 @@ def get_cluster_credentials(args) -> None: command = ( 'gcloud container clusters get-credentials' f' {args.cluster} --region={zone_to_region(args.zone)}' + ' --dns-endpoint' f' --project={args.project} &&' ' kubectl config view && kubectl config set-context --current' ' --namespace=default' ) - task = f'get-credentials to cluster {args.cluster}' + task = f'get-credentials-dns-endpoint to cluster {args.cluster}' return_code = run_command_with_updates_retry( command, task, args, verbose=False ) - if return_code != 0: - xpk_print(f'{task} returned ERROR {return_code}') - xpk_exit(return_code) + if return_code == 0: + return_code = test_and_retry_credentials_with_dns_logic(args) + xpk_print('Finished get-credentials and kubectl setup.') + + return return_code diff --git a/src/xpk/core/commands.py b/src/xpk/core/commands.py index ad01e6c2..51faf70c 100644 --- a/src/xpk/core/commands.py +++ b/src/xpk/core/commands.py @@ -354,3 +354,36 @@ def run_kubectl_apply(yml_string: str, task: str, args: Namespace) -> int: command = f'kubectl apply -f {str(tmp.file.name)}' err_code = run_command_with_updates(command, task, args) return err_code + + +def run_command_and_capture_output( + command: str, task, global_args +) -> tuple[str, int]: + """Executes a command and captures its output and return code. + + Args: + command (str): The command string to execute. + + Returns: + tuple[int, str]: A tuple containing the return code and the captured output string. + """ + if global_args.dry_run: + xpk_print( + f'Task: `{task}` is implemented by the following command' + ' not running since it is a dry run.' + f' \n{command}' + ) + return 0 + try: + result = subprocess.run( + command, shell=True, capture_output=True, text=True, check=False + ) + output = result.stdout + result.stderr + return output, result.returncode + except subprocess.CalledProcessError as e: + error_output = e.stdout + e.stderr + xpk_print(f'Task {task} failed with return code {e.returncode}') + xpk_print('*' * 80) + xpk_print(error_output) + xpk_print('*' * 80) + return error_output, e.returncode diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py index 7821b23d..edcf0986 100644 --- a/src/xpk/core/kueue.py +++ b/src/xpk/core/kueue.py @@ -382,7 +382,7 @@ def wait_for_kueue_available(args: Namespace) -> int: 0 if successful and 1 otherwise. """ command = ( - 'kubectl wait deploy/kueue-controller-manager -nkueue-system' + 'kubectl wait deploy/kueue-controller-manager -n kueue-system' f' --for=condition=available --timeout={WAIT_FOR_KUEUE_TIMEOUT}' ) task = 'Wait for Kueue to be available'