From 21cbe4d7a91976899b0d983946ec1b62f04a4cd7 Mon Sep 17 00:00:00 2001 From: Eamon O'Reilly Date: Mon, 26 Jan 2026 15:28:51 -0800 Subject: [PATCH 1/2] [Cognitiveservices] Add console log streaming: `az cognitiveservices agent logs show` Add console log streaming commands and options for hosted agents: - Add new `az cognitiveservices agent logs show` command to stream console output (stdout/stderr) or system events from agent containers - Add --show-logs flag to `az cognitiveservices agent create` to stream logs during deployment for troubleshooting - Add --show-logs and --timeout flags to `az cognitiveservices agent start` to stream logs during startup Implementation details: - Implement _stream_agent_logs() using HTTP streaming with Bearer auth - Create _BackgroundLogStreamer context manager for reusable log streaming - Add retry logic to handle container startup delays - Extract helper functions for auth and URL building - Add unit tests for new functionality --- .../cognitiveservices/_help.py | 40 +++ .../cognitiveservices/_params.py | 71 ++++ .../cognitiveservices/commands.py | 4 + .../cognitiveservices/custom.py | 323 ++++++++++++++++-- .../tests/latest/test_agent.py | 227 ++++++++++++ 5 files changed, 634 insertions(+), 31 deletions(-) diff --git a/src/azure-cli/azure/cli/command_modules/cognitiveservices/_help.py b/src/azure-cli/azure/cli/command_modules/cognitiveservices/_help.py index 59f09be6d5d..6eacc50580c 100644 --- a/src/azure-cli/azure/cli/command_modules/cognitiveservices/_help.py +++ b/src/azure-cli/azure/cli/command_modules/cognitiveservices/_help.py @@ -635,6 +635,14 @@ --name my-agent \\ --image myregistry.azurecr.io/my-large-agent:v1.0 \\ --timeout 1200 + - name: Create agent and stream container logs during deployment + text: | + az cognitiveservices agent create \\ + --account-name myAccount \\ + --project-name myProject \\ + --name my-agent \\ + --image myregistry.azurecr.io/my-agent:v1.0 \\ + --show-logs """ helps[ @@ -642,9 +650,41 @@ ] = """ type: command short-summary: Start a hosted agent deployment. +long-summary: | + Starts a previously stopped agent deployment. Use --show-logs to stream + container console logs during startup for troubleshooting. examples: - name: Start hosted agent deployment. text: az cognitiveservices agent start --account-name myAccount --project-name myProject --name myAgent --agent-version 1 + - name: Start agent and stream console logs during startup. + text: az cognitiveservices agent start --account-name myAccount --project-name myProject --name myAgent --agent-version 1 --show-logs +""" + +helps[ + "cognitiveservices agent logs" +] = """ +type: group +short-summary: Manage hosted agent container logs. +""" + +helps[ + "cognitiveservices agent logs show" +] = """ +type: command +short-summary: Show logs from a hosted agent container. +long-summary: | + Streams console output (stdout/stderr) or system events from an agent container. + Use --follow to stream logs in real-time, or omit it to fetch recent logs and exit. + This is useful for troubleshooting agent startup issues or monitoring agent behavior. +examples: + - name: Fetch the last 50 lines of console logs from an agent. + text: az cognitiveservices agent logs show --account-name myAccount --project-name myProject --name myAgent --agent-version 1 + - name: Stream console logs in real-time. + text: az cognitiveservices agent logs show --account-name myAccount --project-name myProject --name myAgent --agent-version 1 --follow + - name: Fetch the last 100 lines of system event logs. + text: az cognitiveservices agent logs show --account-name myAccount --project-name myProject --name myAgent --agent-version 1 --type system --tail 100 + - name: Stream logs with custom tail size. + text: az cognitiveservices agent logs show --account-name myAccount --project-name myProject --name myAgent --agent-version 1 --follow --tail 200 """ helps[ diff --git a/src/azure-cli/azure/cli/command_modules/cognitiveservices/_params.py b/src/azure-cli/azure/cli/command_modules/cognitiveservices/_params.py index efc3cf77da0..6edf03f47b2 100644 --- a/src/azure-cli/azure/cli/command_modules/cognitiveservices/_params.py +++ b/src/azure-cli/azure/cli/command_modules/cognitiveservices/_params.py @@ -511,6 +511,16 @@ def load_arguments(self, _): ), default=600 ) + c.argument( + 'show_logs', + options_list=['--show-logs'], + action='store_true', + help=( + 'Stream container console logs during deployment. ' + 'Shows real-time output from the agent container as it starts up. ' + 'Useful for debugging startup issues.' + ) + ) with self.argument_context("cognitiveservices agent update") as c: c.argument( @@ -533,6 +543,67 @@ def load_arguments(self, _): help="Cognitive Services hosted agent version. If not provided, deletes all versions.", required=False, ) + + with self.argument_context("cognitiveservices agent start") as c: + c.argument( + 'show_logs', + options_list=['--show-logs'], + action='store_true', + help=( + 'Stream container console logs during startup. ' + 'Shows real-time output from the agent container as it starts. ' + 'Useful for debugging startup issues.' + ) + ) + c.argument( + 'timeout', + type=int, + help=( + 'Maximum time in seconds to wait for deployment to be ready. ' + 'Default: 600 seconds (10 minutes).' + ), + default=600 + ) + + with self.argument_context("cognitiveservices agent logs") as c: + c.argument( + "account_name", + options_list=["--account-name", "-a"], + help="Cognitive service account name." + ) + c.argument( + "project_name", + options_list=["--project-name", "-p"], + help="AI project name" + ) + c.argument( + "agent_name", + options_list=["--name", "-n"], + help="Cognitive Services hosted agent name", + ) + c.argument("agent_version", help="Cognitive Services hosted agent version") + + with self.argument_context("cognitiveservices agent logs show") as c: + c.argument( + 'kind', + options_list=['--type', '-t'], + help="Type of logs to stream. 'console' for stdout/stderr, 'system' for container events.", + arg_type=get_enum_type(['console', 'system']), + default='console' + ) + c.argument( + 'tail', + type=int, + help='Number of trailing log lines to fetch (1-300). Default: 50', + default=50 + ) + c.argument( + 'follow', + options_list=['--follow', '-f'], + action='store_true', + help='Stream logs in real-time. Without this flag, fetches recent logs and exits.' + ) + with self.argument_context('cognitiveservices') as c: c.argument('account_name', arg_type=name_arg_type, help='cognitive service account name', completer=get_resource_name_completion_list('Microsoft.CognitiveServices/accounts')) diff --git a/src/azure-cli/azure/cli/command_modules/cognitiveservices/commands.py b/src/azure-cli/azure/cli/command_modules/cognitiveservices/commands.py index 3432cd5f514..d9a33bec661 100644 --- a/src/azure-cli/azure/cli/command_modules/cognitiveservices/commands.py +++ b/src/azure-cli/azure/cli/command_modules/cognitiveservices/commands.py @@ -129,6 +129,10 @@ def load_command_table(self, _): g.custom_command('list', 'agent_list') g.custom_command('list-versions', 'agent_versions_list') g.custom_show_command('show', 'agent_show') + + with self.command_group('cognitiveservices agent logs', client_factory=cf_ai_projects, is_preview=True) as g: + g.custom_show_command('show', 'agent_logs_show') + with self.command_group( 'cognitiveservices account project', projects_type, client_factory=cf_projects) as g: diff --git a/src/azure-cli/azure/cli/command_modules/cognitiveservices/custom.py b/src/azure-cli/azure/cli/command_modules/cognitiveservices/custom.py index 9db5eb1c7ff..9521ae57017 100644 --- a/src/azure-cli/azure/cli/command_modules/cognitiveservices/custom.py +++ b/src/azure-cli/azure/cli/command_modules/cognitiveservices/custom.py @@ -908,6 +908,162 @@ def _get_agent_container_status(client, agent_name, agent_version): return response.json() +# Constants for log streaming +LOG_STREAM_CONNECT_TIMEOUT = 10 # seconds +LOG_STREAM_READ_TIMEOUT = 5 # seconds for non-follow mode +LOG_STREAM_RETRY_INTERVAL = 5 # seconds between retries +LOG_STREAM_MAX_RETRIES = 30 # max retry attempts (~2.5 minutes) +LOG_STREAM_POST_DEPLOY_WAIT = 15 # seconds to stream after deployment ready + + +def _get_log_stream_auth_header(cmd): + """ + Get authorization header for log stream API. + + Args: + cmd: CLI command context + + Returns: + dict: Authorization header with Bearer token + """ + from azure.cli.core._profile import Profile + + profile = Profile(cli_ctx=cmd.cli_ctx) + credential, _, _ = profile.get_login_credentials( + subscription_id=cmd.cli_ctx.data.get("subscription_id") + ) + token = credential.get_token("https://ai.azure.com/.default") + return {"Authorization": f"Bearer {token.token}"} + + +def _build_log_stream_url(client, agent_name, agent_version, container_name="default"): + """ + Build the log stream URL for an agent container. + + Args: + client: Service client with endpoint configuration + agent_name: Name of the agent + agent_version: Version of the agent + container_name: Container name (default: 'default') + + Returns: + str: Full URL for the log stream endpoint + """ + endpoint = client._config.endpoint # pylint: disable=protected-access + return ( + f"{endpoint}/agents/{urllib.parse.quote(agent_name)}" + f"/versions/{urllib.parse.quote(str(agent_version))}" + f"/containers/{urllib.parse.quote(container_name)}:logstream" + ) + + +def _stream_agent_logs( + cmd, + client, + agent_name, + agent_version, + kind="console", + tail=50, + follow=True, +): + """ + Stream logs from an agent container. + + Args: + cmd: CLI command context + client: Service client (AIProjectClient) + agent_name: Name of the agent + agent_version: Version of the agent + kind: Type of logs - 'console' (stdout/stderr) or 'system' (container events) + tail: Number of trailing lines to fetch (1-300) + follow: Whether to stream logs in real-time + + Yields: + str: Log lines as they arrive + + Raises: + InvalidArgumentValueError: If tail or kind parameters are invalid + AzureResponseError: If connection to log stream fails + """ + import requests as http_requests + + # Validate parameters + if tail is not None and not 1 <= tail <= 300: + raise InvalidArgumentValueError("--tail must be between 1 and 300") + if kind not in ("console", "system"): + raise InvalidArgumentValueError("--type must be 'console' or 'system'") + + log_url = _build_log_stream_url(client, agent_name, agent_version) + params = { + "api-version": AGENT_API_VERSION_PARAMS["api-version"], + "kind": kind, + "tail": tail, + } + headers = _get_log_stream_auth_header(cmd) + + logger.info("Connecting to log stream: %s", log_url) + + timeout = None if follow else (LOG_STREAM_CONNECT_TIMEOUT, LOG_STREAM_READ_TIMEOUT) + + try: + response = http_requests.get( + log_url, params=params, headers=headers, stream=True, timeout=timeout + ) + + if not response.ok: + error_detail = response.text or f"HTTP {response.status_code}" + raise AzureResponseError(f"Failed to connect to log stream: {error_detail}") + + for line in response.iter_lines(): + if line: + yield line.decode("utf-8", errors="replace") + + except http_requests.exceptions.Timeout: + pass # Expected when follow=False - read timeout after fetching available logs + except http_requests.exceptions.ConnectionError as e: + if "timed out" in str(e).lower(): + pass # Timeout wrapped in ConnectionError + else: + raise AzureResponseError(f"Failed to connect to log stream: {e}") from e + except KeyboardInterrupt: + logger.warning("Log streaming interrupted by user") + raise + + +def agent_logs_show( + cmd, + client, + account_name, + project_name, + agent_name, + agent_version, + kind="console", + tail=50, + follow=False, +): # pylint: disable=unused-argument + """ + Show logs from a hosted agent container. + + Args: + cmd: CLI command context + client: Service client + account_name: Cognitive Services account name (unused, for CLI routing) + project_name: AI Foundry project name (unused, for CLI routing) + agent_name: Name of the agent + agent_version: Version of the agent + kind: Type of logs - 'console' or 'system' + tail: Number of trailing lines (1-300) + follow: Stream logs in real-time if True + """ + try: + for log_line in _stream_agent_logs( + cmd, client, agent_name, agent_version, kind=kind, tail=tail, follow=follow + ): + print(log_line) + except KeyboardInterrupt: + pass # Clean exit on Ctrl+C + + def _wait_for_agent_deployment_ready( cmd, client, agent_name, agent_version, timeout=600, poll_interval=5): """ @@ -1208,7 +1364,98 @@ def _build_and_push_locally(): return image -def _deploy_agent_version(cmd, client, agent_name, created_version, min_replicas, max_replicas, timeout=600): +class _BackgroundLogStreamer: + """ + Context manager for streaming logs in a background thread during deployment. + + Usage: + with _BackgroundLogStreamer(cmd, client, agent_name, version) as streamer: + # deployment operations... + streamer.wait_after_ready() # optional: stream logs after deployment ready + """ + + def __init__(self, cmd, client, agent_name, agent_version, enabled=True): + self.cmd = cmd + self.client = client + self.agent_name = agent_name + self.agent_version = agent_version + self.enabled = enabled + self._thread = None + self._stop_event = None + + def __enter__(self): + if not self.enabled: + return self + + import threading + self._stop_event = threading.Event() + self._thread = threading.Thread(target=self._stream_with_retry, daemon=True) + self._thread.start() + logger.warning("Streaming container logs (Ctrl+C to stop)...") + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self._thread and self._stop_event: + self._stop_event.set() + self._thread.join(timeout=2) + return False # Don't suppress exceptions + + def _stream_with_retry(self): + """Stream logs with retry logic for container startup.""" + import time + + for attempt in range(LOG_STREAM_MAX_RETRIES): + if self._stop_event.is_set(): + return + + try: + # Check if container is in a streamable state + if not self._is_container_ready(): + time.sleep(LOG_STREAM_RETRY_INTERVAL) + continue + + # Stream logs + for log_line in _stream_agent_logs( + self.cmd, self.client, self.agent_name, self.agent_version, + kind="console", tail=100, follow=True + ): + if self._stop_event.is_set(): + return + print(log_line) + return # Successfully streamed + + except Exception as e: # pylint: disable=broad-except + if self._stop_event.is_set(): + return + logger.debug("Log stream attempt %d failed: %s", attempt + 1, e) + if attempt < LOG_STREAM_MAX_RETRIES - 1: + time.sleep(LOG_STREAM_RETRY_INTERVAL) + + def _is_container_ready(self): + """Check if container is in a state where logs can be streamed.""" + try: + status = _get_agent_container_status(self.client, self.agent_name, self.agent_version) + return status.get("status", "").lower() in ("running", "starting", "pending") + except Exception: # pylint: disable=broad-except + return True # Try streaming anyway if status check fails + + def wait_after_ready(self, seconds=LOG_STREAM_POST_DEPLOY_WAIT): + """Wait for additional log streaming after deployment is ready.""" + import time + + if not self.enabled or not self._thread or not self._thread.is_alive(): + return + + logger.warning("Deployment ready. Streaming logs for %d more seconds (Ctrl+C to stop)...", seconds) + for _ in range(seconds): + if not self._thread.is_alive(): + break + time.sleep(1) + + +def _deploy_agent_version( + cmd, client, agent_name, created_version, min_replicas, max_replicas, timeout=600, show_logs=False +): """ Deploy an agent version with horizontal scaling configuration. @@ -1220,6 +1467,7 @@ def _deploy_agent_version(cmd, client, agent_name, created_version, min_replicas min_replicas: Minimum number of replicas (default 0) max_replicas: Maximum number of replicas (default 3) timeout: Maximum time to wait for deployment (default 600 seconds) + show_logs: Stream container logs during deployment (default False) """ effective_min_replicas = min_replicas if min_replicas is not None else 0 effective_max_replicas = max_replicas if max_replicas is not None else 3 @@ -1229,35 +1477,26 @@ def _deploy_agent_version(cmd, client, agent_name, created_version, min_replicas effective_min_replicas, effective_max_replicas, ) - try: - _invoke_agent_container_operation( - client, - agent_name, - created_version, - action="start", - ) - _wait_for_agent_deployment_ready(cmd, client, agent_name, created_version, timeout=timeout) + with _BackgroundLogStreamer(cmd, client, agent_name, created_version, enabled=show_logs) as streamer: + try: + _invoke_agent_container_operation(client, agent_name, created_version, action="start") + _wait_for_agent_deployment_ready(cmd, client, agent_name, created_version, timeout=timeout) - if min_replicas is not None or max_replicas is not None: - _invoke_agent_container_operation( - client, - agent_name, - created_version, - action="update", - min_replicas=effective_min_replicas, - max_replicas=effective_max_replicas, - ) + if min_replicas is not None or max_replicas is not None: + _invoke_agent_container_operation( + client, agent_name, created_version, action="update", + min_replicas=effective_min_replicas, max_replicas=effective_max_replicas + ) - logger.info("Agent deployment started successfully") - except Exception as deploy_err: - recommendation = ( - "Use 'az cognitiveservices agent start' to retry deployment once the underlying issue is resolved." - ) - raise DeploymentError( - f"Agent version '{created_version}' was created but deployment failed: {deploy_err}", - recommendation=recommendation, - ) from deploy_err + logger.info("Agent deployment started successfully") + streamer.wait_after_ready() + + except Exception as deploy_err: + raise DeploymentError( + f"Agent version '{created_version}' was created but deployment failed: {deploy_err}", + recommendation="Use 'az cognitiveservices agent start' to retry deployment." + ) from deploy_err def agent_update( @@ -1298,14 +1537,33 @@ def agent_stop( def agent_start( - client, account_name, project_name, agent_name, agent_version + cmd, client, account_name, project_name, agent_name, agent_version, show_logs=False, timeout=600 ): # pylint: disable=unused-argument """ Start hosted agent deployment. + + Args: + cmd: CLI command context + client: Service client + account_name: Cognitive Services account name (unused, for CLI routing) + project_name: AI Foundry project name (unused, for CLI routing) + agent_name: Name of the agent + agent_version: Version of the agent to start + show_logs: Stream container logs during startup (default False) + timeout: Maximum time to wait for deployment to be ready (default 600 seconds) """ - return _invoke_agent_container_operation( - client, agent_name, agent_version, action="start" - ) + result = _invoke_agent_container_operation(client, agent_name, agent_version, action="start") + + if show_logs: + with _BackgroundLogStreamer(cmd, client, agent_name, agent_version) as streamer: + try: + _wait_for_agent_deployment_ready(cmd, client, agent_name, agent_version, timeout=timeout) + logger.warning("Agent deployment is now running") + streamer.wait_after_ready() + except KeyboardInterrupt: + logger.warning("Log streaming interrupted") + + return result def agent_delete_deployment( @@ -1621,6 +1879,7 @@ def agent_create( # pylint: disable=too-many-locals no_wait=False, no_start=False, timeout=600, + show_logs=False, ): """ Create a new hosted agent from a container image or source code. @@ -1652,6 +1911,7 @@ def agent_create( # pylint: disable=too-many-locals no_wait: Don't wait for operation completion (default False) no_start: Skip automatic deployment after version creation (default False) timeout: Maximum time in seconds to wait for deployment (default 600) + show_logs: Stream container logs during deployment (default False) Returns: dict: Created agent version details including status, version, and configuration @@ -1779,6 +2039,7 @@ def agent_create( # pylint: disable=too-many-locals min_replicas, max_replicas, timeout=timeout, + show_logs=show_logs, ) elif created_version and no_start: logger.info("Agent version created but not deployed (--no-start specified). " diff --git a/src/azure-cli/azure/cli/command_modules/cognitiveservices/tests/latest/test_agent.py b/src/azure-cli/azure/cli/command_modules/cognitiveservices/tests/latest/test_agent.py index 534545e2801..09c94f2263d 100644 --- a/src/azure-cli/azure/cli/command_modules/cognitiveservices/tests/latest/test_agent.py +++ b/src/azure-cli/azure/cli/command_modules/cognitiveservices/tests/latest/test_agent.py @@ -298,6 +298,78 @@ def test_validate_path_for_subprocess_empty_path(self): with self.assertRaises(InvalidArgumentValueError): _validate_path_for_subprocess(None, "test path") + # ========================================================================= + # Tests for agent logs functionality + # ========================================================================= + + def test_stream_agent_logs_function_signature(self): + """Test that _stream_agent_logs has correct parameters.""" + from inspect import signature + from azure.cli.command_modules.cognitiveservices.custom import _stream_agent_logs + + sig = signature(_stream_agent_logs) + self.assertIn('cmd', sig.parameters) + self.assertIn('client', sig.parameters) + self.assertIn('agent_name', sig.parameters) + self.assertIn('agent_version', sig.parameters) + self.assertIn('kind', sig.parameters) + self.assertIn('tail', sig.parameters) + self.assertIn('follow', sig.parameters) + # Verify defaults + self.assertEqual(sig.parameters['kind'].default, "console") + self.assertEqual(sig.parameters['tail'].default, 50) + self.assertEqual(sig.parameters['follow'].default, True) + + def test_agent_logs_show_function_signature(self): + """Test that agent_logs_show has correct parameters.""" + from inspect import signature + from azure.cli.command_modules.cognitiveservices.custom import agent_logs_show + + sig = signature(agent_logs_show) + self.assertIn('cmd', sig.parameters) + self.assertIn('client', sig.parameters) + self.assertIn('account_name', sig.parameters) + self.assertIn('project_name', sig.parameters) + self.assertIn('agent_name', sig.parameters) + self.assertIn('agent_version', sig.parameters) + self.assertIn('kind', sig.parameters) + self.assertIn('tail', sig.parameters) + self.assertIn('follow', sig.parameters) + # Verify follow defaults to False for non-streaming behavior + self.assertEqual(sig.parameters['follow'].default, False) + self.assertEqual(sig.parameters['kind'].default, "console") + self.assertEqual(sig.parameters['tail'].default, 50) + + def test_agent_start_show_logs_parameter(self): + """Test that agent_start accepts show_logs and timeout parameters.""" + from inspect import signature + from azure.cli.command_modules.cognitiveservices.custom import agent_start + + sig = signature(agent_start) + self.assertIn('cmd', sig.parameters) + self.assertIn('show_logs', sig.parameters) + self.assertIn('timeout', sig.parameters) + self.assertEqual(sig.parameters['show_logs'].default, False) + self.assertEqual(sig.parameters['timeout'].default, 600) + + def test_agent_create_show_logs_parameter(self): + """Test that agent_create accepts show_logs parameter.""" + from inspect import signature + from azure.cli.command_modules.cognitiveservices.custom import agent_create + + sig = signature(agent_create) + self.assertIn('show_logs', sig.parameters) + self.assertEqual(sig.parameters['show_logs'].default, False) + + def test_deploy_agent_version_show_logs_parameter(self): + """Test that _deploy_agent_version accepts show_logs parameter.""" + from inspect import signature + from azure.cli.command_modules.cognitiveservices.custom import _deploy_agent_version + + sig = signature(_deploy_agent_version) + self.assertIn('show_logs', sig.parameters) + self.assertEqual(sig.parameters['show_logs'].default, False) + class CognitiveServicesAgentTests(ScenarioTest): """ @@ -820,6 +892,161 @@ def test_agent_create_errors(self, resource_group): # Cleanup self.cmd('az cognitiveservices account delete -n {account} -g {rg}') + # ========================================================================= + # Integration tests for agent logs functionality + # ========================================================================= + + @live_only() + @serial_test() + @ResourceGroupPreparer(location='eastus') + def test_agent_logs_show_basic(self, resource_group): + """ + Test basic log streaming without --follow flag. + + Validates: + - Log command executes without error + - Default parameters (console type, 50 lines tail) + - Command exits after fetching initial logs + """ + account_name = self.create_random_name(prefix='cs_logs_', length=20) + project_name = self.create_random_name(prefix='proj_', length=15) + agent_name = 'test-logs-agent' + + self.kwargs.update({ + 'account': account_name, + 'project': project_name, + 'agent': agent_name, + 'kind': 'AIServices', + 'sku': 'S0', + 'location': 'eastus', + 'image': 'mcr.microsoft.com/azuredocs/aci-helloworld:latest' + }) + + # Create Cognitive Services account + self.cmd('az cognitiveservices account create -n {account} -g {rg} ' + '--kind {kind} --sku {sku} -l {location} --yes --manage-projects', + checks=[self.check('properties.provisioningState', 'Succeeded')]) + + # Create agent with a sample image + self.cmd('az cognitiveservices agent create --skip-acr-check ' + '-a {account} --project-name {project} --name {agent} ' + '--image {image}', + checks=[self.check('name', '{agent}')]) + + # Fetch logs without follow (should return and exit) + # This verifies the command runs successfully + self.cmd('az cognitiveservices agent logs show ' + '-a {account} -p {project} -n {agent} --agent-version 1') + + # Cleanup + self.cmd('az cognitiveservices agent delete -a {account} -p {project} -n {agent}') + self.cmd('az cognitiveservices account delete -n {account} -g {rg}') + + @live_only() + @serial_test() + @ResourceGroupPreparer(location='eastus') + def test_agent_logs_show_with_options(self, resource_group): + """ + Test log streaming with various options. + + Validates: + - --type system option + - --tail custom value + - Different log type outputs + """ + account_name = self.create_random_name(prefix='cs_logs_', length=20) + project_name = self.create_random_name(prefix='proj_', length=15) + agent_name = 'test-logs-opts' + + self.kwargs.update({ + 'account': account_name, + 'project': project_name, + 'agent': agent_name, + 'kind': 'AIServices', + 'sku': 'S0', + 'location': 'eastus', + 'image': 'mcr.microsoft.com/azuredocs/aci-helloworld:latest' + }) + + # Create Cognitive Services account + self.cmd('az cognitiveservices account create -n {account} -g {rg} ' + '--kind {kind} --sku {sku} -l {location} --yes --manage-projects', + checks=[self.check('properties.provisioningState', 'Succeeded')]) + + # Create agent + self.cmd('az cognitiveservices agent create --skip-acr-check ' + '-a {account} --project-name {project} --name {agent} ' + '--image {image}', + checks=[self.check('name', '{agent}')]) + + # Test with --type system + self.cmd('az cognitiveservices agent logs show ' + '-a {account} -p {project} -n {agent} --agent-version 1 ' + '--type system') + + # Test with --tail custom value + self.cmd('az cognitiveservices agent logs show ' + '-a {account} -p {project} -n {agent} --agent-version 1 ' + '--tail 100') + + # Test with both options + self.cmd('az cognitiveservices agent logs show ' + '-a {account} -p {project} -n {agent} --agent-version 1 ' + '--type console --tail 200') + + # Cleanup + self.cmd('az cognitiveservices agent delete -a {account} -p {project} -n {agent}') + self.cmd('az cognitiveservices account delete -n {account} -g {rg}') + + @live_only() + @serial_test() + @ResourceGroupPreparer(location='eastus') + def test_agent_start_with_show_logs(self, resource_group): + """ + Test agent start with --show-logs flag. + + Validates: + - Agent can be stopped and started + - --show-logs flag streams logs during startup + """ + account_name = self.create_random_name(prefix='cs_start_', length=20) + project_name = self.create_random_name(prefix='proj_', length=15) + agent_name = 'test-start-logs' + + self.kwargs.update({ + 'account': account_name, + 'project': project_name, + 'agent': agent_name, + 'kind': 'AIServices', + 'sku': 'S0', + 'location': 'eastus', + 'image': 'mcr.microsoft.com/azuredocs/aci-helloworld:latest' + }) + + # Create Cognitive Services account + self.cmd('az cognitiveservices account create -n {account} -g {rg} ' + '--kind {kind} --sku {sku} -l {location} --yes --manage-projects', + checks=[self.check('properties.provisioningState', 'Succeeded')]) + + # Create agent + self.cmd('az cognitiveservices agent create --skip-acr-check ' + '-a {account} --project-name {project} --name {agent} ' + '--image {image}', + checks=[self.check('name', '{agent}')]) + + # Stop the agent first + self.cmd('az cognitiveservices agent stop ' + '-a {account} -p {project} -n {agent} --agent-version 1') + + # Start with --show-logs + self.cmd('az cognitiveservices agent start ' + '-a {account} -p {project} -n {agent} --agent-version 1 ' + '--show-logs --timeout 120') + + # Cleanup + self.cmd('az cognitiveservices agent delete -a {account} -p {project} -n {agent}') + self.cmd('az cognitiveservices account delete -n {account} -g {rg}') + if __name__ == '__main__': unittest.main() From bc04891a82d41e50d24b2d746f188875ecc0d8e2 Mon Sep 17 00:00:00 2001 From: Eamon O'Reilly Date: Mon, 26 Jan 2026 16:25:24 -0800 Subject: [PATCH 2/2] Address review: add user warning when log streaming fails - Distinguish transient errors (ConnectionError, Timeout) from unexpected errors - Track last error and show warning to user after all retries exhausted - Provides actionable feedback when --show-logs cannot establish connection --- .../cognitiveservices/custom.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/azure-cli/azure/cli/command_modules/cognitiveservices/custom.py b/src/azure-cli/azure/cli/command_modules/cognitiveservices/custom.py index 9521ae57017..443575453e7 100644 --- a/src/azure-cli/azure/cli/command_modules/cognitiveservices/custom.py +++ b/src/azure-cli/azure/cli/command_modules/cognitiveservices/custom.py @@ -1403,7 +1403,9 @@ def __exit__(self, exc_type, exc_val, exc_tb): def _stream_with_retry(self): """Stream logs with retry logic for container startup.""" import time + from requests.exceptions import ConnectionError as RequestsConnectionError, Timeout + last_error = None for attempt in range(LOG_STREAM_MAX_RETRIES): if self._stop_event.is_set(): return @@ -1424,13 +1426,32 @@ def _stream_with_retry(self): print(log_line) return # Successfully streamed + except (RequestsConnectionError, Timeout) as e: + # Expected transient errors during container startup + if self._stop_event.is_set(): + return + last_error = e + logger.debug("Log stream attempt %d failed (transient): %s", attempt + 1, e) + if attempt < LOG_STREAM_MAX_RETRIES - 1: + time.sleep(LOG_STREAM_RETRY_INTERVAL) + except Exception as e: # pylint: disable=broad-except + # Unexpected errors - log and continue retrying if self._stop_event.is_set(): return + last_error = e logger.debug("Log stream attempt %d failed: %s", attempt + 1, e) if attempt < LOG_STREAM_MAX_RETRIES - 1: time.sleep(LOG_STREAM_RETRY_INTERVAL) + # All retries exhausted - warn user + if last_error and not self._stop_event.is_set(): + logger.warning( + "Unable to establish log stream after %d attempts. " + "The agent may still be starting. Last error: %s", + LOG_STREAM_MAX_RETRIES, last_error + ) + def _is_container_ready(self): """Check if container is in a state where logs can be streamed.""" try: