From aece5d8cfb32cc32329ad73f23d9684790f2d781 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Tue, 21 Apr 2026 13:41:47 -0700 Subject: [PATCH 01/19] feat(connect): add sqlID/jobID reverse indexes on AppBase MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prep for Phase 3 reporting — indexes get populated from SQLExecutionStart.jobTags and JobStart.spark.job.tags. Issue #2065. --- .../spark/sql/rapids/tool/AppBase.scala | 4 ++ .../profiling/ConnectCorrelationSuite.scala | 70 +++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectCorrelationSuite.scala diff --git a/core/src/main/scala/org/apache/spark/sql/rapids/tool/AppBase.scala b/core/src/main/scala/org/apache/spark/sql/rapids/tool/AppBase.scala index 203e3bb12..6e734b109 100644 --- a/core/src/main/scala/org/apache/spark/sql/rapids/tool/AppBase.scala +++ b/core/src/main/scala/org/apache/spark/sql/rapids/tool/AppBase.scala @@ -143,6 +143,10 @@ abstract class AppBase( val connectOperations: HashMap[String, ConnectOperationInfo] = HashMap.empty // jobTag -> operationId index for correlation with SQL executions and jobs. val jobTagToConnectOpId: HashMap[String, String] = HashMap.empty + // operationId -> sqlIDs discovered via SparkListenerSQLExecutionStart.jobTags. + val operationIdToSqlIds: HashMap[String, mutable.Set[Long]] = HashMap.empty + // operationId -> jobIDs discovered via SparkListenerJobStart.properties["spark.job.tags"]. + val operationIdToJobIds: HashMap[String, mutable.Set[Int]] = HashMap.empty def isConnectMode: Boolean = connectOperations.nonEmpty def sqlPlans: immutable.Map[Long, SparkPlanInfo] = sqlManager.getPlanInfos diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectCorrelationSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectCorrelationSuite.scala new file mode 100644 index 000000000..90e6b9dcd --- /dev/null +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectCorrelationSuite.scala @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.tool.profiling + +import java.nio.charset.StandardCharsets +import java.nio.file.{Files, Paths} + +import com.nvidia.spark.rapids.BaseNoSparkSuite +import com.nvidia.spark.rapids.tool.EventLogPathProcessor + +import org.apache.spark.sql.TrampolineUtil +import org.apache.spark.sql.rapids.tool.profiling.ApplicationInfo +import org.apache.spark.sql.rapids.tool.util.RapidsToolsConfUtil + +/** + * Tests for Spark Connect sqlID/jobID correlation indexes on AppBase. + * Task 1 only verifies the reverse-index HashMaps exist and are initialized + * empty on a fresh app. Later tasks populate them from + * SparkListenerSQLExecutionStart.jobTags and + * SparkListenerJobStart.properties["spark.job.tags"]. 
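+ * For reference, the Connect job tags these indexes key on look like
+ * SparkConnect_OperationTag_User_<user>_Session_<sessionId>_Operation_<operationId>
+ * in the fixtures below (one tag per operation, mirroring
+ * ConnectOperationInfo.jobTag).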
+ */ +class ConnectCorrelationSuite extends BaseNoSparkSuite { + + private val hadoopConf = RapidsToolsConfUtil.newHadoopConf() + + private val logStartEvent = + """{"Event":"SparkListenerLogStart","Spark Version":"3.5.0"}""" + private val appStartEvent = + """{"Event":"SparkListenerApplicationStart","App Name":"CorrelationTest",""" + + """"App ID":"local-correlation","Timestamp":100000,"User":"testUser"}""" + private val envUpdateEvent = + """{"Event":"SparkListenerEnvironmentUpdate","JVM Information":{},""" + + """"Spark Properties":{"spark.master":"local[*]"},""" + + """"Hadoop Properties":{},"System Properties":{"file.encoding":"UTF-8"},""" + + """"Classpath Entries":{}}""" + private val appEndEvent = + """{"Event":"SparkListenerApplicationEnd","Timestamp":200000}""" + + private def withEventLog(events: String*)(verify: ApplicationInfo => Unit): Unit = { + val content = events.mkString("\n") + TrampolineUtil.withTempDir { tempDir => + val path = Paths.get(tempDir.getAbsolutePath, "test_eventlog") + Files.write(path, content.getBytes(StandardCharsets.UTF_8)) + val app = new ApplicationInfo(hadoopConf, + EventLogPathProcessor.getEventLogInfo(path.toString, hadoopConf).head._1) + verify(app) + } + } + + test("operationIdToSqlIds / operationIdToJobIds are initialized empty on AppBase") { + withEventLog(logStartEvent, appStartEvent, envUpdateEvent, appEndEvent) { app => + assert(app.operationIdToSqlIds.isEmpty) + assert(app.operationIdToJobIds.isEmpty) + } + } +} \ No newline at end of file From 20e7046e99140d9096872233e527552f5d714ee5 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Tue, 21 Apr 2026 13:57:58 -0700 Subject: [PATCH 02/19] feat(connect): populate operationIdToSqlIds from SQLExecutionStart.jobTags Uses reflective accessor to stay compatible with Spark 3.2-3.4 profiles. Also tightens operationIdTo{Sql,Job}Ids value type to mutable.HashSet for consistency with neighboring collections on AppBase. Issue #2065. --- .../spark/sql/rapids/tool/AppBase.scala | 4 +- .../sql/rapids/tool/EventProcessorBase.scala | 12 +++ .../sql/rapids/tool/util/EventUtils.scala | 10 +++ .../profiling/ConnectCorrelationSuite.scala | 85 +++++++++++++++++-- 4 files changed, 104 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/sql/rapids/tool/AppBase.scala b/core/src/main/scala/org/apache/spark/sql/rapids/tool/AppBase.scala index 6e734b109..c74440dad 100644 --- a/core/src/main/scala/org/apache/spark/sql/rapids/tool/AppBase.scala +++ b/core/src/main/scala/org/apache/spark/sql/rapids/tool/AppBase.scala @@ -144,9 +144,9 @@ abstract class AppBase( // jobTag -> operationId index for correlation with SQL executions and jobs. val jobTagToConnectOpId: HashMap[String, String] = HashMap.empty // operationId -> sqlIDs discovered via SparkListenerSQLExecutionStart.jobTags. - val operationIdToSqlIds: HashMap[String, mutable.Set[Long]] = HashMap.empty + val operationIdToSqlIds: HashMap[String, HashSet[Long]] = HashMap.empty // operationId -> jobIDs discovered via SparkListenerJobStart.properties["spark.job.tags"]. 
- val operationIdToJobIds: HashMap[String, mutable.Set[Int]] = HashMap.empty + val operationIdToJobIds: HashMap[String, HashSet[Int]] = HashMap.empty def isConnectMode: Boolean = connectOperations.nonEmpty def sqlPlans: immutable.Map[Long, SparkPlanInfo] = sqlManager.getPlanInfos diff --git a/core/src/main/scala/org/apache/spark/sql/rapids/tool/EventProcessorBase.scala b/core/src/main/scala/org/apache/spark/sql/rapids/tool/EventProcessorBase.scala index def5ecfd0..2e7b3345b 100644 --- a/core/src/main/scala/org/apache/spark/sql/rapids/tool/EventProcessorBase.scala +++ b/core/src/main/scala/org/apache/spark/sql/rapids/tool/EventProcessorBase.scala @@ -16,6 +16,7 @@ package org.apache.spark.sql.rapids.tool +import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.util.control.NonFatal @@ -168,6 +169,17 @@ abstract class EventProcessorBase[T <: AppBase](app: T) extends SparkListener wi // redaction and predicate updates (gpuMode, etc.). // Last-write-wins if multiple SQL executions have different modifiedConfigs. app.mergeModifiedConfigs(modifiedConfigs) + + // Correlate Connect operations to this sqlID via jobTags (Spark 3.5+). + if (app.isConnectMode) { + EventUtils.readJobTagsFromSQLStartEvent(event).foreach { tag => + app.jobTagToConnectOpId.get(tag).foreach { opId => + app.operationIdToSqlIds + .getOrElseUpdate(opId, mutable.HashSet.empty[Long]) + .add(event.executionId) + } + } + } } def doSparkListenerSQLExecutionEnd( diff --git a/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/EventUtils.scala b/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/EventUtils.scala index 6f7eeb84f..ece78b5fe 100644 --- a/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/EventUtils.scala +++ b/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/EventUtils.scala @@ -210,6 +210,16 @@ object EventUtils extends Logging { }.toOption.flatten.getOrElse(Map.empty) } + // Reads jobTags via reflection (Spark 3.5+, introduced for Connect support). + // Returns empty set on older versions. + def readJobTagsFromSQLStartEvent( + event: SparkListenerSQLExecutionStart): Set[String] = { + Try { + Option(invokeMethodOnEvent(event, "jobTags")) + .map(_.asInstanceOf[Set[String]]) + }.toOption.flatten.getOrElse(Set.empty) + } + @throws[com.fasterxml.jackson.core.JsonParseException] private def handleEventJsonParseEx( ex: com.fasterxml.jackson.core.JsonParseException): Unit = { diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectCorrelationSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectCorrelationSuite.scala index 90e6b9dcd..0440a4013 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectCorrelationSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectCorrelationSuite.scala @@ -19,19 +19,23 @@ package com.nvidia.spark.rapids.tool.profiling import java.nio.charset.StandardCharsets import java.nio.file.{Files, Paths} +import scala.collection.mutable + import com.nvidia.spark.rapids.BaseNoSparkSuite import com.nvidia.spark.rapids.tool.EventLogPathProcessor import org.apache.spark.sql.TrampolineUtil +import org.apache.spark.sql.execution.SparkPlanInfo +import org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart import org.apache.spark.sql.rapids.tool.profiling.ApplicationInfo import org.apache.spark.sql.rapids.tool.util.RapidsToolsConfUtil /** * Tests for Spark Connect sqlID/jobID correlation indexes on AppBase. 
- * Task 1 only verifies the reverse-index HashMaps exist and are initialized - * empty on a fresh app. Later tasks populate them from - * SparkListenerSQLExecutionStart.jobTags and - * SparkListenerJobStart.properties["spark.job.tags"]. + * Task 1 verifies the reverse-index HashMaps exist and are initialized + * empty on a fresh app. Task 2 verifies operationIdToSqlIds is populated + * from SparkListenerSQLExecutionStart.jobTags. Task 3 will populate + * operationIdToJobIds from SparkListenerJobStart.properties["spark.job.tags"]. */ class ConnectCorrelationSuite extends BaseNoSparkSuite { @@ -61,10 +65,81 @@ class ConnectCorrelationSuite extends BaseNoSparkSuite { } } + /** + * True when the running Spark profile's SparkListenerSQLExecutionStart has a + * jobTags accessor (Spark 3.5+). Used to skip tests on older profiles. + */ + private def checkJobTagsAvailable(): (Boolean, String) = { + val available = try { + classOf[SparkListenerSQLExecutionStart].getMethod("jobTags") + true + } catch { + case _: NoSuchMethodException => false + } + (available, "SparkListenerSQLExecutionStart.jobTags requires Spark 3.5+") + } + + /** + * Builds a SparkListenerSQLExecutionStart with the given jobTags via reflection, + * matching the 9-arg constructor introduced in Spark 3.5. + */ + private def buildSQLStartEvent(executionId: Long, jobTags: Set[String]) + : SparkListenerSQLExecutionStart = { + val planInfo = new SparkPlanInfo( + "TestNode", "test", Nil, Map.empty[String, String], Nil) + val ctors = classOf[SparkListenerSQLExecutionStart].getConstructors + val ctor = ctors.find(_.getParameterCount == 9).getOrElse( + throw new AssertionError("Expected 9-arg SparkListenerSQLExecutionStart constructor")) + ctor.newInstance( + java.lang.Long.valueOf(executionId), + None, + "desc", + "details", + "physicalPlan", + planInfo, + java.lang.Long.valueOf(123000L), + Map.empty[String, String], + jobTags).asInstanceOf[SparkListenerSQLExecutionStart] + } + test("operationIdToSqlIds / operationIdToJobIds are initialized empty on AppBase") { withEventLog(logStartEvent, appStartEvent, envUpdateEvent, appEndEvent) { app => assert(app.operationIdToSqlIds.isEmpty) assert(app.operationIdToJobIds.isEmpty) } } -} \ No newline at end of file + + runConditionalTest( + "operationIdToSqlIds populated from SparkListenerSQLExecutionStart.jobTags", + checkJobTagsAvailable) { + withEventLog(logStartEvent, appStartEvent, envUpdateEvent, appEndEvent) { app => + // Manually seed Connect state as if a ConnectOperationStarted event had fired. + val jobTag = + "SparkConnect_OperationTag_User_alice_Session_sess-1_Operation_op-1" + app.connectOperations.put("op-1", new ConnectOperationInfo( + operationId = "op-1", + sessionId = "sess-1", + userId = "alice", + jobTag = jobTag, + statementText = "range(0, 10)", + startTime = 110000L)) + app.jobTagToConnectOpId.put(jobTag, "op-1") + assert(app.isConnectMode, "Should detect Connect mode after seeding") + + // Drive a SparkListenerSQLExecutionStart tagged with the Connect operation. + val evt = buildSQLStartEvent(executionId = 42L, jobTags = Set(jobTag)) + app.processEvent(evt) + + assert(app.operationIdToSqlIds.contains("op-1"), + "operationIdToSqlIds should contain op-1 after SQL start") + assert(app.operationIdToSqlIds("op-1").contains(42L), + "op-1 should map to executionId 42") + + // An untagged SQL execution should not map to any Connect op. 
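+      // Its jobTags set is empty, so readJobTagsFromSQLStartEvent yields no
+      // tags to look up in jobTagToConnectOpId and the index stays untouched.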
+ val untagged = buildSQLStartEvent(executionId = 43L, jobTags = Set.empty) + app.processEvent(untagged) + assert(app.operationIdToSqlIds("op-1") == mutable.HashSet(42L), + "Untagged execution should not be attributed to op-1") + } + } +} From 8c855dc1e936beb40410ba16f3818880ee8651df Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Tue, 21 Apr 2026 14:10:03 -0700 Subject: [PATCH 03/19] feat(connect): populate operationIdToJobIds from JobStart spark.job.tags Issue #2065. --- .../sql/rapids/tool/EventProcessorBase.scala | 14 ++++++ .../profiling/ConnectCorrelationSuite.scala | 47 +++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/sql/rapids/tool/EventProcessorBase.scala b/core/src/main/scala/org/apache/spark/sql/rapids/tool/EventProcessorBase.scala index 2e7b3345b..b554d184c 100644 --- a/core/src/main/scala/org/apache/spark/sql/rapids/tool/EventProcessorBase.scala +++ b/core/src/main/scala/org/apache/spark/sql/rapids/tool/EventProcessorBase.scala @@ -430,6 +430,20 @@ abstract class EventProcessorBase[T <: AppBase](app: T) extends SparkListener wi app.sqlIdToStages.getOrElseUpdate(sqlID.get, ArrayBuffer.empty) ++= event.stageIds } sqlID.foreach(app.jobIdToSqlID(event.jobId) = _) + + // Correlate Connect operations to this jobID via spark.job.tags. + if (app.isConnectMode) { + val tagStr = event.properties.getProperty("spark.job.tags") + if (tagStr != null && tagStr.nonEmpty) { + tagStr.split(",").iterator.map(_.trim).foreach { tag => + app.jobTagToConnectOpId.get(tag).foreach { opId => + app.operationIdToJobIds + .getOrElseUpdate(opId, mutable.HashSet.empty[Int]) + .add(event.jobId) + } + } + } + } } override def onJobStart(jobStart: SparkListenerJobStart): Unit = { diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectCorrelationSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectCorrelationSuite.scala index 0440a4013..84c8a84b9 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectCorrelationSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectCorrelationSuite.scala @@ -24,6 +24,7 @@ import scala.collection.mutable import com.nvidia.spark.rapids.BaseNoSparkSuite import com.nvidia.spark.rapids.tool.EventLogPathProcessor +import org.apache.spark.scheduler.SparkListenerJobStart import org.apache.spark.sql.TrampolineUtil import org.apache.spark.sql.execution.SparkPlanInfo import org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart @@ -142,4 +143,50 @@ class ConnectCorrelationSuite extends BaseNoSparkSuite { "Untagged execution should not be attributed to op-1") } } + + test("operationIdToJobIds populated from SparkListenerJobStart spark.job.tags") { + withEventLog(logStartEvent, appStartEvent, envUpdateEvent, appEndEvent) { app => + // Seed Connect state as if a ConnectOperationStarted event had fired. + val jobTag = + "SparkConnect_OperationTag_User_u_Session_s_Operation_op-2" + app.connectOperations.put("op-2", new ConnectOperationInfo( + operationId = "op-2", + sessionId = "s", + userId = "u", + jobTag = jobTag, + statementText = "range(0, 10)", + startTime = 110000L)) + app.jobTagToConnectOpId.put(jobTag, "op-2") + assert(app.isConnectMode, "Should detect Connect mode after seeding") + + // Simulate a JobStart whose spark.job.tags mixes the Connect tag with a + // user-supplied tag (e.g., from spark.addTag). 
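+      // The handler splits the comma-separated value and trims each entry, so
+      // only the SparkConnect_* tag should resolve through jobTagToConnectOpId.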
+ val props = new java.util.Properties() + props.setProperty("spark.job.tags", s"$jobTag,custom-user-tag") + val evt = SparkListenerJobStart( + jobId = 7, time = 2000L, stageInfos = Nil, properties = props) + app.processEvent(evt) + + assert(app.operationIdToJobIds("op-2") == mutable.HashSet(7), + "op-2 should map to exactly jobId 7") + assert(app.operationIdToJobIds.size == 1, + "No spurious opIds should be created from user tags") + } + } + + test("operationIdToJobIds stays empty when app is not in Connect mode") { + withEventLog(logStartEvent, appStartEvent, envUpdateEvent, appEndEvent) { app => + assert(!app.isConnectMode, "Fresh app should not be in Connect mode") + + val props = new java.util.Properties() + props.setProperty("spark.job.tags", + "SparkConnect_OperationTag_User_u_Session_s_Operation_op-x") + val evt = SparkListenerJobStart( + jobId = 8, time = 3000L, stageInfos = Nil, properties = props) + app.processEvent(evt) + + assert(app.operationIdToJobIds.isEmpty, + "Non-Connect app should not populate operationIdToJobIds") + } + } } From 1ef9ae0755acdca9d4cb06609ce19b3187c4511e Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Tue, 21 Apr 2026 14:59:43 -0700 Subject: [PATCH 04/19] feat(connect): add ConnectSession/ConnectOperation profile results Row types for connect_sessions.csv and connect_operations.csv with derived phase durations, status, sqlID/jobID joins, and statement-file metadata. Issue #2065. --- .../profiling/ConnectProfileResults.scala | 240 ++++++++++++++++++ .../rapids/tool/views/OutHeaderRegistry.scala | 12 +- .../ConnectProfileResultsSuite.scala | 186 ++++++++++++++ 3 files changed, 437 insertions(+), 1 deletion(-) create mode 100644 core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResults.scala create mode 100644 core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResultsSuite.scala diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResults.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResults.scala new file mode 100644 index 000000000..60ecec2d6 --- /dev/null +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResults.scala @@ -0,0 +1,240 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.tool.profiling + +import com.nvidia.spark.rapids.tool.views.OutHeaderRegistry + +import org.apache.spark.sql.rapids.tool.util.StringUtils + +/** + * CSV row for a Spark Connect session. Serializes the lifecycle metadata for a + * single session into the columns registered under + * `ConnectSessionProfileResult` in [[OutHeaderRegistry]]. + * + * `durationMs` is `endTime - startTime` when `endTime` is defined, else `-1` + * (matches the convention used for open/unfinished sessions in other result + * classes). 
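+ *
+ * Illustrative raw row (pre CSV-quoting), mirroring the unit tests: a
+ * session `s1` of user `u1` spanning 1000..5000 with 3 operations renders as
+ * `app-1,s1,u1,1000,5000,4000,3`.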
+ */
+case class ConnectSessionProfileResult(
+    appId: String,
+    sessionId: String,
+    userId: String,
+    startTime: Long,
+    endTime: Option[Long],
+    operationCount: Long) extends ProfileResult {
+
+  override def outputHeaders: Array[String] = {
+    OutHeaderRegistry.outputHeaders("ConnectSessionProfileResult")
+  }
+
+  override def convertToSeq(): Array[String] = {
+    Array(
+      appId,
+      sessionId,
+      userId,
+      startTime.toString,
+      endTime.map(_.toString).orNull,
+      endTime.map(e => (e - startTime).toString).getOrElse("-1"),
+      operationCount.toString)
+  }
+
+  override def convertToCSVSeq(): Array[String] = {
+    Array(
+      StringUtils.reformatCSVString(appId),
+      StringUtils.reformatCSVString(sessionId),
+      StringUtils.reformatCSVString(userId),
+      startTime.toString,
+      endTime.map(_.toString).orNull,
+      endTime.map(e => (e - startTime).toString).getOrElse("-1"),
+      operationCount.toString)
+  }
+}
+
+/**
+ * CSV row for a single Spark Connect operation. Captures the full lifecycle
+ * (start/analyze/readyForExec/finish/close/fail/cancel timestamps), the
+ * derived phase durations, status, producedRowCount, error message, and the
+ * joined sqlIDs/jobIDs. Also captures statement-file provenance for the
+ * separate `statements/<operationId>.txt` artifact.
+ *
+ * sqlIds and jobIds are serialized semicolon-separated, so each list stays in
+ * a single CSV column and the separator never needs quoting.
+ */
+case class ConnectOperationProfileResult(
+    appId: String,
+    operationId: String,
+    sessionId: String,
+    userId: String,
+    jobTag: String,
+    startTime: Long,
+    analyzeTime: Option[Long],
+    readyForExecTime: Option[Long],
+    finishTime: Option[Long],
+    closeTime: Option[Long],
+    failTime: Option[Long],
+    cancelTime: Option[Long],
+    durationMs: Long,
+    analyzePhaseMs: Long,
+    planPhaseMs: Long,
+    execPhaseMs: Long,
+    resultDeliveryPhaseMs: Long,
+    status: String,
+    producedRowCount: Option[Long],
+    errorMessage: Option[String],
+    sqlIds: Seq[Long],
+    jobIds: Seq[Int],
+    statementFile: Option[String],
+    statementBytes: Long,
+    statementTruncated: Boolean) extends ProfileResult {
+
+  override def outputHeaders: Array[String] = {
+    OutHeaderRegistry.outputHeaders("ConnectOperationProfileResult")
+  }
+
+  override def convertToSeq(): Array[String] = {
+    Array(
+      appId,
+      operationId,
+      sessionId,
+      userId,
+      jobTag,
+      startTime.toString,
+      analyzeTime.map(_.toString).orNull,
+      readyForExecTime.map(_.toString).orNull,
+      finishTime.map(_.toString).orNull,
+      closeTime.map(_.toString).orNull,
+      failTime.map(_.toString).orNull,
+      cancelTime.map(_.toString).orNull,
+      durationMs.toString,
+      analyzePhaseMs.toString,
+      planPhaseMs.toString,
+      execPhaseMs.toString,
+      resultDeliveryPhaseMs.toString,
+      status,
+      producedRowCount.map(_.toString).orNull,
+      errorMessage.getOrElse(""),
+      sqlIds.mkString(";"),
+      jobIds.mkString(";"),
+      statementFile.getOrElse(""),
+      statementBytes.toString,
+      statementTruncated.toString)
+  }
+
+  override def convertToCSVSeq(): Array[String] = {
+    Array(
+      StringUtils.reformatCSVString(appId),
+      StringUtils.reformatCSVString(operationId),
+      StringUtils.reformatCSVString(sessionId),
+      StringUtils.reformatCSVString(userId),
+      StringUtils.reformatCSVString(jobTag),
+      startTime.toString,
+      analyzeTime.map(_.toString).orNull,
+      readyForExecTime.map(_.toString).orNull,
+      finishTime.map(_.toString).orNull,
+      closeTime.map(_.toString).orNull,
+      failTime.map(_.toString).orNull,
+      cancelTime.map(_.toString).orNull,
+      durationMs.toString,
+      analyzePhaseMs.toString,
+      planPhaseMs.toString,
+      execPhaseMs.toString,
resultDeliveryPhaseMs.toString, + StringUtils.reformatCSVString(status), + producedRowCount.map(_.toString).orNull, + StringUtils.reformatCSVString(errorMessage.getOrElse("")), + StringUtils.reformatCSVString(sqlIds.mkString(";")), + StringUtils.reformatCSVString(jobIds.mkString(";")), + StringUtils.reformatCSVString(statementFile.getOrElse("")), + statementBytes.toString, + statementTruncated.toString) + } +} + +object ConnectOperationProfileResult { + + /** + * Marker substring embedded by the Spark Connect server-side abbreviator when + * a statement/plan text exceeds its configured limit. Presence of this marker + * in `statementText` indicates the artifact we persist is a truncated + * representation of the original plan. + */ + private[profiling] val TruncationMarker: String = "[truncated(size=" + + /** + * Returns `b - a` when both are defined, otherwise `-1`. Used to derive + * phase durations where an absent timestamp means the operation never + * reached that phase. + */ + private def diff(a: Option[Long], b: Option[Long]): Long = { + (a, b) match { + case (Some(av), Some(bv)) => bv - av + case _ => -1L + } + } + + /** + * Derives operation status from the observed lifecycle timestamps. + * Priority: CANCELED -> FAILED -> SUCCEEDED -> RUNNING. + * CANCELED precedes FAILED because server-side cancellation sometimes + * surfaces a trailing failure event we should not misattribute. + */ + private def deriveStatus(op: ConnectOperationInfo): String = { + if (op.cancelTime.isDefined) "CANCELED" + else if (op.failTime.isDefined) "FAILED" + else if (op.finishTime.isDefined || op.closeTime.isDefined) "SUCCEEDED" + else "RUNNING" + } + + def from( + appId: String, + op: ConnectOperationInfo, + sqlIds: Seq[Long], + jobIds: Seq[Int], + statementFile: Option[String]): ConnectOperationProfileResult = { + val endForDuration = + op.closeTime.orElse(op.finishTime).orElse(op.failTime).orElse(op.cancelTime) + val durationMs = endForDuration.map(_ - op.startTime).getOrElse(-1L) + val statementBytes = op.statementText.getBytes("UTF-8").length.toLong + val statementTruncated = op.statementText.contains(TruncationMarker) + ConnectOperationProfileResult( + appId = appId, + operationId = op.operationId, + sessionId = op.sessionId, + userId = op.userId, + jobTag = op.jobTag, + startTime = op.startTime, + analyzeTime = op.analyzeTime, + readyForExecTime = op.readyForExecTime, + finishTime = op.finishTime, + closeTime = op.closeTime, + failTime = op.failTime, + cancelTime = op.cancelTime, + durationMs = durationMs, + analyzePhaseMs = diff(Some(op.startTime), op.analyzeTime), + planPhaseMs = diff(op.analyzeTime, op.readyForExecTime), + execPhaseMs = diff(op.readyForExecTime, op.finishTime), + resultDeliveryPhaseMs = diff(op.finishTime, op.closeTime), + status = deriveStatus(op), + producedRowCount = op.producedRowCount, + errorMessage = op.errorMessage, + sqlIds = sqlIds, + jobIds = jobIds, + statementFile = statementFile, + statementBytes = statementBytes, + statementTruncated = statementTruncated) + } +} diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala index 795de4c6a..80675aa7c 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala @@ -309,6 +309,16 @@ object OutHeaderRegistry { "WriteOpProfileResult" -> Array("sqlID", "sqlPlanVersion", "nodeId", "fromFinalPlan", 
"execName", "format", "location", "tableName", "dataBase", "outputColumns", "writeMode", - "partitionColumns", "compressionOption", "fullDescription") + "partitionColumns", "compressionOption", "fullDescription"), + "ConnectSessionProfileResult" -> + Array("appID", "sessionId", "userId", "startTime", "endTime", "durationMs", + "operationCount"), + "ConnectOperationProfileResult" -> + Array("appID", "operationId", "sessionId", "userId", "jobTag", + "startTime", "analyzeTime", "readyForExecTime", "finishTime", "closeTime", + "failTime", "cancelTime", "durationMs", + "analyzePhaseMs", "planPhaseMs", "execPhaseMs", "resultDeliveryPhaseMs", + "status", "producedRowCount", "errorMessage", + "sqlIds", "jobIds", "statementFile", "statementBytes", "statementTruncated") ) // End of outputHeaders map initialization } diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResultsSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResultsSuite.scala new file mode 100644 index 000000000..15b0ec105 --- /dev/null +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResultsSuite.scala @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.tool.profiling + +import com.nvidia.spark.rapids.tool.views.OutHeaderRegistry + +import org.scalatest.funsuite.AnyFunSuite + +class ConnectProfileResultsSuite extends AnyFunSuite { + + private def operationHeaders: Array[String] = + OutHeaderRegistry.outputHeaders("ConnectOperationProfileResult") + + private def operationCol(row: ConnectOperationProfileResult, name: String): String = + row.convertToSeq()(operationHeaders.indexOf(name)) + + test("ConnectSessionProfileResult emits correct raw columns") { + val row = ConnectSessionProfileResult( + appId = "app-1", sessionId = "s1", userId = "u1", + startTime = 1000L, endTime = Some(5000L), operationCount = 3) + assert(row.convertToSeq().toSeq === + Seq("app-1", "s1", "u1", "1000", "5000", "4000", "3")) + + val open = ConnectSessionProfileResult( + appId = "app-2", sessionId = "s2", userId = "u2", + startTime = 1000L, endTime = None, operationCount = 0) + val openSeq = open.convertToSeq() + // endTime is rendered as null (orNull) when absent, matching the convention + // used by neighboring ProfileResult classes in ProfileClassWarehouse.scala. + assert(openSeq(0) == "app-2") + assert(openSeq(1) == "s2") + assert(openSeq(2) == "u2") + assert(openSeq(3) == "1000") + assert(openSeq(4) == null) + assert(openSeq(5) == "-1") + assert(openSeq(6) == "0") + } + + test("ConnectSessionProfileResult.convertToCSVSeq wraps string fields in quotes") { + val row = ConnectSessionProfileResult( + appId = "app,comma", sessionId = "s\"quote", userId = "u1", + startTime = 1000L, endTime = Some(5000L), operationCount = 3) + val csv = row.convertToCSVSeq() + // reformatCSVString escapes inner " by doubling and wraps the result in "..." 
+ assert(csv(0) == "\"app,comma\"") + assert(csv(1) == "\"s\"\"quote\"") + assert(csv(2) == "\"u1\"") + // numeric fields stay raw + assert(csv(3) == "1000") + assert(csv(4) == "5000") + assert(csv(5) == "4000") + assert(csv(6) == "3") + } + + test("OutHeaderRegistry includes ConnectSession and ConnectOperation headers") { + assert(OutHeaderRegistry.outputHeaders.contains("ConnectSessionProfileResult")) + assert(OutHeaderRegistry.outputHeaders.contains("ConnectOperationProfileResult")) + assert(OutHeaderRegistry.outputHeaders("ConnectSessionProfileResult").toSeq === + Seq("appID", "sessionId", "userId", "startTime", "endTime", "durationMs", + "operationCount")) + } + + test("row array length matches header count") { + val sess = ConnectSessionProfileResult("a", "s", "u", 0L, None, 0L) + assert(sess.convertToCSVSeq().length == sess.outputHeaders.length) + assert(sess.convertToSeq().length == sess.outputHeaders.length) + + val op = new ConnectOperationInfo("o", "s", "u", "t", "", 0L) + val opRow = ConnectOperationProfileResult.from("a", op, Seq.empty, Seq.empty, None) + assert(opRow.convertToCSVSeq().length == opRow.outputHeaders.length) + assert(opRow.convertToSeq().length == opRow.outputHeaders.length) + } + + test("ConnectOperationProfileResult derives status and phases correctly") { + val op = new ConnectOperationInfo( + operationId = "op", sessionId = "s", userId = "u", + jobTag = "tag", statementText = "SELECT 1", startTime = 100L) + op.analyzeTime = Some(200L) + op.readyForExecTime = Some(300L) + op.finishTime = Some(500L) + op.closeTime = Some(600L) + op.producedRowCount = Some(1L) + val row = ConnectOperationProfileResult.from( + appId = "app-1", op = op, sqlIds = Seq(42L), jobIds = Seq(7), + statementFile = Some("op.txt")) + assert(operationCol(row, "operationId") == "op") + assert(operationCol(row, "status") == "SUCCEEDED") + assert(operationCol(row, "durationMs") == "500") + assert(operationCol(row, "analyzePhaseMs") == "100") + assert(operationCol(row, "planPhaseMs") == "100") + assert(operationCol(row, "execPhaseMs") == "200") + assert(operationCol(row, "resultDeliveryPhaseMs") == "100") + assert(operationCol(row, "sqlIds") == "42") + assert(operationCol(row, "jobIds") == "7") + assert(operationCol(row, "statementFile") == "op.txt") + assert(operationCol(row, "statementBytes") == "8") + assert(operationCol(row, "statementTruncated") == "false") + assert(operationCol(row, "producedRowCount") == "1") + } + + test("ConnectOperationProfileResult derives FAILED status with errorMessage") { + val op = new ConnectOperationInfo( + operationId = "op-f", sessionId = "s", userId = "u", + jobTag = "tag", statementText = "bad", startTime = 100L) + op.failTime = Some(150L) + op.errorMessage = Some("boom") + val row = ConnectOperationProfileResult.from( + appId = "app", op = op, sqlIds = Seq.empty, jobIds = Seq.empty, + statementFile = None) + assert(operationCol(row, "status") == "FAILED") + assert(operationCol(row, "errorMessage") == "boom") + assert(operationCol(row, "statementFile") == "") + assert(operationCol(row, "durationMs") == "50") + assert(operationCol(row, "analyzePhaseMs") == "-1") + assert(operationCol(row, "sqlIds") == "") + // producedRowCount is an Option[Long]; absent values render as null. 
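+    // (convertToSeq maps absent Options through orNull, whereas errorMessage
+    // deliberately falls back to the empty string.)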
+ assert(operationCol(row, "producedRowCount") == null) + } + + test("ConnectOperationProfileResult derives CANCELED status takes priority over FAILED") { + val op = new ConnectOperationInfo( + operationId = "op-c", sessionId = "s", userId = "u", + jobTag = "tag", statementText = "", startTime = 100L) + op.cancelTime = Some(150L) + op.failTime = Some(155L) + val row = ConnectOperationProfileResult.from("app", op, Seq.empty, Seq.empty, None) + assert(operationCol(row, "status") == "CANCELED") + } + + test("ConnectOperationProfileResult derives RUNNING status when no terminal timestamp") { + val op = new ConnectOperationInfo( + operationId = "op-r", sessionId = "s", userId = "u", + jobTag = "tag", statementText = "", startTime = 100L) + val row = ConnectOperationProfileResult.from("app", op, Seq.empty, Seq.empty, None) + assert(operationCol(row, "status") == "RUNNING") + assert(operationCol(row, "durationMs") == "-1") + } + + test("ConnectOperationProfileResult detects truncated statementText") { + val op = new ConnectOperationInfo( + operationId = "op-t", sessionId = "s", userId = "u", + jobTag = "tag", + statementText = "plan body ... " + ConnectOperationProfileResult.TruncationMarker + + "1234)] more", + startTime = 100L) + val row = ConnectOperationProfileResult.from("app", op, Seq.empty, Seq.empty, None) + assert(operationCol(row, "statementTruncated") == "true") + } + + test("ConnectOperationProfileResult.convertToCSVSeq wraps string fields in quotes") { + val op = new ConnectOperationInfo( + operationId = "op,x", sessionId = "s\"q", userId = "u1", + jobTag = "tag", statementText = "SELECT 1", startTime = 100L) + op.errorMessage = Some("bad, msg") + val row = ConnectOperationProfileResult.from( + appId = "app-1", op = op, sqlIds = Seq(1L, 2L), jobIds = Seq(3, 4), + statementFile = Some("f,x.txt")) + val csv = row.convertToCSVSeq() + def csvCol(name: String): String = csv(operationHeaders.indexOf(name)) + assert(csvCol("appID") == "\"app-1\"") + assert(csvCol("operationId") == "\"op,x\"") + assert(csvCol("sessionId") == "\"s\"\"q\"") + assert(csvCol("userId") == "\"u1\"") + assert(csvCol("errorMessage") == "\"bad, msg\"") + assert(csvCol("sqlIds") == "\"1;2\"") + assert(csvCol("jobIds") == "\"3;4\"") + assert(csvCol("statementFile") == "\"f,x.txt\"") + // numeric/boolean fields remain raw + assert(csvCol("startTime") == "100") + assert(csvCol("statementTruncated") == "false") + } +} From 4a79192978b68c3c894cca1adcc6da3c25253cbc Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Tue, 21 Apr 2026 15:24:33 -0700 Subject: [PATCH 05/19] feat(connect): emit connect_sessions.csv and connect_operations.csv from profiler Per-application. Absent (not empty) file when the app is not in Connect mode -- matches the behavior of every other per-app table. Issue #2065. 
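Note: writeCSVTable already skips empty tables, so the isConnectMode guard
alone keeps non-Connect output free of these files.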
---
 .../rapids/tool/profiling/Profiler.scala      |  38 ++++
 .../ConnectProfilerOutputSuite.scala          | 208 ++++++++++++++++++
 2 files changed, 246 insertions(+)
 create mode 100644 core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfilerOutputSuite.scala

diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala
index 1534fbf2b..f2748c4dd 100644
--- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala
+++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala
@@ -417,6 +417,7 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea
     profileOutputWriter.writeTable(ProfRemovedBLKMgrView.getLabel, app.removedBMs)
     profileOutputWriter.writeCSVTable(ProfRemovedExecutorView.getLabel, app.removedExecutors)
     profileOutputWriter.writeCSVTable("Unsupported SQL Plan", app.unsupportedOps)
+    Profiler.writeConnectTables(profileOutputWriter, profilerResult.app)
     if (outputAlignedSQLIds) {
       profileOutputWriter.writeTable(
         ProfSQLPlanAlignedView.getLabel, app.sqlCleanedAlignedIds,
@@ -481,6 +482,43 @@ object Profiler {
   val PROFILE_LOG_NAME = "profile"
   val SUBDIR = "rapids_4_spark_profile"
 
+  /**
+   * Writes `Connect Sessions` and `Connect Operations` per-app CSV tables when
+   * the application is in Spark Connect mode. No-op otherwise: the underlying
+   * `writeCSVTable` returns early on empty input, so non-Connect apps produce
+   * no file at all (matches the behavior of every other per-app table).
+   *
+   * `statementFile` is intentionally `None` here; Task 6 of #2065 will wire the
+   * sidecar `statements/<operationId>.txt` artifact.
+   */
+  def writeConnectTables(
+      writer: ProfileOutputWriter,
+      app: ApplicationInfo): Unit = {
+    if (!app.isConnectMode) return
+    val appId = app.appId
+    val sessionRows = app.connectSessions.values.toSeq.sortBy(_.sessionId).map { s =>
+      ConnectSessionProfileResult(
+        appId = appId,
+        sessionId = s.sessionId,
+        userId = s.userId,
+        startTime = s.startTime,
+        endTime = s.endTime,
+        operationCount = app.connectOperations.values.count(_.sessionId == s.sessionId).toLong)
+    }
+    writer.writeCSVTable("Connect Sessions", sessionRows)
+    val opRows = app.connectOperations.values.toSeq.sortBy(_.operationId).map { op =>
+      ConnectOperationProfileResult.from(
+        appId = appId,
+        op = op,
+        sqlIds = app.operationIdToSqlIds.get(op.operationId)
+          .map(_.toSeq.sorted).getOrElse(Seq.empty),
+        jobIds = app.operationIdToJobIds.get(op.operationId)
+          .map(_.toSeq.sorted).getOrElse(Seq.empty),
+        statementFile = None)
+    }
+    writer.writeCSVTable("Connect Operations", opRows)
+  }
+
   def getAutoTunerResultsAsString(props: Seq[TuningEntryTrait],
       comments: Seq[RecommendedCommentResult]): String = {
     val propStr = if (props.nonEmpty) {
diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfilerOutputSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfilerOutputSuite.scala
new file mode 100644
index 000000000..356431337
--- /dev/null
+++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfilerOutputSuite.scala
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.tool.profiling + +import java.nio.charset.StandardCharsets +import java.nio.file.{Files, Path, Paths} +import java.util.Comparator + +import scala.collection.mutable +import scala.io.Source + +import com.nvidia.spark.rapids.BaseNoSparkSuite +import com.nvidia.spark.rapids.tool.EventLogPathProcessor +import com.nvidia.spark.rapids.tool.views.OutHeaderRegistry + +import org.apache.spark.sql.TrampolineUtil +import org.apache.spark.sql.rapids.tool.profiling.ApplicationInfo +import org.apache.spark.sql.rapids.tool.util.RapidsToolsConfUtil + +/** + * Tests Task 5 of Spark Connect Phase 3 (#2065): wiring + * [[ConnectSessionProfileResult]] / [[ConnectOperationProfileResult]] into the + * Profiler's per-app CSV output. Verifies that `connect_sessions.csv` and + * `connect_operations.csv` are produced in Connect mode and absent otherwise. + */ +class ConnectProfilerOutputSuite extends BaseNoSparkSuite { + + private val hadoopConf = RapidsToolsConfUtil.newHadoopConf() + + private val logStartEvent = + """{"Event":"SparkListenerLogStart","Spark Version":"3.5.0"}""" + private val appStartEvent = + """{"Event":"SparkListenerApplicationStart","App Name":"ConnectOutputTest",""" + + """"App ID":"local-connect-output","Timestamp":100000,"User":"testUser"}""" + private val envUpdateEvent = + """{"Event":"SparkListenerEnvironmentUpdate","JVM Information":{},""" + + """"Spark Properties":{"spark.master":"local[*]"},""" + + """"Hadoop Properties":{},"System Properties":{"file.encoding":"UTF-8"},""" + + """"Classpath Entries":{}}""" + private val appEndEvent = + """{"Event":"SparkListenerApplicationEnd","Timestamp":200000}""" + + private def withEventLog(events: String*)(verify: ApplicationInfo => Unit): Unit = { + val content = events.mkString("\n") + TrampolineUtil.withTempDir { tempDir => + val path = Paths.get(tempDir.getAbsolutePath, "test_eventlog") + Files.write(path, content.getBytes(StandardCharsets.UTF_8)) + val app = new ApplicationInfo(hadoopConf, + EventLogPathProcessor.getEventLogInfo(path.toString, hadoopConf).head._1) + verify(app) + } + } + + private def deleteRecursively(root: Path): Unit = { + if (Files.exists(root)) { + val stream = Files.walk(root) + try { + stream.sorted(Comparator.reverseOrder[Path]()) + .forEach(p => Files.deleteIfExists(p)) + } finally { + stream.close() + } + } + } + + private def readAllLines(path: Path): Seq[String] = { + val src = Source.fromFile(path.toFile, StandardCharsets.UTF_8.name()) + try { + src.getLines().toList + } finally { + src.close() + } + } + + test("writeConnectTables emits connect_sessions.csv and connect_operations.csv " + + "when isConnectMode") { + withEventLog(logStartEvent, appStartEvent, envUpdateEvent, appEndEvent) { app => + // Seed two sessions and two operations (one SUCCEEDED, one FAILED) plus + // sqlID / jobID correlations, so both result types exercise their + // convertToCSVSeq paths. 
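+      // (Seeding replaces the Connect listener events that would normally
+      // populate this state, keeping the fixture independent of the Spark
+      // 3.5+ Connect event classes.)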
+ app.connectSessions.put("sess-1", new ConnectSessionInfo( + sessionId = "sess-1", + userId = "alice", + startTime = 100L, + endTime = Some(500L))) + + val op1 = new ConnectOperationInfo( + operationId = "op-1", + sessionId = "sess-1", + userId = "alice", + jobTag = "SparkConnect_OperationTag_User_alice_Session_sess-1_Operation_op-1", + statementText = "range(0, 10)", + startTime = 110L, + analyzeTime = Some(120L), + readyForExecTime = Some(130L), + finishTime = Some(150L), + closeTime = Some(160L), + producedRowCount = Some(10L)) + app.connectOperations.put("op-1", op1) + app.operationIdToSqlIds.put("op-1", mutable.HashSet(42L)) + app.operationIdToJobIds.put("op-1", mutable.HashSet(7)) + + val op2 = new ConnectOperationInfo( + operationId = "op-2", + sessionId = "sess-1", + userId = "alice", + jobTag = "SparkConnect_OperationTag_User_alice_Session_sess-1_Operation_op-2", + statementText = "range(0, 5)", + startTime = 200L, + failTime = Some(260L), + errorMessage = Some("boom")) + app.connectOperations.put("op-2", op2) + + assert(app.isConnectMode, "Seeded app should report Connect mode") + + val tmpDir = Files.createTempDirectory("prof-connect-out-").toFile + try { + val writer = new ProfileOutputWriter(tmpDir.getAbsolutePath, "profile", + numOutputRows = 1000, outputCSV = true) + try { + Profiler.writeConnectTables(writer, app) + } finally { + writer.close() + } + + val sessionsCsv = Paths.get(tmpDir.getAbsolutePath, "connect_sessions.csv") + val operationsCsv = Paths.get(tmpDir.getAbsolutePath, "connect_operations.csv") + assert(Files.exists(sessionsCsv), s"expected $sessionsCsv to exist") + assert(Files.exists(operationsCsv), s"expected $operationsCsv to exist") + + val sessionLines = readAllLines(sessionsCsv) + // Header + 1 session row + assert(sessionLines.size == 2, s"unexpected session rows: $sessionLines") + assert(sessionLines.head == + "appID,sessionId,userId,startTime,endTime,durationMs,operationCount", + s"unexpected session header: ${sessionLines.head}") + + val opLines = readAllLines(operationsCsv) + // Header + 2 op rows + assert(opLines.size == 3, s"unexpected operation rows: $opLines") + assert(opLines.head == + "appID,operationId,sessionId,userId,jobTag,startTime,analyzeTime," + + "readyForExecTime,finishTime,closeTime,failTime,cancelTime,durationMs," + + "analyzePhaseMs,planPhaseMs,execPhaseMs,resultDeliveryPhaseMs,status," + + "producedRowCount,errorMessage,sqlIds,jobIds,statementFile,statementBytes," + + "statementTruncated", + s"unexpected operation header: ${opLines.head}") + // Per-column parse: find the `status` column index from the registry and + // assert exactly one SUCCEEDED and one FAILED row. Rows in this test case + // contain no embedded commas, so simple string split is sufficient. + // CSV string columns are wrapped in double quotes by reformatCSVString; + // strip surrounding quotes before comparison. 
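+        // (With the registry order above, "status" resolves to index 17.)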
+        val opHeaders = OutHeaderRegistry.outputHeaders("ConnectOperationProfileResult")
+        val statusIdx = opHeaders.indexOf("status")
+        assert(statusIdx >= 0, s"status column missing from registry headers: ${
+          opHeaders.mkString(",")}")
+        val statusValues = opLines.tail.map(_.split(",", -1)(statusIdx).stripPrefix("\"")
+          .stripSuffix("\""))
+        assert(statusValues.count(_ == "SUCCEEDED") == 1,
+          s"expected exactly one SUCCEEDED row: $statusValues")
+        assert(statusValues.count(_ == "FAILED") == 1,
+          s"expected exactly one FAILED row: $statusValues")
+      } finally {
+        deleteRecursively(tmpDir.toPath)
+      }
+    }
+  }
+
+  test("writeConnectTables writes no files when app is not in Connect mode") {
+    withEventLog(logStartEvent, appStartEvent, envUpdateEvent, appEndEvent) { app =>
+      assert(!app.isConnectMode, "Fresh app should not be in Connect mode")
+
+      val tmpDir = Files.createTempDirectory("prof-connect-out-").toFile
+      try {
+        val writer = new ProfileOutputWriter(tmpDir.getAbsolutePath, "profile",
+          numOutputRows = 1000, outputCSV = true)
+        try {
+          Profiler.writeConnectTables(writer, app)
+        } finally {
+          writer.close()
+        }
+
+        val sessionsCsv = Paths.get(tmpDir.getAbsolutePath, "connect_sessions.csv")
+        val operationsCsv = Paths.get(tmpDir.getAbsolutePath, "connect_operations.csv")
+        assert(!Files.exists(sessionsCsv),
+          s"expected no $sessionsCsv for non-Connect app")
+        assert(!Files.exists(operationsCsv),
+          s"expected no $operationsCsv for non-Connect app")
+      } finally {
+        deleteRecursively(tmpDir.toPath)
+      }
+    }
+  }
+}

From 227c206c14e66cd71a8821b70ccf245cfa5eba53 Mon Sep 17 00:00:00 2001
From: Sayed Bilal Bari
Date: Tue, 21 Apr 2026 16:27:12 -0700
Subject: [PATCH 06/19] feat(connect): write statementText sidecar files per Connect operation

Keeps large protobuf-text plans out of connect_operations.csv. Files land
under connect_statements/<operationId>.txt, with the basename referenced in
the statementFile column of connect_operations.csv. The directory is not
created when there are no operations with non-empty statementText.

Issue #2065.

Co-Authored-By: Claude Opus 4.7
---
 .../profiling/ConnectStatementWriter.scala    |  78 +++++++++++
 .../tool/profiling/ProfileOutputWriter.scala  |   2 +-
 .../rapids/tool/profiling/Profiler.scala      |  10 +-
 .../ConnectProfilerOutputSuite.scala          |  37 +++++-
 .../ConnectStatementWriterSuite.scala         | 124 ++++++++++++++++++
 5 files changed, 245 insertions(+), 6 deletions(-)
 create mode 100644 core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriter.scala
 create mode 100644 core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriterSuite.scala

diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriter.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriter.scala
new file mode 100644
index 000000000..a201fb282
--- /dev/null
+++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriter.scala
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.tool.profiling
+
+import java.nio.charset.StandardCharsets
+import java.nio.file.{Files, Paths}
+
+import org.apache.spark.internal.Logging
+
+/**
+ * Writes per-operation Spark Connect `statementText` payloads to sidecar files
+ * so large protobuf-text plans do not inflate the `connect_operations.csv`
+ * table. Files are written under
+ * `<rootDir>/connect_statements/<operationId>.txt` and the returned map
+ * records the basenames for inclusion in the operation CSV
+ * `statementFile` column.
+ *
+ * Operations with empty `statementText` are skipped entirely (no file, no map
+ * entry). The `connect_statements` subdirectory is created lazily on the first
+ * non-empty statement, so apps with no statements at all do not produce an
+ * empty directory. Per-file IO errors are logged and skipped; they do not
+ * abort the batch.
+ */
+object ConnectStatementWriter extends Logging {
+
+  val SUB_DIR: String = "connect_statements"
+  val FILE_EXTENSION: String = ".txt"
+
+  /**
+   * Writes each operation's `statementText` to
+   * `<rootDir>/connect_statements/<operationId>.txt` when non-empty.
+   *
+   * @param rootDir per-app output directory (already exists)
+   * @param ops operations to persist
+   * @return map of `operationId -> "<operationId>.txt"` basenames for the
+   *         operations whose sidecar file was written successfully.
+   */
+  def writeStatementFiles(
+      rootDir: String,
+      ops: Iterable[ConnectOperationInfo]): Map[String, String] = {
+    val subDirPath = Paths.get(rootDir, SUB_DIR)
+    var subDirCreated = false
+    val builder = Map.newBuilder[String, String]
+    ops.foreach { op =>
+      val text = op.statementText
+      if (text.nonEmpty) {
+        try {
+          if (!subDirCreated) {
+            Files.createDirectories(subDirPath)
+            subDirCreated = true
+          }
+          val basename = s"${op.operationId}$FILE_EXTENSION"
+          val target = subDirPath.resolve(basename)
+          Files.write(target, text.getBytes(StandardCharsets.UTF_8))
+          builder += (op.operationId -> basename)
+        } catch {
+          case e: Exception =>
+            logWarning(s"Failed to write Connect statement sidecar for operation " +
+              s"${op.operationId} under $subDirPath", e)
+        }
+      }
+    }
+    builder.result()
+  }
+}
\ No newline at end of file
diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileOutputWriter.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileOutputWriter.scala
index 1bb6f6dc3..4b2046b75 100644
--- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileOutputWriter.scala
+++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileOutputWriter.scala
@@ -26,7 +26,7 @@ import org.json4s.jackson.Serialization
  * In case the outputCSV is set to true, it will write each table to a
  * separate CSV file.
  */
-class ProfileOutputWriter(outputDir: String, filePrefix: String, numOutputRows: Int,
+class ProfileOutputWriter(val outputDir: String, filePrefix: String, numOutputRows: Int,
     outputCSV: Boolean = false) {
 
   implicit val formats: DefaultFormats.type = DefaultFormats
diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala
index f2748c4dd..8e7f6dbae 100644
--- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala
+++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala
@@ -488,8 +488,9 @@ object Profiler {
    * `writeCSVTable` returns early on empty input, so non-Connect apps produce
    * no file at all (matches the behavior of every other per-app table).
    *
-   * `statementFile` is intentionally `None` here; Task 6 of #2065 will wire the
-   * sidecar `statements/<operationId>.txt` artifact.
+   * Each operation's `statementText` is written to a sidecar file under
+   * `<outputDir>/connect_statements/<operationId>.txt` and the basename is
+   * recorded in the `statementFile` column of `connect_operations.csv`.
    */
   def writeConnectTables(
       writer: ProfileOutputWriter,
@@ -506,6 +507,9 @@ object Profiler {
       operationCount = app.connectOperations.values.count(_.sessionId == s.sessionId).toLong)
     }
     writer.writeCSVTable("Connect Sessions", sessionRows)
+    val statementFiles: Map[String, String] =
+      ConnectStatementWriter.writeStatementFiles(
+        writer.outputDir, app.connectOperations.values)
     val opRows = app.connectOperations.values.toSeq.sortBy(_.operationId).map { op =>
       ConnectOperationProfileResult.from(
         appId = appId,
@@ -514,7 +518,7 @@ object Profiler {
           .map(_.toSeq.sorted).getOrElse(Seq.empty),
         jobIds = app.operationIdToJobIds.get(op.operationId)
           .map(_.toSeq.sorted).getOrElse(Seq.empty),
-        statementFile = None)
+        statementFile = statementFiles.get(op.operationId))
     }
     writer.writeCSVTable("Connect Operations", opRows)
   }
diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfilerOutputSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfilerOutputSuite.scala
index 356431337..f268786a2 100644
--- a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfilerOutputSuite.scala
+++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfilerOutputSuite.scala
@@ -98,12 +98,13 @@ class ConnectProfilerOutputSuite extends BaseNoSparkSuite {
         startTime = 100L,
         endTime = Some(500L)))
 
+      val op1StatementText = "SELECT 1 plan body"
       val op1 = new ConnectOperationInfo(
         operationId = "op-1",
         sessionId = "sess-1",
         userId = "alice",
         jobTag = "SparkConnect_OperationTag_User_alice_Session_sess-1_Operation_op-1",
-        statementText = "range(0, 10)",
+        statementText = op1StatementText,
         startTime = 110L,
         analyzeTime = Some(120L),
         readyForExecTime = Some(130L),
@@ -119,7 +120,7 @@ class ConnectProfilerOutputSuite extends BaseNoSparkSuite {
         sessionId = "sess-1",
         userId = "alice",
         jobTag = "SparkConnect_OperationTag_User_alice_Session_sess-1_Operation_op-2",
-        statementText = "range(0, 5)",
+        statementText = "",
         startTime = 200L,
         failTime = Some(260L),
         errorMessage = Some("boom"))
@@ -174,6 +175,38 @@ class ConnectProfilerOutputSuite extends BaseNoSparkSuite {
           s"expected exactly one SUCCEEDED row: $statusValues")
         assert(statusValues.count(_ == "FAILED") == 1,
           s"expected exactly one FAILED row: $statusValues")
+
+        // Task 6: verify sidecar statement files and the statementFile column.
+        // op-1 had non-empty statementText -> sidecar exists and basename appears
+        // in the CSV. op-2 had empty statementText -> no sidecar and empty cell.
+        val statementsDir = Paths.get(tmpDir.getAbsolutePath,
+          ConnectStatementWriter.SUB_DIR)
+        assert(Files.isDirectory(statementsDir),
+          s"expected $statementsDir directory for op-1 sidecar")
+        val op1Sidecar = statementsDir.resolve("op-1.txt")
+        val op2Sidecar = statementsDir.resolve("op-2.txt")
+        assert(Files.exists(op1Sidecar), s"expected $op1Sidecar to exist")
+        assert(!Files.exists(op2Sidecar),
+          s"expected $op2Sidecar not to exist for empty statementText")
+        val op1Contents = new String(Files.readAllBytes(op1Sidecar),
+          StandardCharsets.UTF_8)
+        assert(op1Contents == op1StatementText,
+          s"sidecar contents mismatch: $op1Contents vs $op1StatementText")
+        val opIdIdx = opHeaders.indexOf("operationId")
+        val statementFileIdx = opHeaders.indexOf("statementFile")
+        assert(statementFileIdx >= 0,
+          s"statementFile column missing from registry headers: ${
+            opHeaders.mkString(",")}")
+        val stmtFileByOp = opLines.tail.map { line =>
+          val cols = line.split(",", -1)
+          val opId = cols(opIdIdx).stripPrefix("\"").stripSuffix("\"")
+          val stmtFile = cols(statementFileIdx).stripPrefix("\"").stripSuffix("\"")
+          opId -> stmtFile
+        }.toMap
+        assert(stmtFileByOp("op-1") == "op-1.txt",
+          s"expected op-1 statementFile=op-1.txt, got ${stmtFileByOp("op-1")}")
+        assert(stmtFileByOp("op-2") == "",
+          s"expected op-2 statementFile empty, got ${stmtFileByOp("op-2")}")
       } finally {
         deleteRecursively(tmpDir.toPath)
       }
diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriterSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriterSuite.scala
new file mode 100644
index 000000000..e472e89fc
--- /dev/null
+++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriterSuite.scala
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.tool.profiling
+
+import java.nio.charset.StandardCharsets
+import java.nio.file.{Files, Path, Paths}
+import java.util.Comparator
+
+import org.scalatest.funsuite.AnyFunSuite
+
+/**
+ * Tests Task 6 of Spark Connect Phase 3 (#2065): writing each operation's
+ * `statementText` to a sidecar file under
+ * `<rootDir>/connect_statements/<operationId>.txt` and returning
+ * the basename map used to populate the `statementFile` column in
+ * `connect_operations.csv`.
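+ *
+ * Also exercises the lazy creation of the `connect_statements` directory and
+ * UTF-8 roundtripping of multi-byte statement text.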
+ */ +class ConnectStatementWriterSuite extends AnyFunSuite { + + private def deleteRecursively(root: Path): Unit = { + if (Files.exists(root)) { + val stream = Files.walk(root) + try { + stream.sorted(Comparator.reverseOrder[Path]()) + .forEach(p => Files.deleteIfExists(p)) + } finally { + stream.close() + } + } + } + + private def makeOp(opId: String, stmt: String): ConnectOperationInfo = { + new ConnectOperationInfo( + operationId = opId, + sessionId = "sess-1", + userId = "alice", + jobTag = s"SparkConnect_OperationTag_User_alice_Session_sess-1_Operation_$opId", + statementText = stmt, + startTime = 100L) + } + + test("writes non-empty statementText to sidecar file and returns basename") { + val tmpDir = Files.createTempDirectory("connect-stmt-writer-") + try { + val op = makeOp("op-1", "range(0, 10)") + val result = ConnectStatementWriter.writeStatementFiles( + tmpDir.toString, Seq(op)) + assert(result == Map("op-1" -> "op-1.txt"), + s"expected map op-1 -> op-1.txt, got $result") + val expectedPath = tmpDir.resolve(ConnectStatementWriter.SUB_DIR).resolve("op-1.txt") + assert(Files.exists(expectedPath), s"expected sidecar at $expectedPath") + val written = new String(Files.readAllBytes(expectedPath), StandardCharsets.UTF_8) + assert(written == "range(0, 10)", s"unexpected content: $written") + } finally { + deleteRecursively(tmpDir) + } + } + + test("empty statementText is skipped and not in returned map") { + val tmpDir = Files.createTempDirectory("connect-stmt-writer-") + try { + val op1 = makeOp("op-1", "plan body") + val op2 = makeOp("op-2", "") + val result = ConnectStatementWriter.writeStatementFiles( + tmpDir.toString, Seq(op1, op2)) + assert(result.keySet == Set("op-1"), + s"expected only op-1 in map, got ${result.keySet}") + assert(result("op-1") == "op-1.txt") + val op2Path = tmpDir.resolve(ConnectStatementWriter.SUB_DIR).resolve("op-2.txt") + assert(!Files.exists(op2Path), s"op-2 sidecar should not exist: $op2Path") + } finally { + deleteRecursively(tmpDir) + } + } + + test("does not create connect_statements dir when all statementText empty") { + val tmpDir = Files.createTempDirectory("connect-stmt-writer-") + try { + val op1 = makeOp("op-1", "") + val op2 = makeOp("op-2", "") + val result = ConnectStatementWriter.writeStatementFiles( + tmpDir.toString, Seq(op1, op2)) + assert(result.isEmpty, s"expected empty map, got $result") + val subDir = tmpDir.resolve(ConnectStatementWriter.SUB_DIR) + assert(!Files.exists(subDir), + s"sidecar directory should not have been created: $subDir") + } finally { + deleteRecursively(tmpDir) + } + } + + test("Unicode / multi-byte statementText roundtrips through UTF-8") { + val tmpDir = Files.createTempDirectory("connect-stmt-writer-") + try { + val unicode = "SELECT 'λ', '漢字', '🚀' FROM t" + val op = makeOp("op-1", unicode) + val result = ConnectStatementWriter.writeStatementFiles( + tmpDir.toString, Seq(op)) + assert(result == Map("op-1" -> "op-1.txt")) + val path = Paths.get(tmpDir.toString, ConnectStatementWriter.SUB_DIR, "op-1.txt") + val bytes = Files.readAllBytes(path) + assert(bytes.sameElements(unicode.getBytes(StandardCharsets.UTF_8)), + "bytes on disk should match UTF-8 encoded original") + val roundtrip = new String(bytes, StandardCharsets.UTF_8) + assert(roundtrip == unicode, s"roundtrip mismatch: $roundtrip vs $unicode") + } finally { + deleteRecursively(tmpDir) + } + } +} From a68c9ce87474b1bbf4bd352dae81013f53eb4ef8 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Tue, 21 Apr 2026 17:20:48 -0700 Subject: [PATCH 07/19] 
feat(connect): emit connect CSVs and statement sidecars from qualification Signed-off-by: Sayed Bilal Bari --- .../rapids/tool/profiling/Profiler.scala | 2 +- .../tool/views/QualRawReportGenerator.scala | 3 +- .../QualificationConnectOutputSuite.scala | 155 ++++++++++++++++++ 3 files changed, 158 insertions(+), 2 deletions(-) create mode 100644 core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationConnectOutputSuite.scala diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala index 8e7f6dbae..aa976e53f 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala @@ -494,7 +494,7 @@ object Profiler { */ def writeConnectTables( writer: ProfileOutputWriter, - app: ApplicationInfo): Unit = { + app: AppBase): Unit = { if (!app.isConnectMode) return val appId = app.appId val sessionRows = app.connectSessions.values.toSeq.sortBy(_.sessionId).map { s => diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala index 604fbfa7b..df359f19b 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala @@ -17,7 +17,7 @@ package com.nvidia.spark.rapids.tool.views import com.nvidia.spark.rapids.tool.analysis.{AggRawMetricsResult, AppSQLPlanAnalyzer, QualSparkMetricsAggregator} -import com.nvidia.spark.rapids.tool.profiling.{DataSourceProfileResult, ProfileOutputWriter, ProfileResult, SQLAccumProfileResults} +import com.nvidia.spark.rapids.tool.profiling.{DataSourceProfileResult, ProfileOutputWriter, ProfileResult, Profiler, SQLAccumProfileResults} import org.apache.spark.internal.Logging import org.apache.spark.sql.rapids.tool.qualification.QualificationAppInfo @@ -113,6 +113,7 @@ object QualRawReportGenerator extends Logging { QualRemovedExecutorView.getLabel, QualRemovedExecutorView.getRawView(Seq(app))) // we only need to write the CSV report of the WriteOps pWriter.writeCSVTable(QualWriteOpsView.getLabel, QualWriteOpsView.getRawView(Seq(app))) + Profiler.writeConnectTables(pWriter, app) } catch { case e: Exception => logError(s"Error generating raw metrics for ${app.appId}: ${e.getMessage}") diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationConnectOutputSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationConnectOutputSuite.scala new file mode 100644 index 000000000..732c0f0f9 --- /dev/null +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationConnectOutputSuite.scala @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package com.nvidia.spark.rapids.tool.qualification
+
+import java.nio.charset.StandardCharsets
+import java.nio.file.{Files, Path, Paths}
+import java.util.Comparator
+
+import scala.collection.mutable
+import scala.io.Source
+
+import com.nvidia.spark.rapids.BaseNoSparkSuite
+import com.nvidia.spark.rapids.tool.profiling.{ConnectOperationInfo, ConnectSessionInfo}
+import com.nvidia.spark.rapids.tool.views.QualRawReportGenerator
+
+import org.apache.spark.sql.TrampolineUtil
+import org.apache.spark.sql.rapids.tool.qualification.QualificationAppInfo
+
+/**
+ * Verifies that the qualification raw-metrics writer emits the same Spark
+ * Connect tables and statement sidecars as profiling, but under
+ * `raw_metrics/<appId>/`.
+ */
+class QualificationConnectOutputSuite extends BaseNoSparkSuite {
+
+  private val logStartEvent =
+    """{"Event":"SparkListenerLogStart","Spark Version":"3.5.0"}"""
+  private val appStartEvent =
+    """{"Event":"SparkListenerApplicationStart","App Name":"QualConnectOutputTest",""" +
+      """"App ID":"local-qual-connect-output","Timestamp":100000,"User":"testUser"}"""
+  private val envUpdateEvent =
+    """{"Event":"SparkListenerEnvironmentUpdate","JVM Information":{},""" +
+      """"Spark Properties":{"spark.master":"local[*]"},""" +
+      """"Hadoop Properties":{},"System Properties":{"file.encoding":"UTF-8"},""" +
+      """"Classpath Entries":{}}"""
+  private val appEndEvent =
+    """{"Event":"SparkListenerApplicationEnd","Timestamp":200000}"""
+
+  private def withQualificationApp(events: String*)(verify: QualificationAppInfo => Unit): Unit = {
+    val content = events.mkString("\n")
+    TrampolineUtil.withTempDir { tempDir =>
+      val path = Paths.get(tempDir.getAbsolutePath, "test_eventlog")
+      Files.write(path, content.getBytes(StandardCharsets.UTF_8))
+      val app = createAppFromEventlog(path.toString)
+      verify(app)
+    }
+  }
+
+  private def deleteRecursively(root: Path): Unit = {
+    if (Files.exists(root)) {
+      val stream = Files.walk(root)
+      try {
+        stream.sorted(Comparator.reverseOrder[Path]())
+          .forEach(p => Files.deleteIfExists(p))
+      } finally {
+        stream.close()
+      }
+    }
+  }
+
+  private def readAllLines(path: Path): Seq[String] = {
+    val src = Source.fromFile(path.toFile, StandardCharsets.UTF_8.name())
+    try {
+      src.getLines().toList
+    } finally {
+      src.close()
+    }
+  }
+
+  test("qualification raw metrics emit connect CSVs and statement sidecars") {
+    withQualificationApp(logStartEvent, appStartEvent, envUpdateEvent, appEndEvent) { app =>
+      app.connectSessions.put("sess-1", new ConnectSessionInfo(
+        sessionId = "sess-1",
+        userId = "alice",
+        startTime = 100L,
+        endTime = Some(500L)))
+
+      val op1StatementText = "SELECT 1 plan body"
+      app.connectOperations.put("op-1", new ConnectOperationInfo(
+        operationId = "op-1",
+        sessionId = "sess-1",
+        userId = "alice",
+        jobTag = "SparkConnect_OperationTag_User_alice_Session_sess-1_Operation_op-1",
+        statementText = op1StatementText,
+        startTime = 110L,
+        analyzeTime = Some(120L),
+        readyForExecTime = Some(130L),
+        finishTime = Some(150L),
+        closeTime = Some(160L),
+        producedRowCount = Some(10L)))
+      app.operationIdToSqlIds.put("op-1", mutable.HashSet(42L))
+      app.operationIdToJobIds.put("op-1", mutable.HashSet(7))
+
+      app.connectOperations.put("op-2", new ConnectOperationInfo(
+        operationId = "op-2",
+        sessionId = "sess-1",
+        userId = "alice",
+        jobTag = "SparkConnect_OperationTag_User_alice_Session_sess-1_Operation_op-2",
+        statementText = "",
+        startTime = 200L,
+        failTime = Some(260L),
+        errorMessage = Some("boom")))
+
+      val tmpDir =
Files.createTempDirectory("qual-connect-out-") + try { + QualRawReportGenerator.generateRawMetricQualViewAndGetDataSourceInfo(tmpDir.toString, app) + + val appDir = tmpDir.resolve("raw_metrics").resolve(app.appId) + val sessionsCsv = appDir.resolve("connect_sessions.csv") + val operationsCsv = appDir.resolve("connect_operations.csv") + assert(Files.exists(sessionsCsv), s"expected $sessionsCsv to exist") + assert(Files.exists(operationsCsv), s"expected $operationsCsv to exist") + + val sessionLines = readAllLines(sessionsCsv) + assert(sessionLines.size == 2, s"unexpected session rows: $sessionLines") + assert(sessionLines.head == + "appID,sessionId,userId,startTime,endTime,durationMs,operationCount", + s"unexpected session header: ${sessionLines.head}") + + val opLines = readAllLines(operationsCsv) + assert(opLines.size == 3, s"unexpected operation rows: $opLines") + assert(opLines.head.contains("statementFile"), + s"connect_operations header should include statementFile: ${opLines.head}") + + val statementsDir = appDir.resolve("connect_statements") + assert(Files.isDirectory(statementsDir), + s"expected $statementsDir directory for op-1 sidecar") + val op1Sidecar = statementsDir.resolve("op-1.txt") + val op2Sidecar = statementsDir.resolve("op-2.txt") + assert(Files.exists(op1Sidecar), s"expected $op1Sidecar to exist") + assert(!Files.exists(op2Sidecar), + s"expected $op2Sidecar not to exist for empty statementText") + val op1Contents = new String(Files.readAllBytes(op1Sidecar), StandardCharsets.UTF_8) + assert(op1Contents == op1StatementText, + s"sidecar contents mismatch: $op1Contents vs $op1StatementText") + } finally { + deleteRecursively(tmpDir) + } + } + } +} From 6457e395e752f04ae6ec6508a73b30e1b9e391c8 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Tue, 21 Apr 2026 17:24:02 -0700 Subject: [PATCH 08/19] feat(connect): register connectReport in qual and prof YAML catalogs Signed-off-by: Sayed Bilal Bari --- .../configs/reports/connectReport.yaml | 142 ++++++++++++++++++ .../configs/reports/profCoreReport.yaml | 1 + .../configs/reports/qualCoreReport.yaml | 2 + .../api_v1/report_loader.py | 3 +- user_tools/src/spark_rapids_tools/enums.py | 2 + .../api/test_connect_report_loader.py | 75 +++++++++ 6 files changed, 224 insertions(+), 1 deletion(-) create mode 100644 core/src/main/resources/configs/reports/connectReport.yaml create mode 100644 user_tools/tests/spark_rapids_tools_ut/api/test_connect_report_loader.py diff --git a/core/src/main/resources/configs/reports/connectReport.yaml b/core/src/main/resources/configs/reports/connectReport.yaml new file mode 100644 index 000000000..7a2e12c50 --- /dev/null +++ b/core/src/main/resources/configs/reports/connectReport.yaml @@ -0,0 +1,142 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Spark Connect session / operation report written per application by both the +# profiling and qualification tools. 
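+#
+# For orientation, the per-application layout these tables describe (angle
+# brackets mark placeholders, not literal names; the qualification tool nests
+# this layout under its raw_metrics/ directory):
+#   <outputDir>/<appId>/connect_sessions.csv
+#   <outputDir>/<appId>/connect_operations.csv
+#   <outputDir>/<appId>/connect_statements/<operationId>.txt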
+ +reportDefinitions: + - reportId: connectReport + description: >- + Per-application Spark Connect session and operation metadata plus a + directory of per-operation protobuf-text statement payloads. + scope: per-app + tableDefinitions: + - label: connectSessions + description: >- + One row per Spark Connect session observed in the event log. + fileName: connect_sessions.csv + scope: per-app + columns: + - name: appID + dataType: String + description: Application ID. + - name: sessionId + dataType: String + description: UUID of the Spark Connect session. + - name: userId + dataType: String + description: User who owns the session. + - name: startTime + dataType: Long + description: Epoch millis when the session started. + - name: endTime + dataType: Long + description: Epoch millis when the session closed, if observed. + - name: durationMs + dataType: Long + description: Session duration in milliseconds, or -1 if still open. + - name: operationCount + dataType: Long + description: Number of operations observed in the session. + - label: connectOperations + description: >- + One row per Spark Connect operation with lifecycle timestamps, + derived phase durations, linked SQL/job identifiers, and statement + sidecar metadata. + fileName: connect_operations.csv + scope: per-app + columns: + - name: appID + dataType: String + description: Application ID. + - name: operationId + dataType: String + description: UUID of the Spark Connect operation. + - name: sessionId + dataType: String + description: Session UUID that owns the operation. + - name: userId + dataType: String + description: User who submitted the operation. + - name: jobTag + dataType: String + description: Correlation tag shared with SQLExecutionStart and JobStart events. + - name: startTime + dataType: Long + description: Epoch millis when the operation started. + - name: analyzeTime + dataType: Long + description: Epoch millis when analysis completed, if observed. + - name: readyForExecTime + dataType: Long + description: Epoch millis when planning completed, if observed. + - name: finishTime + dataType: Long + description: Epoch millis when execution completed, if observed. + - name: closeTime + dataType: Long + description: Epoch millis when the operation closed, if observed. + - name: failTime + dataType: Long + description: Epoch millis when the operation failed, if observed. + - name: cancelTime + dataType: Long + description: Epoch millis when the operation was canceled, if observed. + - name: durationMs + dataType: Long + description: Derived end-to-end operation duration in milliseconds. + - name: analyzePhaseMs + dataType: Long + description: Time between start and analyze events, or -1. + - name: planPhaseMs + dataType: Long + description: Time between analyze and ready-for-execution events, or -1. + - name: execPhaseMs + dataType: Long + description: Time between ready-for-execution and finish events, or -1. + - name: resultDeliveryPhaseMs + dataType: Long + description: Time between finish and close events, or -1. + - name: status + dataType: String + description: Derived operation status (RUNNING, SUCCEEDED, FAILED, CANCELED). + - name: producedRowCount + dataType: Long + description: Row count reported by the finish event, if present. + - name: errorMessage + dataType: String + description: Error message reported by a failure event, if present. + - name: sqlIds + dataType: String + description: Semicolon-separated SQL execution IDs correlated to the operation. 
+          - name: jobIds
+            dataType: String
+            description: Semicolon-separated Spark job IDs correlated to the operation.
+          - name: statementFile
+            dataType: String
+            description: Sidecar filename under connect_statements/ for this operation, if written.
+          - name: statementBytes
+            dataType: Long
+            description: UTF-8 byte length of statementText.
+          - name: statementTruncated
+            dataType: Boolean
+            description: True when statementText includes Spark's truncation marker.
+      - label: connectStatements
+        description: >-
+          Directory of per-operation statementText sidecars. Each
+          <operationId>.txt file contains the protobuf debug-format text of the
+          Connect operation statement.
+        fileName: connect_statements
+        fileFormat: DIRECTORY
+        scope: per-app
diff --git a/core/src/main/resources/configs/reports/profCoreReport.yaml b/core/src/main/resources/configs/reports/profCoreReport.yaml
index c551036ea..dd0d589dc 100644
--- a/core/src/main/resources/configs/reports/profCoreReport.yaml
+++ b/core/src/main/resources/configs/reports/profCoreReport.yaml
@@ -32,6 +32,7 @@ reportDefinitions:
     scope: global
     nestedReports:
       - reportId: coreRawMetrics
+      - reportId: connectReport
     tableDefinitions:
       # AppStatusResult
       - label: coreCSVStatus
diff --git a/core/src/main/resources/configs/reports/qualCoreReport.yaml b/core/src/main/resources/configs/reports/qualCoreReport.yaml
index e9c383b6d..b82d39170 100644
--- a/core/src/main/resources/configs/reports/qualCoreReport.yaml
+++ b/core/src/main/resources/configs/reports/qualCoreReport.yaml
@@ -64,6 +64,8 @@ reportDefinitions:
         relativePath: qual_metrics
       - reportId: coreRawMetrics
         relativePath: raw_metrics
+      - reportId: connectReport
+        relativePath: raw_metrics
       - reportId: qualTuningApps
         relativePath: tuning_apps
     tableDefinitions:
diff --git a/user_tools/src/spark_rapids_tools/api_v1/report_loader.py b/user_tools/src/spark_rapids_tools/api_v1/report_loader.py
index ed0176ffd..aba2a7b01 100644
--- a/user_tools/src/spark_rapids_tools/api_v1/report_loader.py
+++ b/user_tools/src/spark_rapids_tools/api_v1/report_loader.py
@@ -134,7 +134,8 @@ def core_report_definitions(self) -> List[str]:
         return [
             f'{self.core_report_dir}/qualCoreReport.yaml',
             f'{self.core_report_dir}/profCoreReport.yaml',
-            f'{self.core_report_dir}/coreRawMetricsReport.yaml'
+            f'{self.core_report_dir}/coreRawMetricsReport.yaml',
+            f'{self.core_report_dir}/connectReport.yaml'
         ]
 
     @property
diff --git a/user_tools/src/spark_rapids_tools/enums.py b/user_tools/src/spark_rapids_tools/enums.py
index 5e0ddf05c..16e059cac 100644
--- a/user_tools/src/spark_rapids_tools/enums.py
+++ b/user_tools/src/spark_rapids_tools/enums.py
@@ -257,6 +257,7 @@ class ReportTableFormat(EnumeratedType):
     """Values used to define the format of the report tables"""
     CSV = 'csv'
     CONF = 'conf'
+    DIRECTORY = 'directory'
     JSON = 'json'
     LOG = 'log'
     PROPERTIES = 'properties'
@@ -286,6 +287,7 @@ def compatible(self, candidate: Union[str, 'ReportTableFormat']) -> bool:
             self.CSV: [self.TXT],
             self.JSON: [self.TXT, self.CSV],  # It is possible to convert JSON to CSV (pandas normalizes JSON)
             self.CONF: [self.PROPERTIES],
+            self.DIRECTORY: [],
             self.PROPERTIES: [self.TXT],
             self.TXT: []
         }
diff --git a/user_tools/tests/spark_rapids_tools_ut/api/test_connect_report_loader.py b/user_tools/tests/spark_rapids_tools_ut/api/test_connect_report_loader.py
new file mode 100644
index 000000000..78ae64281
--- /dev/null
+++ b/user_tools/tests/spark_rapids_tools_ut/api/test_connect_report_loader.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Smoke tests for Connect report discovery through YAML catalogs.""" + +import os +import shutil +import tempfile +import unittest + +from spark_rapids_tools.api_v1 import ProfCore, QualCore + + +class TestConnectReportLoader(unittest.TestCase): + """Verifies connectReport is discoverable from prof/qual result handlers.""" + + sample_app_id = 'application_1234567890_0001' + + def setUp(self): + self.temp_dir = tempfile.mkdtemp() + self.prof_output = os.path.join(self.temp_dir, 'rapids_4_spark_profile') + self.qual_output = os.path.join(self.temp_dir, 'qual_core_output') + os.makedirs(self.prof_output, exist_ok=True) + os.makedirs(self.qual_output, exist_ok=True) + self._write_prof_status() + self._write_qual_status() + + def tearDown(self): + shutil.rmtree(self.temp_dir, ignore_errors=True) + + def _write_prof_status(self): + status_csv = os.path.join(self.prof_output, 'profiling_status.csv') + with open(status_csv, 'w', encoding='utf-8') as fh: + fh.write('Event Log,Status,App ID,Attempt ID,App Name,Description\n') + fh.write(f'/path/to/eventlog,SUCCESS,{self.sample_app_id},0,ProfTest,ok\n') + + def _write_qual_status(self): + status_csv = os.path.join(self.qual_output, 'status.csv') + with open(status_csv, 'w', encoding='utf-8') as fh: + fh.write('Event Log,Status,App ID,Attempt ID,App Name,Description\n') + fh.write(f'/path/to/eventlog,SUCCESS,{self.sample_app_id},0,QualTest,ok\n') + + def test_prof_core_registers_connect_tables(self): + handler = ProfCore(self.prof_output).handler + + for label in ('connectSessions', 'connectOperations', 'connectStatements'): + self.assertIn(label, handler.tbl_reader_map) + self.assertTrue(handler.is_per_app_tbl(label)) + + reader = handler.get_reader_by_tbl('connectStatements') + self.assertIsNotNone(reader) + self.assertEqual(reader.report_id, 'connectReport') + + def test_qual_core_registers_connect_tables_under_raw_metrics(self): + handler = QualCore(self.qual_output).handler + + for label in ('connectSessions', 'connectOperations', 'connectStatements'): + self.assertIn(label, handler.tbl_reader_map) + self.assertTrue(handler.is_per_app_tbl(label)) + + reader = handler.get_reader_by_tbl('connectStatements') + self.assertIsNotNone(reader) + self.assertEqual(reader.report_id, 'connectReport') + self.assertTrue(str(reader.out_path).endswith('/raw_metrics')) From 4e96bdc30056c33c25676c451d56af61be20a9a2 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Tue, 21 Apr 2026 17:25:19 -0700 Subject: [PATCH 09/19] feat(connect): Python API helpers for listing and reading statement files Signed-off-by: Sayed Bilal Bari --- .../src/spark_rapids_tools/api_v1/builder.py | 9 +++ .../api_v1/result_handler.py | 59 ++++++++++++++++ .../api/test_connect_helpers.py | 68 +++++++++++++++++++ 3 files changed, 136 insertions(+) create mode 100644 user_tools/tests/spark_rapids_tools_ut/api/test_connect_helpers.py diff --git a/user_tools/src/spark_rapids_tools/api_v1/builder.py 
b/user_tools/src/spark_rapids_tools/api_v1/builder.py index 0e706d313..0c560d7aa 100644 --- a/user_tools/src/spark_rapids_tools/api_v1/builder.py +++ b/user_tools/src/spark_rapids_tools/api_v1/builder.py @@ -1172,6 +1172,15 @@ def is_empty(self) -> bool: def get_raw_metrics_path(self) -> Optional[BoundedCspPath]: return self._res_h.get_raw_metrics_path() + def get_connect_statements_dir(self, app_id: str) -> Optional[BoundedCspPath]: + return self._res_h.get_connect_statements_dir(app_id) + + def list_connect_statement_ops(self, app_id: str) -> List[str]: + return self._res_h.list_connect_statement_ops(app_id) + + def load_connect_statement(self, app_id: str, operation_id: str) -> Optional[str]: + return self._res_h.load_connect_statement(app_id, operation_id) + @dataclass class ProfWrapper(APIResHandler[ProfWrapperResultHandler]): diff --git a/user_tools/src/spark_rapids_tools/api_v1/result_handler.py b/user_tools/src/spark_rapids_tools/api_v1/result_handler.py index d8c3f3574..9574d6894 100644 --- a/user_tools/src/spark_rapids_tools/api_v1/result_handler.py +++ b/user_tools/src/spark_rapids_tools/api_v1/result_handler.py @@ -28,6 +28,7 @@ from spark_rapids_tools.api_v1 import AppHandler from spark_rapids_tools.api_v1.report_reader import ToolReportReader from spark_rapids_tools.storagelib.cspfs import BoundedCspPath, CspFs +from spark_rapids_tools.utils.data_utils import DataUtils class ResultHandlerBaseMeta: # pylint: disable=too-few-public-methods @@ -225,6 +226,64 @@ def is_empty(self) -> bool: def get_raw_metrics_path(self) -> Optional[BoundedCspPath]: return self.get_reader_path('coreRawMetrics') + def _get_per_app_table_path(self, table_label: str, app_id: str) -> Optional[BoundedCspPath]: + """ + Resolve the per-application path for a table definition. + :param table_label: Label of the table definition. + :param app_id: Application ID under the per-app report root. + :return: The resolved path or None when the table/app is not available. + """ + reader = self.get_reader_by_tbl(table_label) + if reader is None or not reader.is_per_app(): + return None + if app_id not in self.app_handlers: + return None + table_def = reader.get_table(table_label) + if table_def is None: + return None + return reader.out_path.create_sub_path(f'{app_id}/{table_def.file_name}') + + def get_connect_statements_dir(self, app_id: str) -> Optional[BoundedCspPath]: + """ + Return the connect_statements directory for a given application, if present. + """ + stmt_dir = self._get_per_app_table_path('connectStatements', app_id) + if stmt_dir is None or not stmt_dir.exists(): + return None + return stmt_dir + + def list_connect_statement_ops(self, app_id: str) -> List[str]: + """ + Return sorted operation IDs for all statement sidecars under connect_statements/. + """ + stmt_dir = self.get_connect_statements_dir(app_id) + if stmt_dir is None: + return [] + op_files = CspFs.glob_path( + path=stmt_dir, + pattern=re.compile(r'.*\.txt$'), + item_type=FileType.File, + recursive=False + ) + return sorted([p.base_name().rsplit('.txt', 1)[0] for p in op_files]) + + def load_connect_statement(self, app_id: str, operation_id: str) -> Optional[str]: + """ + Load the statementText sidecar for a single Connect operation. 
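+
+        Returns None when the per-app connect_statements directory or the
+        requested <operation_id>.txt sidecar is missing. Illustrative call
+        (the IDs are placeholders, not values produced by this module):
+
+            handler.load_connect_statement('app-123', 'op-1')  # 'SELECT 1' or None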
+ """ + stmt_dir = self.get_connect_statements_dir(app_id) + if stmt_dir is None: + return None + sub_path = stmt_dir.create_sub_path(f'{operation_id}.txt') + if not sub_path.exists(): + return None + txt_res = DataUtils.load_txt(sub_path) + if not txt_res.success or txt_res.data is None: + return None + if isinstance(txt_res.data, bytes): + return txt_res.decode_txt() + return txt_res.data + ######################### # Type Definitions ######################### diff --git a/user_tools/tests/spark_rapids_tools_ut/api/test_connect_helpers.py b/user_tools/tests/spark_rapids_tools_ut/api/test_connect_helpers.py new file mode 100644 index 000000000..3e70ec6c0 --- /dev/null +++ b/user_tools/tests/spark_rapids_tools_ut/api/test_connect_helpers.py @@ -0,0 +1,68 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for Connect-specific ResultHandler helpers.""" + +import os +import shutil +import tempfile +import unittest + +from spark_rapids_tools.api_v1 import ProfCore + + +class TestConnectHelpers(unittest.TestCase): + """Verifies listing and reading Connect statement sidecars.""" + + sample_app_id = 'application_1234567890_0001' + + def setUp(self): + self.temp_dir = tempfile.mkdtemp() + self.prof_output = os.path.join(self.temp_dir, 'rapids_4_spark_profile') + self.app_dir = os.path.join(self.prof_output, self.sample_app_id) + self.statements_dir = os.path.join(self.app_dir, 'connect_statements') + os.makedirs(self.statements_dir, exist_ok=True) + + with open(os.path.join(self.prof_output, 'profiling_status.csv'), 'w', encoding='utf-8') as fh: + fh.write('Event Log,Status,App ID,Attempt ID,App Name,Description\n') + fh.write(f'/path/to/eventlog,SUCCESS,{self.sample_app_id},0,ProfTest,ok\n') + + with open(os.path.join(self.statements_dir, 'op-1.txt'), 'w', encoding='utf-8') as fh: + fh.write('SELECT 1') + with open(os.path.join(self.statements_dir, 'op-2.txt'), 'w', encoding='utf-8') as fh: + fh.write('SELECT 2') + + def tearDown(self): + shutil.rmtree(self.temp_dir, ignore_errors=True) + + def test_get_connect_statements_dir_returns_per_app_path(self): + handler = ProfCore(self.prof_output) + path = handler.get_connect_statements_dir(self.sample_app_id) + self.assertIsNotNone(path) + self.assertEqual(path.base_name(), 'connect_statements') + + def test_list_connect_statement_ops_returns_sorted_operation_ids(self): + handler = ProfCore(self.prof_output) + ops = handler.list_connect_statement_ops(self.sample_app_id) + self.assertEqual(ops, ['op-1', 'op-2']) + + def test_load_connect_statement_reads_file(self): + handler = ProfCore(self.prof_output) + text = handler.load_connect_statement(self.sample_app_id, 'op-2') + self.assertEqual(text, 'SELECT 2') + + def test_load_connect_statement_missing_returns_none(self): + handler = ProfCore(self.prof_output) + self.assertIsNone(handler.load_connect_statement(self.sample_app_id, 'missing-op')) + self.assertIsNone(handler.load_connect_statement('missing-app', 'op-1')) From 
244252e150eff5f2513d9ed885780dbf56ee7ddb Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Tue, 21 Apr 2026 17:33:01 -0700 Subject: [PATCH 10/19] test(connect): add profiler-to-python roundtrip golden fixture Signed-off-by: Sayed Bilal Bari --- .../api/test_connect_e2e.py | 71 +++++++++++++++++++ .../local-connect-e2e/connect_operations.csv | 3 + .../local-connect-e2e/connect_sessions.csv | 2 + .../connect_statements/op-bbb-222.txt | 1 + .../profiling_status.csv | 2 + 5 files changed, 79 insertions(+) create mode 100644 user_tools/tests/spark_rapids_tools_ut/api/test_connect_e2e.py create mode 100644 user_tools/tests/spark_rapids_tools_ut/resources/connect_e2e/rapids_4_spark_profile/local-connect-e2e/connect_operations.csv create mode 100644 user_tools/tests/spark_rapids_tools_ut/resources/connect_e2e/rapids_4_spark_profile/local-connect-e2e/connect_sessions.csv create mode 100644 user_tools/tests/spark_rapids_tools_ut/resources/connect_e2e/rapids_4_spark_profile/local-connect-e2e/connect_statements/op-bbb-222.txt create mode 100644 user_tools/tests/spark_rapids_tools_ut/resources/connect_e2e/rapids_4_spark_profile/profiling_status.csv diff --git a/user_tools/tests/spark_rapids_tools_ut/api/test_connect_e2e.py b/user_tools/tests/spark_rapids_tools_ut/api/test_connect_e2e.py new file mode 100644 index 000000000..66c3936b6 --- /dev/null +++ b/user_tools/tests/spark_rapids_tools_ut/api/test_connect_e2e.py @@ -0,0 +1,71 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Golden roundtrip checks for Spark Connect profiler output.""" + +import shutil +import tempfile +import unittest +from pathlib import Path + +import pandas as pd + +from spark_rapids_tools.api_v1 import ProfCore + + +class TestConnectE2E(unittest.TestCase): + """Verifies a committed Connect profiler output tree is readable end to end.""" + + sample_app_id = 'local-connect-e2e' + expected_statement = 'common { plan_id: 0 } range { start: 0 end: 100 step: 1 }\n' + + def setUp(self): + self.temp_dir = tempfile.mkdtemp() + fixture_root = Path(__file__).resolve().parents[1] / 'resources' / 'connect_e2e' + self.prof_output = Path(self.temp_dir) / 'rapids_4_spark_profile' + shutil.copytree(fixture_root / 'rapids_4_spark_profile', self.prof_output) + + def tearDown(self): + shutil.rmtree(self.temp_dir, ignore_errors=True) + + def test_connect_operations_roundtrip_via_csv_and_api(self): + handler = ProfCore(str(self.prof_output)) + api_res = handler.csv('connectOperations').app(self.sample_app_id).load() + self.assertTrue(api_res.success) + + csv_path = self.prof_output / self.sample_app_id / 'connect_operations.csv' + raw_df = pd.read_csv(csv_path) + api_df = api_res.data + + self.assertEqual(list(raw_df['operationId']), ['op-bbb-222', 'op-ccc-333']) + self.assertEqual(list(api_df['operationId'].astype(str)), list(raw_df['operationId'])) + self.assertEqual(api_df.loc[api_df['operationId'] == 'op-bbb-222', 'sqlIds'].iat[0], '42') + self.assertEqual(api_df.loc[api_df['operationId'] == 'op-bbb-222', 'jobIds'].iat[0], '7') + self.assertEqual(api_df.loc[api_df['operationId'] == 'op-ccc-333', 'status'].iat[0], 'FAILED') + + sessions_res = handler.csv('connectSessions').app(self.sample_app_id).load() + self.assertTrue(sessions_res.success) + self.assertEqual(int(sessions_res.data['operationCount'].iat[0]), 2) + + def test_connect_statement_sidecar_roundtrip(self): + handler = ProfCore(str(self.prof_output)) + + stmt_dir = handler.get_connect_statements_dir(self.sample_app_id) + self.assertIsNotNone(stmt_dir) + self.assertEqual(stmt_dir.base_name(), 'connect_statements') + self.assertEqual(handler.list_connect_statement_ops(self.sample_app_id), ['op-bbb-222']) + self.assertEqual( + handler.load_connect_statement(self.sample_app_id, 'op-bbb-222'), + self.expected_statement) + self.assertIsNone(handler.load_connect_statement(self.sample_app_id, 'op-ccc-333')) diff --git a/user_tools/tests/spark_rapids_tools_ut/resources/connect_e2e/rapids_4_spark_profile/local-connect-e2e/connect_operations.csv b/user_tools/tests/spark_rapids_tools_ut/resources/connect_e2e/rapids_4_spark_profile/local-connect-e2e/connect_operations.csv new file mode 100644 index 000000000..34dc424fb --- /dev/null +++ b/user_tools/tests/spark_rapids_tools_ut/resources/connect_e2e/rapids_4_spark_profile/local-connect-e2e/connect_operations.csv @@ -0,0 +1,3 @@ +appID,operationId,sessionId,userId,jobTag,startTime,analyzeTime,readyForExecTime,finishTime,closeTime,failTime,cancelTime,durationMs,analyzePhaseMs,planPhaseMs,execPhaseMs,resultDeliveryPhaseMs,status,producedRowCount,errorMessage,sqlIds,jobIds,statementFile,statementBytes,statementTruncated +local-connect-e2e,op-bbb-222,sess-aaa-111,userA,SparkConnect_OperationTag_User_userA_Session_sess-aaa-111_Operation_op-bbb-222,120000,121000,121500,125000,125500,,,5500,1000,500,3500,500,SUCCEEDED,10,,42,7,op-bbb-222.txt,57,false 
+local-connect-e2e,op-ccc-333,sess-aaa-111,userA,SparkConnect_OperationTag_User_userA_Session_sess-aaa-111_Operation_op-ccc-333,130000,,,,,131000,,1000,-1,-1,-1,-1,FAILED,,boom,,,,0,false diff --git a/user_tools/tests/spark_rapids_tools_ut/resources/connect_e2e/rapids_4_spark_profile/local-connect-e2e/connect_sessions.csv b/user_tools/tests/spark_rapids_tools_ut/resources/connect_e2e/rapids_4_spark_profile/local-connect-e2e/connect_sessions.csv new file mode 100644 index 000000000..b274c772c --- /dev/null +++ b/user_tools/tests/spark_rapids_tools_ut/resources/connect_e2e/rapids_4_spark_profile/local-connect-e2e/connect_sessions.csv @@ -0,0 +1,2 @@ +appID,sessionId,userId,startTime,endTime,durationMs,operationCount +local-connect-e2e,sess-aaa-111,userA,110000,190000,80000,2 diff --git a/user_tools/tests/spark_rapids_tools_ut/resources/connect_e2e/rapids_4_spark_profile/local-connect-e2e/connect_statements/op-bbb-222.txt b/user_tools/tests/spark_rapids_tools_ut/resources/connect_e2e/rapids_4_spark_profile/local-connect-e2e/connect_statements/op-bbb-222.txt new file mode 100644 index 000000000..b1b897177 --- /dev/null +++ b/user_tools/tests/spark_rapids_tools_ut/resources/connect_e2e/rapids_4_spark_profile/local-connect-e2e/connect_statements/op-bbb-222.txt @@ -0,0 +1 @@ +common { plan_id: 0 } range { start: 0 end: 100 step: 1 } diff --git a/user_tools/tests/spark_rapids_tools_ut/resources/connect_e2e/rapids_4_spark_profile/profiling_status.csv b/user_tools/tests/spark_rapids_tools_ut/resources/connect_e2e/rapids_4_spark_profile/profiling_status.csv new file mode 100644 index 000000000..8513b4bf2 --- /dev/null +++ b/user_tools/tests/spark_rapids_tools_ut/resources/connect_e2e/rapids_4_spark_profile/profiling_status.csv @@ -0,0 +1,2 @@ +Event Log,Status,App ID,Attempt ID,App Name,Description +/path/to/connect_with_sql_job,SUCCESS,local-connect-e2e,0,ConnectE2E,Processing time: 1ms From afea0dde23201191524337dd569097edd9a6d2f4 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Tue, 21 Apr 2026 17:40:13 -0700 Subject: [PATCH 11/19] test(connect): assert golden connect CSV header order Signed-off-by: Sayed Bilal Bari --- .../tests/spark_rapids_tools_ut/api/test_connect_e2e.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/user_tools/tests/spark_rapids_tools_ut/api/test_connect_e2e.py b/user_tools/tests/spark_rapids_tools_ut/api/test_connect_e2e.py index 66c3936b6..117ac5c9e 100644 --- a/user_tools/tests/spark_rapids_tools_ut/api/test_connect_e2e.py +++ b/user_tools/tests/spark_rapids_tools_ut/api/test_connect_e2e.py @@ -29,6 +29,13 @@ class TestConnectE2E(unittest.TestCase): sample_app_id = 'local-connect-e2e' expected_statement = 'common { plan_id: 0 } range { start: 0 end: 100 step: 1 }\n' + expected_operation_columns = [ + 'appID', 'operationId', 'sessionId', 'userId', 'jobTag', 'startTime', + 'analyzeTime', 'readyForExecTime', 'finishTime', 'closeTime', 'failTime', + 'cancelTime', 'durationMs', 'analyzePhaseMs', 'planPhaseMs', 'execPhaseMs', + 'resultDeliveryPhaseMs', 'status', 'producedRowCount', 'errorMessage', + 'sqlIds', 'jobIds', 'statementFile', 'statementBytes', 'statementTruncated' + ] def setUp(self): self.temp_dir = tempfile.mkdtemp() @@ -48,6 +55,8 @@ def test_connect_operations_roundtrip_via_csv_and_api(self): raw_df = pd.read_csv(csv_path) api_df = api_res.data + self.assertEqual(list(raw_df.columns), self.expected_operation_columns) + self.assertEqual(list(api_df.columns), self.expected_operation_columns) self.assertEqual(list(raw_df['operationId']), 
['op-bbb-222', 'op-ccc-333']) self.assertEqual(list(api_df['operationId'].astype(str)), list(raw_df['operationId'])) self.assertEqual(api_df.loc[api_df['operationId'] == 'op-bbb-222', 'sqlIds'].iat[0], '42') From 225ba74902a6d067c7b8633dde3ef6b34349beda Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Tue, 21 Apr 2026 17:49:12 -0700 Subject: [PATCH 12/19] chore(connect): fix Scala style issues from verification Signed-off-by: Sayed Bilal Bari --- .../spark/rapids/tool/profiling/ConnectStatementWriter.scala | 2 +- .../spark/rapids/tool/views/QualRawReportGenerator.scala | 2 +- .../rapids/tool/profiling/ConnectProfileResultsSuite.scala | 1 - .../rapids/tool/profiling/ConnectProfilerOutputSuite.scala | 5 ++--- .../tool/qualification/QualificationConnectOutputSuite.scala | 4 ++-- 5 files changed, 6 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriter.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriter.scala index a201fb282..bff5fab9e 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriter.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriter.scala @@ -75,4 +75,4 @@ object ConnectStatementWriter extends Logging { } builder.result() } -} \ No newline at end of file +} diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala index df359f19b..52f06295a 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala @@ -17,7 +17,7 @@ package com.nvidia.spark.rapids.tool.views import com.nvidia.spark.rapids.tool.analysis.{AggRawMetricsResult, AppSQLPlanAnalyzer, QualSparkMetricsAggregator} -import com.nvidia.spark.rapids.tool.profiling.{DataSourceProfileResult, ProfileOutputWriter, ProfileResult, Profiler, SQLAccumProfileResults} +import com.nvidia.spark.rapids.tool.profiling.{DataSourceProfileResult, ProfileOutputWriter, Profiler, ProfileResult, SQLAccumProfileResults} import org.apache.spark.internal.Logging import org.apache.spark.sql.rapids.tool.qualification.QualificationAppInfo diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResultsSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResultsSuite.scala index 15b0ec105..63bab0ef2 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResultsSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResultsSuite.scala @@ -17,7 +17,6 @@ package com.nvidia.spark.rapids.tool.profiling import com.nvidia.spark.rapids.tool.views.OutHeaderRegistry - import org.scalatest.funsuite.AnyFunSuite class ConnectProfileResultsSuite extends AnyFunSuite { diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfilerOutputSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfilerOutputSuite.scala index f268786a2..90b7e6723 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfilerOutputSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfilerOutputSuite.scala @@ -21,7 +21,6 @@ import java.nio.file.{Files, Path, Paths} import java.util.Comparator import scala.collection.mutable 
-import scala.io.Source import com.nvidia.spark.rapids.BaseNoSparkSuite import com.nvidia.spark.rapids.tool.EventLogPathProcessor @@ -29,7 +28,7 @@ import com.nvidia.spark.rapids.tool.views.OutHeaderRegistry import org.apache.spark.sql.TrampolineUtil import org.apache.spark.sql.rapids.tool.profiling.ApplicationInfo -import org.apache.spark.sql.rapids.tool.util.RapidsToolsConfUtil +import org.apache.spark.sql.rapids.tool.util.{RapidsToolsConfUtil, UTF8Source} /** * Tests Task 5 of Spark Connect Phase 3 (#2065): wiring @@ -78,7 +77,7 @@ class ConnectProfilerOutputSuite extends BaseNoSparkSuite { } private def readAllLines(path: Path): Seq[String] = { - val src = Source.fromFile(path.toFile, StandardCharsets.UTF_8.name()) + val src = UTF8Source.fromFile(path.toFile) try { src.getLines().toList } finally { diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationConnectOutputSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationConnectOutputSuite.scala index 732c0f0f9..bad6e7937 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationConnectOutputSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationConnectOutputSuite.scala @@ -21,7 +21,6 @@ import java.nio.file.{Files, Path, Paths} import java.util.Comparator import scala.collection.mutable -import scala.io.Source import com.nvidia.spark.rapids.BaseNoSparkSuite import com.nvidia.spark.rapids.tool.profiling.{ConnectOperationInfo, ConnectSessionInfo} @@ -29,6 +28,7 @@ import com.nvidia.spark.rapids.tool.views.QualRawReportGenerator import org.apache.spark.sql.TrampolineUtil import org.apache.spark.sql.rapids.tool.qualification.QualificationAppInfo +import org.apache.spark.sql.rapids.tool.util.UTF8Source /** * Verifies that the qualification raw-metrics writer emits the same Spark @@ -73,7 +73,7 @@ class QualificationConnectOutputSuite extends BaseNoSparkSuite { } private def readAllLines(path: Path): Seq[String] = { - val src = Source.fromFile(path.toFile, StandardCharsets.UTF_8.name()) + val src = UTF8Source.fromFile(path.toFile) try { src.getLines().toList } finally { From f0fc5b4fe78ac55c4de76c7e46ce86c18b5b9e10 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Tue, 21 Apr 2026 17:53:31 -0700 Subject: [PATCH 13/19] chore: refresh expired license headers Signed-off-by: Sayed Bilal Bari --- core/src/main/resources/configs/reports/profCoreReport.yaml | 2 +- .../spark/rapids/tool/profiling/ProfileOutputWriter.scala | 2 +- .../com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala | 2 +- .../nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala | 2 +- user_tools/src/spark_rapids_tools/api_v1/builder.py | 2 +- user_tools/src/spark_rapids_tools/api_v1/report_loader.py | 2 +- user_tools/src/spark_rapids_tools/api_v1/result_handler.py | 2 +- user_tools/src/spark_rapids_tools/enums.py | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/core/src/main/resources/configs/reports/profCoreReport.yaml b/core/src/main/resources/configs/reports/profCoreReport.yaml index dd0d589dc..9d9820b8b 100644 --- a/core/src/main/resources/configs/reports/profCoreReport.yaml +++ b/core/src/main/resources/configs/reports/profCoreReport.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. +# Copyright (c) 2025-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileOutputWriter.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileOutputWriter.scala index 4b2046b75..46c0fc54c 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileOutputWriter.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileOutputWriter.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2025, NVIDIA CORPORATION. + * Copyright (c) 2021-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala index 80675aa7c..b3d5519d9 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. + * Copyright (c) 2025-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala index 52f06295a..5729ad48e 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/user_tools/src/spark_rapids_tools/api_v1/builder.py b/user_tools/src/spark_rapids_tools/api_v1/builder.py index 0c560d7aa..3c5dcedcc 100644 --- a/user_tools/src/spark_rapids_tools/api_v1/builder.py +++ b/user_tools/src/spark_rapids_tools/api_v1/builder.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. +# Copyright (c) 2025-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/user_tools/src/spark_rapids_tools/api_v1/report_loader.py b/user_tools/src/spark_rapids_tools/api_v1/report_loader.py index aba2a7b01..eb7b82c6f 100644 --- a/user_tools/src/spark_rapids_tools/api_v1/report_loader.py +++ b/user_tools/src/spark_rapids_tools/api_v1/report_loader.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. +# Copyright (c) 2025-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/user_tools/src/spark_rapids_tools/api_v1/result_handler.py b/user_tools/src/spark_rapids_tools/api_v1/result_handler.py index 9574d6894..1ec1c434e 100644 --- a/user_tools/src/spark_rapids_tools/api_v1/result_handler.py +++ b/user_tools/src/spark_rapids_tools/api_v1/result_handler.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. +# Copyright (c) 2025-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/user_tools/src/spark_rapids_tools/enums.py b/user_tools/src/spark_rapids_tools/enums.py index 16e059cac..e00cf96d0 100644 --- a/user_tools/src/spark_rapids_tools/enums.py +++ b/user_tools/src/spark_rapids_tools/enums.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2025, NVIDIA CORPORATION. +# Copyright (c) 2023-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From b2746b5f0594e3d53094409ed3b2a1217dffed9e Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Tue, 21 Apr 2026 18:01:45 -0700 Subject: [PATCH 14/19] fix(connect): sanitize sidecar paths and keep session-only logs Signed-off-by: Sayed Bilal Bari --- .../profiling/ConnectStatementWriter.scala | 14 ++++++-- .../spark/sql/rapids/tool/AppBase.scala | 2 +- .../ConnectProfilerOutputSuite.scala | 36 +++++++++++++++++++ .../ConnectStatementWriterSuite.scala | 17 +++++++++ .../QualificationConnectOutputSuite.scala | 31 ++++++++++++++++ 5 files changed, 96 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriter.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriter.scala index bff5fab9e..bbb9ee731 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriter.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriter.scala @@ -38,6 +38,11 @@ object ConnectStatementWriter extends Logging { val SUB_DIR: String = "connect_statements" val FILE_EXTENSION: String = ".txt" + private val UnsafePathChars = "[^A-Za-z0-9._-]".r + + private def sanitizeOperationId(operationId: String): String = { + UnsafePathChars.replaceAllIn(operationId, "_") + } /** * Writes each operation's `statementText` to @@ -51,7 +56,7 @@ object ConnectStatementWriter extends Logging { def writeStatementFiles( rootDir: String, ops: Iterable[ConnectOperationInfo]): Map[String, String] = { - val subDirPath = Paths.get(rootDir, SUB_DIR) + val subDirPath = Paths.get(rootDir, SUB_DIR).toAbsolutePath.normalize() var subDirCreated = false val builder = Map.newBuilder[String, String] ops.foreach { op => @@ -62,8 +67,11 @@ object ConnectStatementWriter extends Logging { Files.createDirectories(subDirPath) subDirCreated = true } - val basename = s"${op.operationId}$FILE_EXTENSION" - val target = subDirPath.resolve(basename) + val safeId = sanitizeOperationId(op.operationId) + val basename = s"$safeId$FILE_EXTENSION" + val target = subDirPath.resolve(basename).normalize() + require(target.startsWith(subDirPath), + s"Refusing to write Connect statement sidecar outside $subDirPath: $target") Files.write(target, text.getBytes(StandardCharsets.UTF_8)) builder += (op.operationId -> basename) } catch { diff --git a/core/src/main/scala/org/apache/spark/sql/rapids/tool/AppBase.scala b/core/src/main/scala/org/apache/spark/sql/rapids/tool/AppBase.scala index c74440dad..7634b2fac 100644 --- a/core/src/main/scala/org/apache/spark/sql/rapids/tool/AppBase.scala +++ b/core/src/main/scala/org/apache/spark/sql/rapids/tool/AppBase.scala @@ -147,7 +147,7 @@ abstract class AppBase( val operationIdToSqlIds: HashMap[String, HashSet[Long]] = HashMap.empty // operationId -> jobIDs discovered via SparkListenerJobStart.properties["spark.job.tags"]. 
val operationIdToJobIds: HashMap[String, HashSet[Int]] = HashMap.empty - def isConnectMode: Boolean = connectOperations.nonEmpty + def isConnectMode: Boolean = connectSessions.nonEmpty || connectOperations.nonEmpty def sqlPlans: immutable.Map[Long, SparkPlanInfo] = sqlManager.getPlanInfos diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfilerOutputSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfilerOutputSuite.scala index 90b7e6723..9eb983164 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfilerOutputSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfilerOutputSuite.scala @@ -237,4 +237,40 @@ class ConnectProfilerOutputSuite extends BaseNoSparkSuite { } } } + + test("writeConnectTables emits connect_sessions.csv for session-only Connect logs") { + withEventLog(logStartEvent, appStartEvent, envUpdateEvent, appEndEvent) { app => + app.connectSessions.put("sess-1", new ConnectSessionInfo( + sessionId = "sess-1", + userId = "alice", + startTime = 100L, + endTime = Some(500L))) + + assert(app.isConnectMode, "Session-only app should report Connect mode") + + val tmpDir = Files.createTempDirectory("prof-connect-out-").toFile + try { + val writer = new ProfileOutputWriter(tmpDir.getAbsolutePath, "profile", + numOutputRows = 1000, outputCSV = true) + try { + Profiler.writeConnectTables(writer, app) + } finally { + writer.close() + } + + val sessionsCsv = Paths.get(tmpDir.getAbsolutePath, "connect_sessions.csv") + val operationsCsv = Paths.get(tmpDir.getAbsolutePath, "connect_operations.csv") + assert(Files.exists(sessionsCsv), s"expected $sessionsCsv to exist") + assert(!Files.exists(operationsCsv), + s"expected no $operationsCsv for session-only Connect app") + + val sessionLines = readAllLines(sessionsCsv) + assert(sessionLines.size == 2, s"unexpected session rows: $sessionLines") + assert(sessionLines(1).contains("sess-1"), + s"expected sess-1 row in session output: ${sessionLines(1)}") + } finally { + deleteRecursively(tmpDir.toPath) + } + } + } } diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriterSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriterSuite.scala index e472e89fc..cd703fa03 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriterSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriterSuite.scala @@ -121,4 +121,21 @@ class ConnectStatementWriterSuite extends AnyFunSuite { deleteRecursively(tmpDir) } } + + test("sanitizes operationId before resolving sidecar path") { + val tmpDir = Files.createTempDirectory("connect-stmt-writer-") + try { + val op = makeOp("../../etc/foo", "range(0, 10)") + val result = ConnectStatementWriter.writeStatementFiles(tmpDir.toString, Seq(op)) + val expectedBasename = ".._.._etc_foo.txt" + assert(result == Map("../../etc/foo" -> expectedBasename), + s"expected sanitized basename map, got $result") + val expectedPath = tmpDir.resolve(ConnectStatementWriter.SUB_DIR).resolve(expectedBasename) + assert(Files.exists(expectedPath), s"expected sidecar at $expectedPath") + assert(expectedPath.normalize().startsWith(tmpDir.resolve(ConnectStatementWriter.SUB_DIR)), + s"sidecar should remain under ${ConnectStatementWriter.SUB_DIR}: $expectedPath") + } finally { + deleteRecursively(tmpDir) + } + } } diff --git 
a/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationConnectOutputSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationConnectOutputSuite.scala index bad6e7937..68833a7fe 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationConnectOutputSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationConnectOutputSuite.scala @@ -152,4 +152,35 @@ class QualificationConnectOutputSuite extends BaseNoSparkSuite { } } } + + test("qualification raw metrics emit connect_sessions.csv for session-only Connect logs") { + withQualificationApp(logStartEvent, appStartEvent, envUpdateEvent, appEndEvent) { app => + app.connectSessions.put("sess-1", new ConnectSessionInfo( + sessionId = "sess-1", + userId = "alice", + startTime = 100L, + endTime = Some(500L))) + + assert(app.isConnectMode, "Session-only qualification app should report Connect mode") + + val tmpDir = Files.createTempDirectory("qual-connect-out-") + try { + QualRawReportGenerator.generateRawMetricQualViewAndGetDataSourceInfo(tmpDir.toString, app) + + val appDir = tmpDir.resolve("raw_metrics").resolve(app.appId) + val sessionsCsv = appDir.resolve("connect_sessions.csv") + val operationsCsv = appDir.resolve("connect_operations.csv") + assert(Files.exists(sessionsCsv), s"expected $sessionsCsv to exist") + assert(!Files.exists(operationsCsv), + s"expected no $operationsCsv for session-only Connect app") + + val sessionLines = readAllLines(sessionsCsv) + assert(sessionLines.size == 2, s"unexpected session rows: $sessionLines") + assert(sessionLines(1).contains("sess-1"), + s"expected sess-1 row in session output: ${sessionLines(1)}") + } finally { + deleteRecursively(tmpDir) + } + } + } } From 1ad999a2e87fbec7e340ffefd48b3b7f4b18bad8 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 14:59:56 -0700 Subject: [PATCH 15/19] fix(connect): tighten sidecar handling and trim schema Signed-off-by: Sayed Bilal Bari --- .../configs/reports/connectReport.yaml | 31 +----- .../profiling/ConnectProfileResults.scala | 54 +---------- .../rapids/tool/profiling/ProfileArgs.scala | 8 +- .../rapids/tool/profiling/Profiler.scala | 20 ++-- .../tool/qualification/Qualification.scala | 5 +- .../qualification/QualificationArgs.scala | 9 +- .../qualification/QualificationMain.scala | 5 +- .../rapids/tool/views/OutHeaderRegistry.scala | 8 +- .../tool/views/QualRawReportGenerator.scala | 5 +- .../ConnectProfileResultsSuite.scala | 14 +-- .../ConnectProfilerOutputSuite.scala | 95 +++++++++++++++---- .../QualificationConnectOutputSuite.scala | 35 ++++++- .../api_v1/result_handler.py | 11 ++- .../api/test_connect_e2e.py | 6 +- .../api/test_connect_helpers.py | 6 ++ .../local-connect-e2e/connect_operations.csv | 6 +- 16 files changed, 179 insertions(+), 139 deletions(-) diff --git a/core/src/main/resources/configs/reports/connectReport.yaml b/core/src/main/resources/configs/reports/connectReport.yaml index 7a2e12c50..7205d6791 100644 --- a/core/src/main/resources/configs/reports/connectReport.yaml +++ b/core/src/main/resources/configs/reports/connectReport.yaml @@ -51,9 +51,8 @@ reportDefinitions: description: Number of operations observed in the session. - label: connectOperations description: >- - One row per Spark Connect operation with lifecycle timestamps, - derived phase durations, linked SQL/job identifiers, and statement - sidecar metadata. 
+      One row per Spark Connect operation with core lifecycle timestamps,
+      linked SQL/job identifiers, and statement sidecar metadata.
     fileName: connect_operations.csv
     scope: per-app
     columns:
@@ -75,12 +74,6 @@ reportDefinitions:
       - name: startTime
         dataType: Long
         description: Epoch millis when the operation started.
-      - name: analyzeTime
-        dataType: Long
-        description: Epoch millis when analysis completed, if observed.
-      - name: readyForExecTime
-        dataType: Long
-        description: Epoch millis when planning completed, if observed.
       - name: finishTime
         dataType: Long
         description: Epoch millis when execution completed, if observed.
@@ -96,24 +89,9 @@ reportDefinitions:
       - name: durationMs
         dataType: Long
         description: Derived end-to-end operation duration in milliseconds.
-      - name: analyzePhaseMs
-        dataType: Long
-        description: Time between start and analyze events, or -1.
-      - name: planPhaseMs
-        dataType: Long
-        description: Time between analyze and ready-for-execution events, or -1.
-      - name: execPhaseMs
-        dataType: Long
-        description: Time between ready-for-execution and finish events, or -1.
-      - name: resultDeliveryPhaseMs
-        dataType: Long
-        description: Time between finish and close events, or -1.
       - name: status
         dataType: String
         description: Derived operation status (RUNNING, SUCCEEDED, FAILED, CANCELED).
-      - name: producedRowCount
-        dataType: Long
-        description: Row count reported by the finish event, if present.
       - name: errorMessage
         dataType: String
         description: Error message reported by a failure event, if present.
@@ -126,9 +104,6 @@ reportDefinitions:
       - name: statementFile
         dataType: String
         description: Sidecar filename under connect_statements/ for this operation, if written.
-      - name: statementBytes
-        dataType: Long
-        description: UTF-8 byte length of statementText.
       - name: statementTruncated
         dataType: Boolean
         description: True when statementText includes Spark's truncation marker.
@@ -136,7 +111,7 @@ reportDefinitions:
     description: >-
       Directory of per-operation statementText sidecars. Each
       .txt file contains the protobuf debug-format text of the
-      Connect operation statement.
+      Connect operation statement when Connect statement sidecars are enabled.
     fileName: connect_statements
     fileFormat: DIRECTORY
     scope: per-app
diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResults.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResults.scala
index 60ecec2d6..ec9163dba 100644
--- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResults.scala
+++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResults.scala
@@ -65,11 +65,10 @@ case class ConnectSessionProfileResult(
 }
 
 /**
- * CSV row for a single Spark Connect operation. Captures the full lifecycle
- * (start/analyze/readyForExec/finish/close/fail/cancel timestamps), the
- * derived phase durations, status, producedRowCount, error message, and the
- * joined sqlIDs/jobIDs. Also captures statement-file provenance for the
- * separate `statements/<operationId>.txt` artifact.
+ * CSV row for a single Spark Connect operation. Captures the core lifecycle
+ * (start/finish/close/fail/cancel timestamps), derived status, error message,
+ * and the joined sqlIDs/jobIDs. Also captures statement-file provenance for
+ * the separate `statements/<operationId>.txt` artifact.
  *
  * sqlIds and jobIds are serialized semicolon-separated (to keep the CSV
  * single-column and avoid quoting issues).
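
To make the semicolon convention concrete, a minimal round-trip sketch (the codec object is hypothetical; only the `;` separator and the empty-cell convention come from the docs above):

    // Hypothetical helper; not part of the patch.
    object IdListCodec {
      def encode(ids: Seq[Long]): String = ids.mkString(";")

      def decode(cell: String): Seq[Long] =
        if (cell.isEmpty) Seq.empty else cell.split(";").toSeq.map(_.toLong)

      def main(args: Array[String]): Unit = {
        assert(encode(Seq(42L, 7L)) == "42;7")
        assert(decode("42;7") == Seq(42L, 7L))
        assert(decode("").isEmpty) // operations with no linked IDs render as ""
      }
    }
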
@@ -81,24 +80,16 @@ case class ConnectOperationProfileResult( userId: String, jobTag: String, startTime: Long, - analyzeTime: Option[Long], - readyForExecTime: Option[Long], finishTime: Option[Long], closeTime: Option[Long], failTime: Option[Long], cancelTime: Option[Long], durationMs: Long, - analyzePhaseMs: Long, - planPhaseMs: Long, - execPhaseMs: Long, - resultDeliveryPhaseMs: Long, status: String, - producedRowCount: Option[Long], errorMessage: Option[String], sqlIds: Seq[Long], jobIds: Seq[Int], statementFile: Option[String], - statementBytes: Long, statementTruncated: Boolean) extends ProfileResult { override def outputHeaders: Array[String] = { @@ -113,24 +104,16 @@ case class ConnectOperationProfileResult( userId, jobTag, startTime.toString, - analyzeTime.map(_.toString).orNull, - readyForExecTime.map(_.toString).orNull, finishTime.map(_.toString).orNull, closeTime.map(_.toString).orNull, failTime.map(_.toString).orNull, cancelTime.map(_.toString).orNull, durationMs.toString, - analyzePhaseMs.toString, - planPhaseMs.toString, - execPhaseMs.toString, - resultDeliveryPhaseMs.toString, status, - producedRowCount.map(_.toString).orNull, errorMessage.getOrElse(""), sqlIds.mkString(";"), jobIds.mkString(";"), statementFile.getOrElse(""), - statementBytes.toString, statementTruncated.toString) } @@ -142,24 +125,16 @@ case class ConnectOperationProfileResult( StringUtils.reformatCSVString(userId), StringUtils.reformatCSVString(jobTag), startTime.toString, - analyzeTime.map(_.toString).orNull, - readyForExecTime.map(_.toString).orNull, finishTime.map(_.toString).orNull, closeTime.map(_.toString).orNull, failTime.map(_.toString).orNull, cancelTime.map(_.toString).orNull, durationMs.toString, - analyzePhaseMs.toString, - planPhaseMs.toString, - execPhaseMs.toString, - resultDeliveryPhaseMs.toString, StringUtils.reformatCSVString(status), - producedRowCount.map(_.toString).orNull, StringUtils.reformatCSVString(errorMessage.getOrElse("")), StringUtils.reformatCSVString(sqlIds.mkString(";")), StringUtils.reformatCSVString(jobIds.mkString(";")), StringUtils.reformatCSVString(statementFile.getOrElse("")), - statementBytes.toString, statementTruncated.toString) } } @@ -174,18 +149,6 @@ object ConnectOperationProfileResult { */ private[profiling] val TruncationMarker: String = "[truncated(size=" - /** - * Returns `b - a` when both are defined, otherwise `-1`. Used to derive - * phase durations where an absent timestamp means the operation never - * reached that phase. - */ - private def diff(a: Option[Long], b: Option[Long]): Long = { - (a, b) match { - case (Some(av), Some(bv)) => bv - av - case _ => -1L - } - } - /** * Derives operation status from the observed lifecycle timestamps. * Priority: CANCELED -> FAILED -> SUCCEEDED -> RUNNING. 
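
The priority rule reads as a simple cascade; a standalone model for reference (the OpTimes case class is illustrative, the real code works on ConnectOperationInfo):

    // Illustrative model of the lifecycle timestamps deriveStatus inspects.
    final case class OpTimes(
        cancelTime: Option[Long] = None,
        failTime: Option[Long] = None,
        finishTime: Option[Long] = None)

    object StatusSketch {
      def deriveStatus(op: OpTimes): String =
        if (op.cancelTime.isDefined) "CANCELED"
        else if (op.failTime.isDefined) "FAILED"
        else if (op.finishTime.isDefined) "SUCCEEDED"
        else "RUNNING"

      def main(args: Array[String]): Unit = {
        // CANCELED outranks FAILED even when both timestamps were observed.
        assert(deriveStatus(OpTimes(cancelTime = Some(2L), failTime = Some(1L))) == "CANCELED")
        assert(deriveStatus(OpTimes(finishTime = Some(5L))) == "SUCCEEDED")
        assert(deriveStatus(OpTimes()) == "RUNNING")
      }
    }
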
@@ -208,7 +171,6 @@ object ConnectOperationProfileResult {
     val endForDuration =
       op.closeTime.orElse(op.finishTime).orElse(op.failTime).orElse(op.cancelTime)
     val durationMs = endForDuration.map(_ - op.startTime).getOrElse(-1L)
-    val statementBytes = op.statementText.getBytes("UTF-8").length.toLong
     val statementTruncated = op.statementText.contains(TruncationMarker)
     ConnectOperationProfileResult(
       appId = appId,
@@ -217,24 +179,16 @@ object ConnectOperationProfileResult {
       userId = op.userId,
       jobTag = op.jobTag,
       startTime = op.startTime,
-      analyzeTime = op.analyzeTime,
-      readyForExecTime = op.readyForExecTime,
       finishTime = op.finishTime,
       closeTime = op.closeTime,
       failTime = op.failTime,
       cancelTime = op.cancelTime,
       durationMs = durationMs,
-      analyzePhaseMs = diff(Some(op.startTime), op.analyzeTime),
-      planPhaseMs = diff(op.analyzeTime, op.readyForExecTime),
-      execPhaseMs = diff(op.readyForExecTime, op.finishTime),
-      resultDeliveryPhaseMs = diff(op.finishTime, op.closeTime),
       status = deriveStatus(op),
-      producedRowCount = op.producedRowCount,
       errorMessage = op.errorMessage,
       sqlIds = sqlIds,
       jobIds = jobIds,
       statementFile = statementFile,
-      statementBytes = statementBytes,
       statementTruncated = statementTruncated)
   }
 }
diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala
index 81a253ebf..c1f49fb55 100644
--- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala
+++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -91,6 +91,12 @@ Usage: java -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/*
   val csv: ScallopOption[Boolean] = opt[Boolean](required = false,
     descr = "Output each table to a CSV file as well creating the summary text file.")
+  val connectStatements: ScallopOption[Boolean] =
+    toggle("connect-statements",
+      default = Some(false),
+      prefix = "no-",
+      descrYes = "Write Spark Connect statementText sidecar files. Disabled by default.",
+      descrNo = "Do not write Spark Connect statementText sidecar files.")
   val timeout: ScallopOption[Long] = opt[Long](required = false,
     descr = "Maximum time in seconds to wait for the event logs to be processed. " +
diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala
index aa976e53f..a25ca7109 100644
--- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala
+++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala
@@ -55,6 +55,7 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea
     s"/${Profiler.SUBDIR}"
   private val numOutputRows = appArgs.numOutputRows.getOrElse(1000)
   private val outputCSV: Boolean = appArgs.csv()
+  private val writeConnectStatements: Boolean = appArgs.connectStatements()
   private val useAutoTuner: Boolean = appArgs.autoTuner()
   private val outputAlignedSQLIds: Boolean = appArgs.outputSqlIdsAligned()
   private val enableDiagnosticViews: Boolean = appArgs.enableDiagnosticViews()
@@ -417,7 +418,7 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea
       profileOutputWriter.writeTable(ProfRemovedBLKMgrView.getLabel, app.removedBMs)
       profileOutputWriter.writeCSVTable(ProfRemovedExecutorView.getLabel, app.removedExecutors)
       profileOutputWriter.writeCSVTable("Unsupported SQL Plan", app.unsupportedOps)
-      Profiler.writeConnectTables(profileOutputWriter, profilerResult.app)
+      Profiler.writeConnectTables(profileOutputWriter, profilerResult.app, writeConnectStatements)
       if (outputAlignedSQLIds) {
         profileOutputWriter.writeTable(
           ProfSQLPlanAlignedView.getLabel, app.sqlCleanedAlignedIds,
@@ -488,13 +489,14 @@ object Profiler {
   * `writeCSVTable` returns early on empty input, so non-Connect apps produce
   * no file at all (matches the behavior of every other per-app table).
   *
-  * Each operation's `statementText` is written to a sidecar file under
-  * `<outputDir>/connect_statements/<operationId>.txt` and the basename is
-  * recorded in the `statementFile` column of `connect_operations.csv`.
+  * When enabled, each operation's `statementText` is written to a sidecar file
+  * under `<outputDir>/connect_statements/<operationId>.txt` and the basename
+  * is recorded in the `statementFile` column of `connect_operations.csv`.
   */
  def writeConnectTables(
      writer: ProfileOutputWriter,
-     app: AppBase): Unit = {
+     app: AppBase,
+     writeStatementSidecars: Boolean = false): Unit = {
    if (!app.isConnectMode) return
    val appId = app.appId
    val sessionRows = app.connectSessions.values.toSeq.sortBy(_.sessionId).map { s =>
@@ -508,8 +510,12 @@ object Profiler {
     }
     writer.writeCSVTable("Connect Sessions", sessionRows)
     val statementFiles: Map[String, String] =
-      ConnectStatementWriter.writeStatementFiles(
-        writer.outputDir, app.connectOperations.values)
+      if (writeStatementSidecars) {
+        ConnectStatementWriter.writeStatementFiles(
+          writer.outputDir, app.connectOperations.values)
+      } else {
+        Map.empty
+      }
     val opRows = app.connectOperations.values.toSeq.sortBy(_.operationId).map { op =>
       ConnectOperationProfileResult.from(
         appId = appId,
diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/Qualification.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/Qualification.scala
index f84ed895e..78784c39a 100644
--- a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/Qualification.scala
+++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/Qualification.scala
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -40,6 +40,7 @@ class Qualification(
     enablePB: Boolean,
     reportSqlLevel: Boolean,
     maxSQLDescLength: Int,
+    writeConnectStatements: Boolean,
     mlOpsEnabled: Boolean,
     penalizeTransitions: Boolean,
     tunerContext: Option[TunerContext],
@@ -155,7 +156,7 @@ class Qualification(
     val dsInfo = AppSubscriber.withSafeValidAttempt(app.appId, app.attemptId) { () =>
       QualRawReportGenerator.generateRawMetricQualViewAndGetDataSourceInfo(
-        outputDir, app)
+        outputDir, app, writeConnectStatements)
     }.getOrElse(Seq.empty)
     val qualSumInfo = app.aggregateStats()
     AppSubscriber.withSafeValidAttempt(app.appId, app.attemptId) { () =>
diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualificationArgs.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualificationArgs.scala
index 31086f995..44465f01d 100644
--- a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualificationArgs.scala
+++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualificationArgs.scala
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -157,6 +157,13 @@ Usage: java -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/*
   val perSql : ScallopOption[Boolean] = opt[Boolean](required = false,
     descr = "Report at the individual SQL query level.")
+  val connectStatements: ScallopOption[Boolean] =
+    toggle("connect-statements",
+      default = Some(false),
+      prefix = "no-",
+      descrYes = "Write Spark Connect statementText sidecar files in raw_metrics. " +
+        "Disabled by default.",
+      descrNo = "Do not write Spark Connect statementText sidecar files.")
   val maxSqlDescLength: ScallopOption[Int] = opt[Int](required = false,
     descr = "Maximum length of the SQL description string output with the " +
diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualificationMain.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualificationMain.scala
index 95773212c..cf4d422c7 100644
--- a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualificationMain.scala
+++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/QualificationMain.scala
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
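
Taken together, the default-off toggles and the threaded writeConnectStatements flag reduce to a single gate on the basename map; a reduced sketch of that control flow (the Op case class and statementFiles helper below are illustrative only):

    // Illustrative-only reduction of the sidecar gating in writeConnectTables.
    object SidecarGatingSketch {
      final case class Op(operationId: String, statementText: String)

      // When disabled, the map is empty, so every statementFile cell renders "".
      def statementFiles(ops: Seq[Op], writeSidecars: Boolean): Map[String, String] =
        if (!writeSidecars) Map.empty
        else ops.collect {
          case op if op.statementText.nonEmpty => op.operationId -> s"${op.operationId}.txt"
        }.toMap

      def main(args: Array[String]): Unit = {
        val ops = Seq(Op("op-1", "SELECT 1"), Op("op-2", ""))
        assert(statementFiles(ops, writeSidecars = false).isEmpty)
        assert(statementFiles(ops, writeSidecars = true) == Map("op-1" -> "op-1.txt"))
      }
    }
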
@@ -119,7 +119,8 @@ object QualificationMain extends Logging { } val qual = new Qualification(outputDirectory, hadoopConf, timeout, nThreads, pluginTypeChecker, - enablePB, reportSqlLevel, maxSQLDescLength, mlOpsEnabled, penalizeTransitions, + enablePB, reportSqlLevel, maxSQLDescLength, appArgs.connectStatements(), + mlOpsEnabled, penalizeTransitions, tunerContext, appArgs.clusterReport(), appArgs.platform(), appArgs.targetClusterInfo.toOption) val res = qual.qualifyApps(filteredLogs) diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala index b3d5519d9..9a3b301c4 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala @@ -315,10 +315,8 @@ object OutHeaderRegistry { "operationCount"), "ConnectOperationProfileResult" -> Array("appID", "operationId", "sessionId", "userId", "jobTag", - "startTime", "analyzeTime", "readyForExecTime", "finishTime", "closeTime", - "failTime", "cancelTime", "durationMs", - "analyzePhaseMs", "planPhaseMs", "execPhaseMs", "resultDeliveryPhaseMs", - "status", "producedRowCount", "errorMessage", - "sqlIds", "jobIds", "statementFile", "statementBytes", "statementTruncated") + "startTime", "finishTime", "closeTime", "failTime", "cancelTime", "durationMs", + "status", "errorMessage", + "sqlIds", "jobIds", "statementFile", "statementTruncated") ) // End of outputHeaders map initialization } diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala index 5729ad48e..d4df4a3a1 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala @@ -64,7 +64,8 @@ object QualRawReportGenerator extends Logging { def generateRawMetricQualViewAndGetDataSourceInfo( rootDir: String, - app: QualificationAppInfo): Seq[DataSourceProfileResult] = { + app: QualificationAppInfo, + writeConnectStatements: Boolean = false): Seq[DataSourceProfileResult] = { val metricsDirectory = s"$rootDir/raw_metrics/${app.appId}" val sqlPlanAnalyzer = AppSQLPlanAnalyzer(app) var dataSourceInfo: Seq[DataSourceProfileResult] = Seq.empty @@ -113,7 +114,7 @@ object QualRawReportGenerator extends Logging { QualRemovedExecutorView.getLabel, QualRemovedExecutorView.getRawView(Seq(app))) // we only need to write the CSV report of the WriteOps pWriter.writeCSVTable(QualWriteOpsView.getLabel, QualWriteOpsView.getRawView(Seq(app))) - Profiler.writeConnectTables(pWriter, app) + Profiler.writeConnectTables(pWriter, app, writeConnectStatements) } catch { case e: Exception => logError(s"Error generating raw metrics for ${app.appId}: ${e.getMessage}") diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResultsSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResultsSuite.scala index 63bab0ef2..edb88ea98 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResultsSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResultsSuite.scala @@ -84,7 +84,7 @@ class ConnectProfileResultsSuite extends AnyFunSuite { assert(opRow.convertToSeq().length == opRow.outputHeaders.length) } - 
test("ConnectOperationProfileResult derives status and phases correctly") { + test("ConnectOperationProfileResult derives status and core columns correctly") { val op = new ConnectOperationInfo( operationId = "op", sessionId = "s", userId = "u", jobTag = "tag", statementText = "SELECT 1", startTime = 100L) @@ -99,16 +99,12 @@ class ConnectProfileResultsSuite extends AnyFunSuite { assert(operationCol(row, "operationId") == "op") assert(operationCol(row, "status") == "SUCCEEDED") assert(operationCol(row, "durationMs") == "500") - assert(operationCol(row, "analyzePhaseMs") == "100") - assert(operationCol(row, "planPhaseMs") == "100") - assert(operationCol(row, "execPhaseMs") == "200") - assert(operationCol(row, "resultDeliveryPhaseMs") == "100") + assert(operationCol(row, "finishTime") == "500") + assert(operationCol(row, "closeTime") == "600") assert(operationCol(row, "sqlIds") == "42") assert(operationCol(row, "jobIds") == "7") assert(operationCol(row, "statementFile") == "op.txt") - assert(operationCol(row, "statementBytes") == "8") assert(operationCol(row, "statementTruncated") == "false") - assert(operationCol(row, "producedRowCount") == "1") } test("ConnectOperationProfileResult derives FAILED status with errorMessage") { @@ -124,10 +120,8 @@ class ConnectProfileResultsSuite extends AnyFunSuite { assert(operationCol(row, "errorMessage") == "boom") assert(operationCol(row, "statementFile") == "") assert(operationCol(row, "durationMs") == "50") - assert(operationCol(row, "analyzePhaseMs") == "-1") + assert(operationCol(row, "failTime") == "150") assert(operationCol(row, "sqlIds") == "") - // producedRowCount is an Option[Long]; absent values render as null. - assert(operationCol(row, "producedRowCount") == null) } test("ConnectOperationProfileResult derives CANCELED status takes priority over FAILED") { diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfilerOutputSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfilerOutputSuite.scala index 9eb983164..bf9f11a50 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfilerOutputSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfilerOutputSuite.scala @@ -153,11 +153,9 @@ class ConnectProfilerOutputSuite extends BaseNoSparkSuite { // Header + 2 op rows assert(opLines.size == 3, s"unexpected operation rows: $opLines") assert(opLines.head == - "appID,operationId,sessionId,userId,jobTag,startTime,analyzeTime," + - "readyForExecTime,finishTime,closeTime,failTime,cancelTime,durationMs," + - "analyzePhaseMs,planPhaseMs,execPhaseMs,resultDeliveryPhaseMs,status," + - "producedRowCount,errorMessage,sqlIds,jobIds,statementFile,statementBytes," + - "statementTruncated", + "appID,operationId,sessionId,userId,jobTag,startTime,finishTime,closeTime," + + "failTime,cancelTime,durationMs,status,errorMessage,sqlIds,jobIds," + + "statementFile,statementTruncated", s"unexpected operation header: ${opLines.head}") // Per-column parse: find the `status` column index from the registry and // assert exactly one SUCCEEDED and one FAILED row. Rows in this test case @@ -175,22 +173,12 @@ class ConnectProfilerOutputSuite extends BaseNoSparkSuite { assert(statusValues.count(_ == "FAILED") == 1, s"expected exactly one FAILED row: $statusValues") - // Task 6: verify sidecar statement files and the statementFile column. - // op-1 had non-empty statementText -> sidecar exists and basename appears - // in the CSV. 
op-2 had empty statementText -> no sidecar and empty cell. + // Sidecars are disabled by default. The statementFile column remains + // empty until the caller opts into writing sidecars. val statementsDir = Paths.get(tmpDir.getAbsolutePath, ConnectStatementWriter.SUB_DIR) - assert(Files.isDirectory(statementsDir), - s"expected $statementsDir directory for op-1 sidecar") - val op1Sidecar = statementsDir.resolve("op-1.txt") - val op2Sidecar = statementsDir.resolve("op-2.txt") - assert(Files.exists(op1Sidecar), s"expected $op1Sidecar to exist") - assert(!Files.exists(op2Sidecar), - s"expected $op2Sidecar not to exist for empty statementText") - val op1Contents = new String(Files.readAllBytes(op1Sidecar), - StandardCharsets.UTF_8) - assert(op1Contents == op1StatementText, - s"sidecar contents mismatch: $op1Contents vs $op1StatementText") + assert(!Files.exists(statementsDir), + s"expected no $statementsDir when sidecars are disabled") val opIdIdx = opHeaders.indexOf("operationId") val statementFileIdx = opHeaders.indexOf("statementFile") assert(statementFileIdx >= 0, @@ -202,8 +190,8 @@ class ConnectProfilerOutputSuite extends BaseNoSparkSuite { val stmtFile = cols(statementFileIdx).stripPrefix("\"").stripSuffix("\"") opId -> stmtFile }.toMap - assert(stmtFileByOp("op-1") == "op-1.txt", - s"expected op-1 statementFile=op-1.txt, got ${stmtFileByOp("op-1")}") + assert(stmtFileByOp("op-1") == "", + s"expected op-1 statementFile empty by default, got ${stmtFileByOp("op-1")}") assert(stmtFileByOp("op-2") == "", s"expected op-2 statementFile empty, got ${stmtFileByOp("op-2")}") } finally { @@ -273,4 +261,69 @@ class ConnectProfilerOutputSuite extends BaseNoSparkSuite { } } } + + test("writeConnectTables writes statement sidecars only when enabled") { + withEventLog(logStartEvent, appStartEvent, envUpdateEvent, appEndEvent) { app => + val op1StatementText = "SELECT 1 plan body" + app.connectSessions.put("sess-1", new ConnectSessionInfo( + sessionId = "sess-1", + userId = "alice", + startTime = 100L, + endTime = Some(500L))) + app.connectOperations.put("op-1", new ConnectOperationInfo( + operationId = "op-1", + sessionId = "sess-1", + userId = "alice", + jobTag = "SparkConnect_OperationTag_User_alice_Session_sess-1_Operation_op-1", + statementText = op1StatementText, + startTime = 110L)) + app.connectOperations.put("op-2", new ConnectOperationInfo( + operationId = "op-2", + sessionId = "sess-1", + userId = "alice", + jobTag = "SparkConnect_OperationTag_User_alice_Session_sess-1_Operation_op-2", + statementText = "", + startTime = 120L)) + + val tmpDir = Files.createTempDirectory("prof-connect-out-").toFile + try { + val writer = new ProfileOutputWriter(tmpDir.getAbsolutePath, "profile", + numOutputRows = 1000, outputCSV = true) + try { + Profiler.writeConnectTables(writer, app, writeStatementSidecars = true) + } finally { + writer.close() + } + + val opLines = readAllLines(Paths.get(tmpDir.getAbsolutePath, "connect_operations.csv")) + val opHeaders = OutHeaderRegistry.outputHeaders("ConnectOperationProfileResult") + val opIdIdx = opHeaders.indexOf("operationId") + val statementFileIdx = opHeaders.indexOf("statementFile") + val stmtFileByOp = opLines.tail.map { line => + val cols = line.split(",", -1) + val opId = cols(opIdIdx).stripPrefix("\"").stripSuffix("\"") + val stmtFile = cols(statementFileIdx).stripPrefix("\"").stripSuffix("\"") + opId -> stmtFile + }.toMap + + val statementsDir = Paths.get(tmpDir.getAbsolutePath, ConnectStatementWriter.SUB_DIR) + val op1Sidecar = 
statementsDir.resolve("op-1.txt") + val op2Sidecar = statementsDir.resolve("op-2.txt") + assert(Files.isDirectory(statementsDir), + s"expected $statementsDir directory when sidecars are enabled") + assert(Files.exists(op1Sidecar), s"expected $op1Sidecar to exist") + assert(!Files.exists(op2Sidecar), + s"expected $op2Sidecar not to exist for empty statementText") + val op1Contents = new String(Files.readAllBytes(op1Sidecar), StandardCharsets.UTF_8) + assert(op1Contents == op1StatementText, + s"sidecar contents mismatch: $op1Contents vs $op1StatementText") + assert(stmtFileByOp("op-1") == "op-1.txt", + s"expected op-1 statementFile=op-1.txt, got ${stmtFileByOp("op-1")}") + assert(stmtFileByOp("op-2") == "", + s"expected op-2 statementFile empty, got ${stmtFileByOp("op-2")}") + } finally { + deleteRecursively(tmpDir.toPath) + } + } + } } diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationConnectOutputSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationConnectOutputSuite.scala index 68833a7fe..605ef93fc 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationConnectOutputSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationConnectOutputSuite.scala @@ -81,7 +81,7 @@ class QualificationConnectOutputSuite extends BaseNoSparkSuite { } } - test("qualification raw metrics emit connect CSVs and statement sidecars") { + test("qualification raw metrics emit connect CSVs and statement sidecars when enabled") { withQualificationApp(logStartEvent, appStartEvent, envUpdateEvent, appEndEvent) { app => app.connectSessions.put("sess-1", new ConnectSessionInfo( sessionId = "sess-1", @@ -117,7 +117,8 @@ class QualificationConnectOutputSuite extends BaseNoSparkSuite { val tmpDir = Files.createTempDirectory("qual-connect-out-") try { - QualRawReportGenerator.generateRawMetricQualViewAndGetDataSourceInfo(tmpDir.toString, app) + QualRawReportGenerator.generateRawMetricQualViewAndGetDataSourceInfo( + tmpDir.toString, app, writeConnectStatements = true) val appDir = tmpDir.resolve("raw_metrics").resolve(app.appId) val sessionsCsv = appDir.resolve("connect_sessions.csv") @@ -183,4 +184,34 @@ class QualificationConnectOutputSuite extends BaseNoSparkSuite { } } } + + test("qualification raw metrics do not emit statement sidecars by default") { + withQualificationApp(logStartEvent, appStartEvent, envUpdateEvent, appEndEvent) { app => + app.connectSessions.put("sess-1", new ConnectSessionInfo( + sessionId = "sess-1", + userId = "alice", + startTime = 100L, + endTime = Some(500L))) + app.connectOperations.put("op-1", new ConnectOperationInfo( + operationId = "op-1", + sessionId = "sess-1", + userId = "alice", + jobTag = "SparkConnect_OperationTag_User_alice_Session_sess-1_Operation_op-1", + statementText = "SELECT 1 plan body", + startTime = 110L)) + + val tmpDir = Files.createTempDirectory("qual-connect-out-") + try { + QualRawReportGenerator.generateRawMetricQualViewAndGetDataSourceInfo(tmpDir.toString, app) + + val appDir = tmpDir.resolve("raw_metrics").resolve(app.appId) + assert(Files.exists(appDir.resolve("connect_operations.csv")), + s"expected connect_operations.csv under $appDir") + assert(!Files.exists(appDir.resolve("connect_statements")), + s"expected no connect_statements directory under $appDir by default") + } finally { + deleteRecursively(tmpDir) + } + } + } } diff --git a/user_tools/src/spark_rapids_tools/api_v1/result_handler.py 
b/user_tools/src/spark_rapids_tools/api_v1/result_handler.py index 1ec1c434e..ad07d9a14 100644 --- a/user_tools/src/spark_rapids_tools/api_v1/result_handler.py +++ b/user_tools/src/spark_rapids_tools/api_v1/result_handler.py @@ -88,6 +88,7 @@ class Meta(ResultHandlerBaseMeta): # pylint: disable=too-few-public-methods readers: Dict[str, ToolReportReader] logger: Optional[Logger] = field(default=None) app_handlers: Dict[str, AppHandler] = field(default_factory=dict, init=False) + _connect_statement_unsafe_chars = re.compile(r'[^A-Za-z0-9._-]') def __post_init__(self): # init the logger if it is not defined @@ -252,6 +253,13 @@ def get_connect_statements_dir(self, app_id: str) -> Optional[BoundedCspPath]: return None return stmt_dir + @classmethod + def _sanitize_connect_operation_id(cls, operation_id: str) -> str: + """ + Sanitize operation IDs to the on-disk basename convention used by the Scala writer. + """ + return cls._connect_statement_unsafe_chars.sub('_', operation_id) + def list_connect_statement_ops(self, app_id: str) -> List[str]: """ Return sorted operation IDs for all statement sidecars under connect_statements/. @@ -274,7 +282,8 @@ def load_connect_statement(self, app_id: str, operation_id: str) -> Optional[str stmt_dir = self.get_connect_statements_dir(app_id) if stmt_dir is None: return None - sub_path = stmt_dir.create_sub_path(f'{operation_id}.txt') + safe_operation_id = self._sanitize_connect_operation_id(operation_id) + sub_path = stmt_dir.create_sub_path(f'{safe_operation_id}.txt') if not sub_path.exists(): return None txt_res = DataUtils.load_txt(sub_path) diff --git a/user_tools/tests/spark_rapids_tools_ut/api/test_connect_e2e.py b/user_tools/tests/spark_rapids_tools_ut/api/test_connect_e2e.py index 117ac5c9e..1ec89f338 100644 --- a/user_tools/tests/spark_rapids_tools_ut/api/test_connect_e2e.py +++ b/user_tools/tests/spark_rapids_tools_ut/api/test_connect_e2e.py @@ -31,10 +31,8 @@ class TestConnectE2E(unittest.TestCase): expected_statement = 'common { plan_id: 0 } range { start: 0 end: 100 step: 1 }\n' expected_operation_columns = [ 'appID', 'operationId', 'sessionId', 'userId', 'jobTag', 'startTime', - 'analyzeTime', 'readyForExecTime', 'finishTime', 'closeTime', 'failTime', - 'cancelTime', 'durationMs', 'analyzePhaseMs', 'planPhaseMs', 'execPhaseMs', - 'resultDeliveryPhaseMs', 'status', 'producedRowCount', 'errorMessage', - 'sqlIds', 'jobIds', 'statementFile', 'statementBytes', 'statementTruncated' + 'finishTime', 'closeTime', 'failTime', 'cancelTime', 'durationMs', 'status', + 'errorMessage', 'sqlIds', 'jobIds', 'statementFile', 'statementTruncated' ] def setUp(self): diff --git a/user_tools/tests/spark_rapids_tools_ut/api/test_connect_helpers.py b/user_tools/tests/spark_rapids_tools_ut/api/test_connect_helpers.py index 3e70ec6c0..bd1cf5648 100644 --- a/user_tools/tests/spark_rapids_tools_ut/api/test_connect_helpers.py +++ b/user_tools/tests/spark_rapids_tools_ut/api/test_connect_helpers.py @@ -42,6 +42,8 @@ def setUp(self): fh.write('SELECT 1') with open(os.path.join(self.statements_dir, 'op-2.txt'), 'w', encoding='utf-8') as fh: fh.write('SELECT 2') + with open(os.path.join(self.app_dir, 'secret.txt'), 'w', encoding='utf-8') as fh: + fh.write('SECRET') def tearDown(self): shutil.rmtree(self.temp_dir, ignore_errors=True) @@ -66,3 +68,7 @@ def test_load_connect_statement_missing_returns_none(self): handler = ProfCore(self.prof_output) self.assertIsNone(handler.load_connect_statement(self.sample_app_id, 'missing-op')) 
self.assertIsNone(handler.load_connect_statement('missing-app', 'op-1')) + + def test_load_connect_statement_sanitizes_operation_id_before_reading(self): + handler = ProfCore(self.prof_output) + self.assertIsNone(handler.load_connect_statement(self.sample_app_id, '../secret')) diff --git a/user_tools/tests/spark_rapids_tools_ut/resources/connect_e2e/rapids_4_spark_profile/local-connect-e2e/connect_operations.csv b/user_tools/tests/spark_rapids_tools_ut/resources/connect_e2e/rapids_4_spark_profile/local-connect-e2e/connect_operations.csv index 34dc424fb..478916abf 100644 --- a/user_tools/tests/spark_rapids_tools_ut/resources/connect_e2e/rapids_4_spark_profile/local-connect-e2e/connect_operations.csv +++ b/user_tools/tests/spark_rapids_tools_ut/resources/connect_e2e/rapids_4_spark_profile/local-connect-e2e/connect_operations.csv @@ -1,3 +1,3 @@ -appID,operationId,sessionId,userId,jobTag,startTime,analyzeTime,readyForExecTime,finishTime,closeTime,failTime,cancelTime,durationMs,analyzePhaseMs,planPhaseMs,execPhaseMs,resultDeliveryPhaseMs,status,producedRowCount,errorMessage,sqlIds,jobIds,statementFile,statementBytes,statementTruncated -local-connect-e2e,op-bbb-222,sess-aaa-111,userA,SparkConnect_OperationTag_User_userA_Session_sess-aaa-111_Operation_op-bbb-222,120000,121000,121500,125000,125500,,,5500,1000,500,3500,500,SUCCEEDED,10,,42,7,op-bbb-222.txt,57,false -local-connect-e2e,op-ccc-333,sess-aaa-111,userA,SparkConnect_OperationTag_User_userA_Session_sess-aaa-111_Operation_op-ccc-333,130000,,,,,131000,,1000,-1,-1,-1,-1,FAILED,,boom,,,,0,false +appID,operationId,sessionId,userId,jobTag,startTime,finishTime,closeTime,failTime,cancelTime,durationMs,status,errorMessage,sqlIds,jobIds,statementFile,statementTruncated +local-connect-e2e,op-bbb-222,sess-aaa-111,userA,SparkConnect_OperationTag_User_userA_Session_sess-aaa-111_Operation_op-bbb-222,120000,125000,125500,,,5500,SUCCEEDED,,42,7,op-bbb-222.txt,false +local-connect-e2e,op-ccc-333,sess-aaa-111,userA,SparkConnect_OperationTag_User_userA_Session_sess-aaa-111_Operation_op-ccc-333,130000,,,131000,,1000,FAILED,boom,,,,false From ff0ffda6317d4da6d7a4171044b09a4a463c5c03 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 15:08:11 -0700 Subject: [PATCH 16/19] test(connect): clarify truncation marker semantics Signed-off-by: Sayed Bilal Bari --- .../resources/configs/reports/connectReport.yaml | 6 +++++- .../tool/profiling/ConnectProfileResults.scala | 5 +++++ .../profiling/ConnectProfileResultsSuite.scala | 16 ++++++++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/core/src/main/resources/configs/reports/connectReport.yaml b/core/src/main/resources/configs/reports/connectReport.yaml index 7205d6791..dc7314de8 100644 --- a/core/src/main/resources/configs/reports/connectReport.yaml +++ b/core/src/main/resources/configs/reports/connectReport.yaml @@ -106,7 +106,11 @@ reportDefinitions: description: Sidecar filename under connect_statements/ for this operation, if written. - name: statementTruncated dataType: Boolean - description: True when statementText includes Spark's truncation marker. + description: >- + True when statementText includes Spark's textual truncation + marker. This does not detect Spark 4.x depth-based subtree + elision, which collapses nested structures to {} without a + marker. - label: connectStatements description: >- Directory of per-operation statementText sidecars. 
Each diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResults.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResults.scala index ec9163dba..4d3916fa1 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResults.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResults.scala @@ -146,6 +146,11 @@ object ConnectOperationProfileResult { * a statement/plan text exceeds its configured limit. Presence of this marker * in `statementText` indicates the artifact we persist is a truncated * representation of the original plan. + * + * Note: Spark 4.x also performs depth-based structural elision by collapsing + * subtrees to `{}` once the protobuf text formatter exceeds its nesting cap. + * Those cases do not emit a textual marker, so `statementTruncated` is + * intentionally limited to marker-based truncation only. */ private[profiling] val TruncationMarker: String = "[truncated(size=" diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResultsSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResultsSuite.scala index edb88ea98..a93f4a8b8 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResultsSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResultsSuite.scala @@ -154,6 +154,22 @@ class ConnectProfileResultsSuite extends AnyFunSuite { assert(operationCol(row, "statementTruncated") == "true") } + test("ConnectOperationProfileResult does not infer truncation from structural elision") { + val op = new ConnectOperationInfo( + operationId = "op-struct", sessionId = "s", userId = "u", + jobTag = "tag", + statementText = + """filter { + | input { + | common {} + | join {} + | } + |}""".stripMargin, + startTime = 100L) + val row = ConnectOperationProfileResult.from("app", op, Seq.empty, Seq.empty, None) + assert(operationCol(row, "statementTruncated") == "false") + } + test("ConnectOperationProfileResult.convertToCSVSeq wraps string fields in quotes") { val op = new ConnectOperationInfo( operationId = "op,x", sessionId = "s\"q", userId = "u1", From ed5a4cfd2da13474ca0614b9f25d97c067b7fe17 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 15:22:12 -0700 Subject: [PATCH 17/19] fix(connect): address greptile review feedback - ConnectStatementWriter: use Hadoop FileSystem instead of java.nio.file so sidecar writes work for HDFS/S3/GCS, not just local paths - Profiler.writeConnectTables: precompute opCountBySession map to avoid O(sessions * operations) scan when emitting connect_sessions.csv - EventProcessorBase: filter empty tokens when splitting spark.job.tags - Thread hadoopConf from Qualification/Profiler through QualRawReportGenerator to the sidecar writer --- .../profiling/ConnectStatementWriter.scala | 49 +++++++++++++------ .../rapids/tool/profiling/Profiler.scala | 16 ++++-- .../tool/qualification/Qualification.scala | 2 +- .../tool/views/QualRawReportGenerator.scala | 6 ++- .../sql/rapids/tool/EventProcessorBase.scala | 2 +- 5 files changed, 52 insertions(+), 23 deletions(-) diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriter.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriter.scala index bbb9ee731..78aed7fe8 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriter.scala +++ 
b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriter.scala
@@ -17,7 +17,9 @@
 package com.nvidia.spark.rapids.tool.profiling
 
 import java.nio.charset.StandardCharsets
-import java.nio.file.{Files, Paths}
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
 
 import org.apache.spark.internal.Logging
 
@@ -33,6 +35,9 @@ import org.apache.spark.internal.Logging
  * non-empty statement, so apps with no statements at all do not produce an
  * empty directory. Per-file IO errors are logged and skipped; they do not
  * abort the batch.
+ *
+ * Writes go through Hadoop `FileSystem` so the same code works for local
+ * paths, HDFS, S3, GCS, etc. — matching every other per-app output file.
  */
 object ConnectStatementWriter extends Logging {
 
@@ -48,31 +53,45 @@ object ConnectStatementWriter extends Logging {
    * Writes each operation's `statementText` to
    * `<rootDir>/connect_statements/<operationId>.txt` when non-empty.
    *
-   * @param rootDir per-app output directory (already exists)
-   * @param ops     operations to persist
-   * @return map of `operationId -> "<operationId>.txt"` basenames for the
-   *         operations whose sidecar file was written successfully.
+   * @param rootDir    per-app output directory (already exists)
+   * @param ops        operations to persist
+   * @param hadoopConf Hadoop configuration used to resolve the target
+   *                   filesystem. When `None`, a fresh `Configuration` is
+   *                   used (which resolves the default filesystem, typically
+   *                   local).
+   * @return map of `operationId -> "<operationId>.txt"` basenames for
+   *         the operations whose sidecar file was written successfully.
    */
   def writeStatementFiles(
       rootDir: String,
-      ops: Iterable[ConnectOperationInfo]): Map[String, String] = {
-    val subDirPath = Paths.get(rootDir, SUB_DIR).toAbsolutePath.normalize()
+      ops: Iterable[ConnectOperationInfo],
+      hadoopConf: Option[Configuration] = None): Map[String, String] = {
+    val conf = hadoopConf.getOrElse(new Configuration())
+    val subDirPath = new Path(rootDir, SUB_DIR)
+    val fs = subDirPath.getFileSystem(conf)
     var subDirCreated = false
     val builder = Map.newBuilder[String, String]
     ops.foreach { op =>
       val text = op.statementText
       if (text.nonEmpty) {
         try {
-          if (!subDirCreated) {
-            Files.createDirectories(subDirPath)
-            subDirCreated = true
-          }
           val safeId = sanitizeOperationId(op.operationId)
           val basename = s"$safeId$FILE_EXTENSION"
-          val target = subDirPath.resolve(basename).normalize()
-          require(target.startsWith(subDirPath),
+          val target = new Path(subDirPath, basename)
+          // Defense-in-depth containment check. Sanitization already removes
+          // `/` and `..`, but verify the resolved parent matches.
+ require(target.getParent == subDirPath, s"Refusing to write Connect statement sidecar outside $subDirPath: $target") - Files.write(target, text.getBytes(StandardCharsets.UTF_8)) + if (!subDirCreated) { + fs.mkdirs(subDirPath) + subDirCreated = true + } + val out = fs.create(target, /* overwrite = */ true) + try { + out.write(text.getBytes(StandardCharsets.UTF_8)) + } finally { + out.close() + } builder += (op.operationId -> basename) } catch { case e: Exception => @@ -83,4 +102,4 @@ object ConnectStatementWriter extends Logging { } builder.result() } -} +} \ No newline at end of file diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala index a25ca7109..6d0730370 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala @@ -418,7 +418,8 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea profileOutputWriter.writeTable(ProfRemovedBLKMgrView.getLabel, app.removedBMs) profileOutputWriter.writeCSVTable(ProfRemovedExecutorView.getLabel, app.removedExecutors) profileOutputWriter.writeCSVTable("Unsupported SQL Plan", app.unsupportedOps) - Profiler.writeConnectTables(profileOutputWriter, profilerResult.app, writeConnectStatements) + Profiler.writeConnectTables(profileOutputWriter, profilerResult.app, + writeConnectStatements, Some(hadoopConf)) if (outputAlignedSQLIds) { profileOutputWriter.writeTable( ProfSQLPlanAlignedView.getLabel, app.sqlCleanedAlignedIds, @@ -496,9 +497,16 @@ object Profiler { def writeConnectTables( writer: ProfileOutputWriter, app: AppBase, - writeStatementSidecars: Boolean = false): Unit = { + writeStatementSidecars: Boolean = false, + hadoopConf: Option[Configuration] = None): Unit = { if (!app.isConnectMode) return val appId = app.appId + // Group once so the per-session operation count is O(operations) overall + // instead of O(sessions * operations). 
+ val opCountBySession: Map[String, Long] = + app.connectOperations.values.groupBy(_.sessionId).map { case (sid, ops) => + sid -> ops.size.toLong + } val sessionRows = app.connectSessions.values.toSeq.sortBy(_.sessionId).map { s => ConnectSessionProfileResult( appId = appId, @@ -506,13 +514,13 @@ object Profiler { userId = s.userId, startTime = s.startTime, endTime = s.endTime, - operationCount = app.connectOperations.values.count(_.sessionId == s.sessionId).toLong) + operationCount = opCountBySession.getOrElse(s.sessionId, 0L)) } writer.writeCSVTable("Connect Sessions", sessionRows) val statementFiles: Map[String, String] = if (writeStatementSidecars) { ConnectStatementWriter.writeStatementFiles( - writer.outputDir, app.connectOperations.values) + writer.outputDir, app.connectOperations.values, hadoopConf) } else { Map.empty } diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/Qualification.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/Qualification.scala index 78784c39a..249f4e470 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/Qualification.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/qualification/Qualification.scala @@ -156,7 +156,7 @@ class Qualification( val dsInfo = AppSubscriber.withSafeValidAttempt(app.appId, app.attemptId) { () => QualRawReportGenerator.generateRawMetricQualViewAndGetDataSourceInfo( - outputDir, app, writeConnectStatements) + outputDir, app, writeConnectStatements, Some(hadoopConf)) }.getOrElse(Seq.empty) val qualSumInfo = app.aggregateStats() AppSubscriber.withSafeValidAttempt(app.appId, app.attemptId) { () => diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala index d4df4a3a1..80ed0587a 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala @@ -18,6 +18,7 @@ package com.nvidia.spark.rapids.tool.views import com.nvidia.spark.rapids.tool.analysis.{AggRawMetricsResult, AppSQLPlanAnalyzer, QualSparkMetricsAggregator} import com.nvidia.spark.rapids.tool.profiling.{DataSourceProfileResult, ProfileOutputWriter, Profiler, ProfileResult, SQLAccumProfileResults} +import org.apache.hadoop.conf.Configuration import org.apache.spark.internal.Logging import org.apache.spark.sql.rapids.tool.qualification.QualificationAppInfo @@ -65,7 +66,8 @@ object QualRawReportGenerator extends Logging { def generateRawMetricQualViewAndGetDataSourceInfo( rootDir: String, app: QualificationAppInfo, - writeConnectStatements: Boolean = false): Seq[DataSourceProfileResult] = { + writeConnectStatements: Boolean = false, + hadoopConf: Option[Configuration] = None): Seq[DataSourceProfileResult] = { val metricsDirectory = s"$rootDir/raw_metrics/${app.appId}" val sqlPlanAnalyzer = AppSQLPlanAnalyzer(app) var dataSourceInfo: Seq[DataSourceProfileResult] = Seq.empty @@ -114,7 +116,7 @@ object QualRawReportGenerator extends Logging { QualRemovedExecutorView.getLabel, QualRemovedExecutorView.getRawView(Seq(app))) // we only need to write the CSV report of the WriteOps pWriter.writeCSVTable(QualWriteOpsView.getLabel, QualWriteOpsView.getRawView(Seq(app))) - Profiler.writeConnectTables(pWriter, app, writeConnectStatements) + Profiler.writeConnectTables(pWriter, app, writeConnectStatements, hadoopConf) } catch { case e: Exception => logError(s"Error 
generating raw metrics for ${app.appId}: ${e.getMessage}") diff --git a/core/src/main/scala/org/apache/spark/sql/rapids/tool/EventProcessorBase.scala b/core/src/main/scala/org/apache/spark/sql/rapids/tool/EventProcessorBase.scala index b554d184c..14fdd4d54 100644 --- a/core/src/main/scala/org/apache/spark/sql/rapids/tool/EventProcessorBase.scala +++ b/core/src/main/scala/org/apache/spark/sql/rapids/tool/EventProcessorBase.scala @@ -435,7 +435,7 @@ abstract class EventProcessorBase[T <: AppBase](app: T) extends SparkListener wi if (app.isConnectMode) { val tagStr = event.properties.getProperty("spark.job.tags") if (tagStr != null && tagStr.nonEmpty) { - tagStr.split(",").iterator.map(_.trim).foreach { tag => + tagStr.split(",").iterator.map(_.trim).filter(_.nonEmpty).foreach { tag => app.jobTagToConnectOpId.get(tag).foreach { opId => app.operationIdToJobIds .getOrElseUpdate(opId, mutable.HashSet.empty[Int]) From 623b46477689b743f8930907734644a800f7cd56 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 16:03:41 -0700 Subject: [PATCH 18/19] fix(connect): scalastyle, scaladoc, and audit cleanups - ConnectStatementWriter: route writes through ToolTextFileWriter and ensure trailing newline, resolving both scalastyle errors flagged by pre-merge CI. - Fix scaladoc link warnings by using fully qualified names for ConnectSessionInfo / ConnectOperationInfo / OutHeaderRegistry and drop the @throws doc tag on EventUtils (annotation is retained). - Correct stale sidecar path in ConnectProfileResults doc. - Rewrite Connect test suite headers to remove agent-voice / incremental-step framing. - Tighten list_connect_statement_ops docstring in result_handler. - Expose get_table_path / get_per_app_table_path on APIResHandler and cover them with test_connect_helpers tests for wrapper/core handlers. --- .../profiling/ConnectProfileResults.scala | 10 ++-- .../profiling/ConnectStatementWriter.scala | 27 +++++------ .../tool/util/ConnectEventHandler.scala | 3 +- .../sql/rapids/tool/util/EventUtils.scala | 2 +- .../profiling/ConnectCorrelationSuite.scala | 11 +++-- .../ConnectProfilerOutputSuite.scala | 8 ++-- .../ConnectStatementWriterSuite.scala | 7 ++- .../src/spark_rapids_tools/api_v1/builder.py | 6 +++ .../api_v1/result_handler.py | 16 +++++-- .../api/test_connect_helpers.py | 46 +++++++++++++++---- 10 files changed, 88 insertions(+), 48 deletions(-) diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResults.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResults.scala index 4d3916fa1..1ef0e3e4f 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResults.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfileResults.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.rapids.tool.util.StringUtils /** * CSV row for a Spark Connect session. Serializes the lifecycle metadata for a * single session into the columns registered under - * `ConnectSessionProfileResult` in [[OutHeaderRegistry]]. + * `ConnectSessionProfileResult` in [[com.nvidia.spark.rapids.tool.views.OutHeaderRegistry]]. * * `durationMs` is `endTime - startTime` when `endTime` is defined, else `-1` * (matches the convention used for open/unfinished sessions in other result @@ -67,11 +67,11 @@ case class ConnectSessionProfileResult( /** * CSV row for a single Spark Connect operation. 
Captures the core lifecycle
  * (start/finish/close/fail/cancel timestamps), derived status, error message,
- * and the joined sqlIDs/jobIDs. Also captures statement-file provenance for
- * the separate `statements/<operationId>.txt` artifact.
+ * and the joined sqlIDs/jobIDs. Also records the sidecar basename for the
+ * `connect_statements/<operationId>.txt` artifact.
  *
- * sqlIds and jobIds are serialized semicolon-separated (to keep the CSV
- * single-column and avoid quoting issues).
+ * sqlIds and jobIds are serialized semicolon-separated to keep the CSV
+ * single-column and avoid quoting issues.
  */
 case class ConnectOperationProfileResult(
     appId: String,
diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriter.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriter.scala
index 78aed7fe8..1560eeb4a 100644
--- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriter.scala
+++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriter.scala
@@ -16,8 +16,7 @@
 
 package com.nvidia.spark.rapids.tool.profiling
 
-import java.nio.charset.StandardCharsets
-
+import com.nvidia.spark.rapids.tool.ToolTextFileWriter
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 
@@ -36,8 +35,9 @@ import org.apache.spark.internal.Logging
  * empty directory. Per-file IO errors are logged and skipped; they do not
  * abort the batch.
  *
- * Writes go through Hadoop `FileSystem` so the same code works for local
- * paths, HDFS, S3, GCS, etc. — matching every other per-app output file.
+ * Writes go through [[ToolTextFileWriter]] so the same UTF-8, permissions, and
+ * local/raw-filesystem behavior used by the rest of the tools output applies
+ * here as well.
  */
 object ConnectStatementWriter extends Logging {
 
@@ -66,10 +66,7 @@ object ConnectStatementWriter extends Logging {
       rootDir: String,
       ops: Iterable[ConnectOperationInfo],
       hadoopConf: Option[Configuration] = None): Map[String, String] = {
-    val conf = hadoopConf.getOrElse(new Configuration())
     val subDirPath = new Path(rootDir, SUB_DIR)
-    val fs = subDirPath.getFileSystem(conf)
-    var subDirCreated = false
     val builder = Map.newBuilder[String, String]
     ops.foreach { op =>
       val text = op.statementText
@@ -82,15 +79,15 @@ object ConnectStatementWriter extends Logging {
         val safeId = sanitizeOperationId(op.operationId)
         val basename = s"$safeId$FILE_EXTENSION"
         val target = new Path(subDirPath, basename)
         // Defense-in-depth containment check. Sanitization already removes
         // `/` and `..`, but verify the resolved parent matches.
require(target.getParent == subDirPath, s"Refusing to write Connect statement sidecar outside $subDirPath: $target") - if (!subDirCreated) { - fs.mkdirs(subDirPath) - subDirCreated = true - } - val out = fs.create(target, /* overwrite = */ true) + val writer = new ToolTextFileWriter( + subDirPath.toString, + basename, + s"Connect statement sidecar for operation ${op.operationId}", + hadoopConf) try { - out.write(text.getBytes(StandardCharsets.UTF_8)) + writer.write(text) } finally { - out.close() + writer.close() } builder += (op.operationId -> basename) } catch { @@ -102,4 +99,4 @@ object ConnectStatementWriter extends Logging { } builder.result() } -} \ No newline at end of file +} diff --git a/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/ConnectEventHandler.scala b/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/ConnectEventHandler.scala index ead26780c..8f23f7a14 100644 --- a/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/ConnectEventHandler.scala +++ b/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/ConnectEventHandler.scala @@ -32,7 +32,8 @@ import org.apache.spark.sql.rapids.tool.AppBase * directly imported to preserve compatibility with older Spark profiles. * Events are identified by class-name prefix, fields are extracted via * cached reflective accessors in [[EventUtils]], and results are stored - * in [[ConnectSessionInfo]] and [[ConnectOperationInfo]]. + * in [[com.nvidia.spark.rapids.tool.profiling.ConnectSessionInfo]] and + * [[com.nvidia.spark.rapids.tool.profiling.ConnectOperationInfo]]. */ object ConnectEventHandler extends Logging { diff --git a/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/EventUtils.scala b/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/EventUtils.scala index ece78b5fe..d770e9e62 100644 --- a/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/EventUtils.scala +++ b/core/src/main/scala/org/apache/spark/sql/rapids/tool/util/EventUtils.scala @@ -365,7 +365,7 @@ object EventUtils extends Logging { /** * Invoke a no-arg method on an object, caching the Method reference. - * @throws NoSuchMethodException if the method does not exist on the object's class. + * Throws NoSuchMethodException if the method does not exist on the object's class. */ @throws[NoSuchMethodException] def invokeMethodOnEvent(event: AnyRef, methodName: String): Any = { diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectCorrelationSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectCorrelationSuite.scala index 84c8a84b9..b6ac574c3 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectCorrelationSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectCorrelationSuite.scala @@ -32,11 +32,12 @@ import org.apache.spark.sql.rapids.tool.profiling.ApplicationInfo import org.apache.spark.sql.rapids.tool.util.RapidsToolsConfUtil /** - * Tests for Spark Connect sqlID/jobID correlation indexes on AppBase. - * Task 1 verifies the reverse-index HashMaps exist and are initialized - * empty on a fresh app. Task 2 verifies operationIdToSqlIds is populated - * from SparkListenerSQLExecutionStart.jobTags. Task 3 will populate - * operationIdToJobIds from SparkListenerJobStart.properties["spark.job.tags"]. + * Tests for Spark Connect sqlID/jobID correlation indexes on `AppBase`. 
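+ *
+ * A sketch of the correlation flow under test (tag, operation, and ID values
+ * are illustrative, not taken from a real event log):
+ * {{{
+ *   // given jobTagToConnectOpId("tag-A") == "op-1":
+ *   //   SQLExecutionStart(executionId = 7, jobTags = Set("tag-A"))
+ *   //     => operationIdToSqlIds("op-1") == Set(7L)
+ *   //   JobStart(jobId = 3, properties("spark.job.tags") = "tag-A,tag-B")
+ *   //     => operationIdToJobIds("op-1") == Set(3)
+ * }}}
+ *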
+ * Covers:
+ * - the reverse-index `HashMap`s exist and are initialized empty on a fresh app,
+ * - `operationIdToSqlIds` is populated from `SparkListenerSQLExecutionStart.jobTags`,
+ * - `operationIdToJobIds` is populated from
+ *   `SparkListenerJobStart.properties["spark.job.tags"]`.
  */
 class ConnectCorrelationSuite extends BaseNoSparkSuite {
 
diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfilerOutputSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfilerOutputSuite.scala
index bf9f11a50..d3ea099eb 100644
--- a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfilerOutputSuite.scala
+++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectProfilerOutputSuite.scala
@@ -31,10 +31,10 @@ import org.apache.spark.sql.rapids.tool.profiling.ApplicationInfo
 import org.apache.spark.sql.rapids.tool.util.{RapidsToolsConfUtil, UTF8Source}
 
 /**
- * Tests Task 5 of Spark Connect Phase 3 (#2065): wiring
- * [[ConnectSessionProfileResult]] / [[ConnectOperationProfileResult]] into the
- * Profiler's per-app CSV output. Verifies that `connect_sessions.csv` and
- * `connect_operations.csv` are produced in Connect mode and absent otherwise.
+ * Tests that [[ConnectSessionProfileResult]] and
+ * [[ConnectOperationProfileResult]] are wired into the Profiler's per-app CSV
+ * output — `connect_sessions.csv` and `connect_operations.csv` are produced in
+ * Connect mode and absent otherwise.
  */
 class ConnectProfilerOutputSuite extends BaseNoSparkSuite {
 
diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriterSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriterSuite.scala
index cd703fa03..b16612eb6 100644
--- a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriterSuite.scala
+++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ConnectStatementWriterSuite.scala
@@ -23,10 +23,9 @@ import java.util.Comparator
 
 import org.scalatest.funsuite.AnyFunSuite
 
 /**
- * Tests Task 6 of Spark Connect Phase 3 (#2065): writing each operation's
- * `statementText` to a sidecar file under
- * `<rootDir>/connect_statements/<opId>.txt` and returning
- * the basename map used to populate the `statementFile` column in
+ * Tests for [[ConnectStatementWriter]]: writes each operation's `statementText`
+ * to a sidecar at `<rootDir>/connect_statements/<sanitizedOpId>.txt`
+ * and returns the basename map used to populate the `statementFile` column in
  * `connect_operations.csv`.
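 *
 * Expected on-disk layout for two operations (names are illustrative; the
 * second shows the `[^A-Za-z0-9._-]` -> `_` basename sanitization):
 * {{{
 *   <rootDir>/connect_statements/op-1.txt   // statementText of "op-1"
 *   <rootDir>/connect_statements/op_2.txt   // statementText of "op 2"
 * }}}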
 */
 class ConnectStatementWriterSuite extends AnyFunSuite {
 
diff --git a/user_tools/src/spark_rapids_tools/api_v1/builder.py b/user_tools/src/spark_rapids_tools/api_v1/builder.py
index 3c5dcedcc..9dcf79211 100644
--- a/user_tools/src/spark_rapids_tools/api_v1/builder.py
+++ b/user_tools/src/spark_rapids_tools/api_v1/builder.py
@@ -1169,6 +1169,12 @@ def out_path(self) -> Optional[BoundedCspPath]:
     def is_empty(self) -> bool:
         return self._res_h.is_empty()
 
+    def get_table_path(self, table_label: str) -> Optional[BoundedCspPath]:
+        return self._res_h.get_table_path(table_label)
+
+    def get_per_app_table_path(self, table_label: str, app_id: str) -> Optional[BoundedCspPath]:
+        return self._res_h.get_per_app_table_path(table_label, app_id)
+
     def get_raw_metrics_path(self) -> Optional[BoundedCspPath]:
         return self._res_h.get_raw_metrics_path()
 
diff --git a/user_tools/src/spark_rapids_tools/api_v1/result_handler.py b/user_tools/src/spark_rapids_tools/api_v1/result_handler.py
index ad07d9a14..2e86b7d06 100644
--- a/user_tools/src/spark_rapids_tools/api_v1/result_handler.py
+++ b/user_tools/src/spark_rapids_tools/api_v1/result_handler.py
@@ -227,7 +227,7 @@ def is_empty(self) -> bool:
     def get_raw_metrics_path(self) -> Optional[BoundedCspPath]:
         return self.get_reader_path('coreRawMetrics')
 
-    def _get_per_app_table_path(self, table_label: str, app_id: str) -> Optional[BoundedCspPath]:
+    def get_per_app_table_path(self, table_label: str, app_id: str) -> Optional[BoundedCspPath]:
         """
         Resolve the per-application path for a table definition.
         :param table_label: Label of the table definition.
@@ -248,7 +248,7 @@ def get_connect_statements_dir(self, app_id: str) -> Optional[BoundedCspPath]:
         """
         Return the connect_statements directory for a given application, if present.
         """
-        stmt_dir = self._get_per_app_table_path('connectStatements', app_id)
+        stmt_dir = self.get_per_app_table_path('connectStatements', app_id)
         if stmt_dir is None or not stmt_dir.exists():
             return None
         return stmt_dir
@@ -262,7 +262,17 @@ def _sanitize_connect_operation_id(cls, operation_id: str) -> str:
 
     def list_connect_statement_ops(self, app_id: str) -> List[str]:
         """
-        Return sorted operation IDs for all statement sidecars under connect_statements/.
+        List the sanitized operation IDs for all statement sidecars of an app.
+
+        Each file under ``<output_dir>/<app_id>/connect_statements/*.txt`` contributes
+        one entry (``.txt`` stripped). Operation IDs are sanitized to match the on-disk
+        basename: characters outside ``[A-Za-z0-9._-]`` are replaced with ``_``. Use
+        ``connect_operations.csv`` and its ``statementFile`` column to recover the
+        original operation IDs.
+
+        :param app_id: Spark application ID whose sidecar directory should be listed.
+        :return: Sorted list of sanitized operation IDs, or an empty list when no
+            ``connect_statements/`` directory exists for the app.
         """
         stmt_dir = self.get_connect_statements_dir(app_id)
         if stmt_dir is None:
diff --git a/user_tools/tests/spark_rapids_tools_ut/api/test_connect_helpers.py b/user_tools/tests/spark_rapids_tools_ut/api/test_connect_helpers.py
index bd1cf5648..c704d5e5a 100644
--- a/user_tools/tests/spark_rapids_tools_ut/api/test_connect_helpers.py
+++ b/user_tools/tests/spark_rapids_tools_ut/api/test_connect_helpers.py
@@ -12,14 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
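
# A minimal usage sketch of the handler API exercised below; the output
# directory and application ID are illustrative:
#
#     core = ProfCore('/out/rapids_4_spark_profile')
#     core.get_per_app_table_path('connectStatements', 'app-123')
#     #   -> .../rapids_4_spark_profile/app-123/connect_statements
#     core.list_connect_statement_ops('app-123')
#     #   -> ['op-1', 'op-2']  (sorted, sanitized basenames with '.txt' stripped)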
-"""Tests for Connect-specific ResultHandler helpers.""" +"""Tests for generic artifact-path lookups and Connect-specific helpers.""" import os import shutil import tempfile import unittest -from spark_rapids_tools.api_v1 import ProfCore +from spark_rapids_tools.api_v1 import ProfCore, ProfWrapper class TestConnectHelpers(unittest.TestCase): @@ -30,18 +30,27 @@ class TestConnectHelpers(unittest.TestCase): def setUp(self): self.temp_dir = tempfile.mkdtemp() self.prof_output = os.path.join(self.temp_dir, 'rapids_4_spark_profile') + self.prof_wrapper_output = os.path.join(self.temp_dir, 'prof_20260422010101_deadbeef') + self.prof_wrapper_core_output = os.path.join(self.prof_wrapper_output, 'rapids_4_spark_profile') self.app_dir = os.path.join(self.prof_output, self.sample_app_id) self.statements_dir = os.path.join(self.app_dir, 'connect_statements') os.makedirs(self.statements_dir, exist_ok=True) + os.makedirs(os.path.join( + self.prof_wrapper_core_output, + self.sample_app_id, + 'connect_statements' + ), exist_ok=True) - with open(os.path.join(self.prof_output, 'profiling_status.csv'), 'w', encoding='utf-8') as fh: - fh.write('Event Log,Status,App ID,Attempt ID,App Name,Description\n') - fh.write(f'/path/to/eventlog,SUCCESS,{self.sample_app_id},0,ProfTest,ok\n') - - with open(os.path.join(self.statements_dir, 'op-1.txt'), 'w', encoding='utf-8') as fh: - fh.write('SELECT 1') - with open(os.path.join(self.statements_dir, 'op-2.txt'), 'w', encoding='utf-8') as fh: - fh.write('SELECT 2') + for base_dir in (self.prof_output, self.prof_wrapper_core_output): + app_dir = os.path.join(base_dir, self.sample_app_id) + statements_dir = os.path.join(app_dir, 'connect_statements') + with open(os.path.join(base_dir, 'profiling_status.csv'), 'w', encoding='utf-8') as fh: + fh.write('Event Log,Status,App ID,Attempt ID,App Name,Description\n') + fh.write(f'/path/to/eventlog,SUCCESS,{self.sample_app_id},0,ProfTest,ok\n') + with open(os.path.join(statements_dir, 'op-1.txt'), 'w', encoding='utf-8') as fh: + fh.write('SELECT 1') + with open(os.path.join(statements_dir, 'op-2.txt'), 'w', encoding='utf-8') as fh: + fh.write('SELECT 2') with open(os.path.join(self.app_dir, 'secret.txt'), 'w', encoding='utf-8') as fh: fh.write('SECRET') @@ -54,6 +63,23 @@ def test_get_connect_statements_dir_returns_per_app_path(self): self.assertIsNotNone(path) self.assertEqual(path.base_name(), 'connect_statements') + def test_get_per_app_table_path_returns_connect_directory_path(self): + handler = ProfCore(self.prof_output) + path = handler.get_per_app_table_path('connectStatements', self.sample_app_id) + self.assertIsNotNone(path) + self.assertEqual(path.base_name(), 'connect_statements') + self.assertTrue(str(path).endswith(f'/{self.sample_app_id}/connect_statements')) + + def test_get_table_path_resolves_nested_core_artifacts_from_wrapper(self): + handler = ProfWrapper(self.prof_wrapper_output) + status_path = handler.get_table_path('coreCSVStatus') + stmt_dir = handler.get_per_app_table_path('connectStatements', self.sample_app_id) + self.assertIsNotNone(status_path) + self.assertIsNotNone(stmt_dir) + self.assertTrue(str(status_path).endswith('/rapids_4_spark_profile/profiling_status.csv')) + self.assertTrue(str(stmt_dir).endswith( + f'/rapids_4_spark_profile/{self.sample_app_id}/connect_statements')) + def test_list_connect_statement_ops_returns_sorted_operation_ids(self): handler = ProfCore(self.prof_output) ops = handler.list_connect_statement_ops(self.sample_app_id) From 04de3cc006efa1230036ca78d23745ab784d5628 Mon Sep 17 
00:00:00 2001 From: Sayed Bilal Bari Date: Fri, 1 May 2026 16:11:54 -0700 Subject: [PATCH 19/19] fix: add Spark Connect runtime dependencies Signed-off-by: Sayed Bilal Bari --- .../resources/databricks_aws-configs.json | 33 +++++++++++++++++++ .../resources/databricks_azure-configs.json | 33 +++++++++++++++++++ .../resources/dataproc-configs.json | 33 +++++++++++++++++++ .../resources/dataproc_gke-configs.json | 22 +++++++++++++ .../resources/emr-configs.json | 33 +++++++++++++++++++ .../resources/onprem-configs.json | 33 +++++++++++++++++++ 6 files changed, 187 insertions(+) diff --git a/user_tools/src/spark_rapids_pytools/resources/databricks_aws-configs.json b/user_tools/src/spark_rapids_pytools/resources/databricks_aws-configs.json index a8573ccaa..e45ff012f 100644 --- a/user_tools/src/spark_rapids_pytools/resources/databricks_aws-configs.json +++ b/user_tools/src/spark_rapids_pytools/resources/databricks_aws-configs.json @@ -19,6 +19,17 @@ "relativePath": "jars/*" } }, + { + "name": "Apache Spark Connect", + "uri": "https://repo1.maven.org/maven2/org/apache/spark/spark-connect_2.12/3.5.7/spark-connect_2.12-3.5.7.jar", + "verification": { + "fileHash": { + "algorithm": "sha1", + "value": "11469f1eeb53c250b9bada24fff2f76b25c407eb" + }, + "size": 14147901 + } + }, { "name": "Hadoop AWS", "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar", @@ -58,6 +69,17 @@ "relativePath": "jars/*" } }, + { + "name": "Apache Spark Connect", + "uri": "https://repo1.maven.org/maven2/org/apache/spark/spark-connect_2.12/3.5.6/spark-connect_2.12-3.5.6.jar", + "verification": { + "fileHash": { + "algorithm": "sha1", + "value": "91c16f5383ba28f96f01b92a8553433bc1df0f67" + }, + "size": 14147901 + } + }, { "name": "Hadoop AWS", "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar", @@ -97,6 +119,17 @@ "relativePath": "jars/*" } }, + { + "name": "Apache Spark Connect", + "uri": "https://repo1.maven.org/maven2/org/apache/spark/spark-connect_2.12/3.5.0/spark-connect_2.12-3.5.0.jar", + "verification": { + "fileHash": { + "algorithm": "sha1", + "value": "6a514b94478fbb86217162211991a17612d32a15" + }, + "size": 17177871 + } + }, { "name": "Hadoop AWS", "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar", diff --git a/user_tools/src/spark_rapids_pytools/resources/databricks_azure-configs.json b/user_tools/src/spark_rapids_pytools/resources/databricks_azure-configs.json index de51a509b..a9a058e63 100644 --- a/user_tools/src/spark_rapids_pytools/resources/databricks_azure-configs.json +++ b/user_tools/src/spark_rapids_pytools/resources/databricks_azure-configs.json @@ -19,6 +19,17 @@ "relativePath": "jars/*" } }, + { + "name": "Apache Spark Connect", + "uri": "https://repo1.maven.org/maven2/org/apache/spark/spark-connect_2.12/3.5.7/spark-connect_2.12-3.5.7.jar", + "verification": { + "fileHash": { + "algorithm": "sha1", + "value": "11469f1eeb53c250b9bada24fff2f76b25c407eb" + }, + "size": 14147901 + } + }, { "name": "Hadoop Azure", "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar", @@ -47,6 +58,17 @@ "relativePath": "jars/*" } }, + { + "name": "Apache Spark Connect", + "uri": "https://repo1.maven.org/maven2/org/apache/spark/spark-connect_2.12/3.5.6/spark-connect_2.12-3.5.6.jar", + "verification": { + "fileHash": { + "algorithm": "sha1", + "value": "91c16f5383ba28f96f01b92a8553433bc1df0f67" + }, + "size": 14147901 + } + }, { "name": "Hadoop 
Azure", "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar", @@ -75,6 +97,17 @@ "relativePath": "jars/*" } }, + { + "name": "Apache Spark Connect", + "uri": "https://repo1.maven.org/maven2/org/apache/spark/spark-connect_2.12/3.5.0/spark-connect_2.12-3.5.0.jar", + "verification": { + "fileHash": { + "algorithm": "sha1", + "value": "6a514b94478fbb86217162211991a17612d32a15" + }, + "size": 17177871 + } + }, { "name": "Hadoop Azure", "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar", diff --git a/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json b/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json index 3a4e39689..a9ffe2829 100644 --- a/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json +++ b/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json @@ -19,6 +19,17 @@ "relativePath": "jars/*" } }, + { + "name": "Apache Spark Connect", + "uri": "https://repo1.maven.org/maven2/org/apache/spark/spark-connect_2.12/3.5.7/spark-connect_2.12-3.5.7.jar", + "verification": { + "fileHash": { + "algorithm": "sha1", + "value": "11469f1eeb53c250b9bada24fff2f76b25c407eb" + }, + "size": 14147901 + } + }, { "name": "GCS Connector Hadoop3", "uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.14/gcs-connector-hadoop3-2.2.14-shaded.jar", @@ -47,6 +58,17 @@ "relativePath": "jars/*" } }, + { + "name": "Apache Spark Connect", + "uri": "https://repo1.maven.org/maven2/org/apache/spark/spark-connect_2.12/3.5.6/spark-connect_2.12-3.5.6.jar", + "verification": { + "fileHash": { + "algorithm": "sha1", + "value": "91c16f5383ba28f96f01b92a8553433bc1df0f67" + }, + "size": 14147901 + } + }, { "name": "GCS Connector Hadoop3", "uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.14/gcs-connector-hadoop3-2.2.14-shaded.jar", @@ -75,6 +97,17 @@ "relativePath": "jars/*" } }, + { + "name": "Apache Spark Connect", + "uri": "https://repo1.maven.org/maven2/org/apache/spark/spark-connect_2.12/3.5.0/spark-connect_2.12-3.5.0.jar", + "verification": { + "fileHash": { + "algorithm": "sha1", + "value": "6a514b94478fbb86217162211991a17612d32a15" + }, + "size": 17177871 + } + }, { "name": "GCS Connector Hadoop3", "uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.19/gcs-connector-hadoop3-2.2.19-shaded.jar", diff --git a/user_tools/src/spark_rapids_pytools/resources/dataproc_gke-configs.json b/user_tools/src/spark_rapids_pytools/resources/dataproc_gke-configs.json index ee3779027..1f396d119 100644 --- a/user_tools/src/spark_rapids_pytools/resources/dataproc_gke-configs.json +++ b/user_tools/src/spark_rapids_pytools/resources/dataproc_gke-configs.json @@ -19,6 +19,17 @@ "relativePath": "jars/*" } }, + { + "name": "Apache Spark Connect", + "uri": "https://repo1.maven.org/maven2/org/apache/spark/spark-connect_2.12/3.5.7/spark-connect_2.12-3.5.7.jar", + "verification": { + "fileHash": { + "algorithm": "sha1", + "value": "11469f1eeb53c250b9bada24fff2f76b25c407eb" + }, + "size": 14147901 + } + }, { "name": "GCS Connector Hadoop3", "uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.14/gcs-connector-hadoop3-2.2.14-shaded.jar", @@ -47,6 +58,17 @@ "relativePath": "jars/*" } }, + { + "name": "Apache Spark Connect", + "uri": "https://repo1.maven.org/maven2/org/apache/spark/spark-connect_2.12/3.5.0/spark-connect_2.12-3.5.0.jar", + 
"verification": { + "fileHash": { + "algorithm": "sha1", + "value": "6a514b94478fbb86217162211991a17612d32a15" + }, + "size": 17177871 + } + }, { "name": "GCS Connector Hadoop3", "uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.19/gcs-connector-hadoop3-2.2.19-shaded.jar", diff --git a/user_tools/src/spark_rapids_pytools/resources/emr-configs.json b/user_tools/src/spark_rapids_pytools/resources/emr-configs.json index cf651c4b3..7a69e98a3 100644 --- a/user_tools/src/spark_rapids_pytools/resources/emr-configs.json +++ b/user_tools/src/spark_rapids_pytools/resources/emr-configs.json @@ -19,6 +19,17 @@ "relativePath": "jars/*" } }, + { + "name": "Apache Spark Connect", + "uri": "https://repo1.maven.org/maven2/org/apache/spark/spark-connect_2.12/3.5.7/spark-connect_2.12-3.5.7.jar", + "verification": { + "fileHash": { + "algorithm": "sha1", + "value": "11469f1eeb53c250b9bada24fff2f76b25c407eb" + }, + "size": 14147901 + } + }, { "name": "Hadoop AWS", "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar", @@ -58,6 +69,17 @@ "relativePath": "jars/*" } }, + { + "name": "Apache Spark Connect", + "uri": "https://repo1.maven.org/maven2/org/apache/spark/spark-connect_2.12/3.5.6/spark-connect_2.12-3.5.6.jar", + "verification": { + "fileHash": { + "algorithm": "sha1", + "value": "91c16f5383ba28f96f01b92a8553433bc1df0f67" + }, + "size": 14147901 + } + }, { "name": "Hadoop AWS", "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar", @@ -97,6 +119,17 @@ "relativePath": "jars/*" } }, + { + "name": "Apache Spark Connect", + "uri": "https://repo1.maven.org/maven2/org/apache/spark/spark-connect_2.12/3.5.0/spark-connect_2.12-3.5.0.jar", + "verification": { + "fileHash": { + "algorithm": "sha1", + "value": "6a514b94478fbb86217162211991a17612d32a15" + }, + "size": 17177871 + } + }, { "name": "Hadoop AWS", "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar", diff --git a/user_tools/src/spark_rapids_pytools/resources/onprem-configs.json b/user_tools/src/spark_rapids_pytools/resources/onprem-configs.json index eef37fabc..0894aad29 100644 --- a/user_tools/src/spark_rapids_pytools/resources/onprem-configs.json +++ b/user_tools/src/spark_rapids_pytools/resources/onprem-configs.json @@ -18,6 +18,17 @@ "depType": "archive", "relativePath": "jars/*" } + }, + { + "name": "Apache Spark Connect", + "uri": "https://repo1.maven.org/maven2/org/apache/spark/spark-connect_2.12/3.5.7/spark-connect_2.12-3.5.7.jar", + "verification": { + "fileHash": { + "algorithm": "sha1", + "value": "11469f1eeb53c250b9bada24fff2f76b25c407eb" + }, + "size": 14147901 + } } ], "356": [ @@ -35,6 +46,17 @@ "depType": "archive", "relativePath": "jars/*" } + }, + { + "name": "Apache Spark Connect", + "uri": "https://repo1.maven.org/maven2/org/apache/spark/spark-connect_2.12/3.5.6/spark-connect_2.12-3.5.6.jar", + "verification": { + "fileHash": { + "algorithm": "sha1", + "value": "91c16f5383ba28f96f01b92a8553433bc1df0f67" + }, + "size": 14147901 + } } ], "350": [ @@ -52,6 +74,17 @@ "depType": "archive", "relativePath": "jars/*" } + }, + { + "name": "Apache Spark Connect", + "uri": "https://repo1.maven.org/maven2/org/apache/spark/spark-connect_2.12/3.5.0/spark-connect_2.12-3.5.0.jar", + "verification": { + "fileHash": { + "algorithm": "sha1", + "value": "6a514b94478fbb86217162211991a17612d32a15" + }, + "size": 17177871 + } } ], "342": [