NVIDIA · parthosa · Apr 27, 2026 · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026
diff --git a/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml b/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml
@@ -1,4 +1,4 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.
+# Copyright (c) 2025-2026, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 # Note that all the output files in this list should be considered internal-use only.
 
 # <application-id>/
+#    ├── app_level_recommendation_signals.csv
 #    ├── application_information.csv
 #    ├── application_log_path_mapping.csv
 #    ├── cluster_information.json
@@ -372,6 +373,10 @@ reportDefinitions:
             dataType: Long
             description: >-
               TBD
+          - name: input_bytesRead_max
+            dataType: Long
+            description: >-
+              Maximum per-task input bytes read within the aggregation unit
           - name: input_recordsRead_sum
             dataType: Long
             description: >-
@@ -569,6 +574,10 @@ reportDefinitions:
             dataType: Long
             description: >-
               TBD
+          - name: input_bytesRead_max
+            dataType: Long
+            description: >-
+              Maximum per-task input bytes read within the aggregation unit
           - name: input_recordsRead_sum
             dataType: Long
             description: >-
@@ -782,6 +791,10 @@ reportDefinitions:
             dataType: Long
             description: >-
               TBD
+          - name: input_bytesRead_max
+            dataType: Long
+            description: >-
+              Maximum per-task input bytes read within the aggregation unit
           - name: input_recordsRead_sum
             dataType: Long
             description: >-
@@ -1426,3 +1439,27 @@ reportDefinitions:
         fileName: profile.log
         fileFormat: TXT
         scope: per-app
+      # AppLevelRecommendationSignalsProfileResult
+      - label: coreRawAppLevelRecommendationSignalsCSV
+        description: >-
+          Per-app derived signals that feed recommendation engines (AutoTuner, qualx,
+          etc.). Single row per app; wide layout, one column per signal.
+          GPU-only signals are 0 for qualification (CPU event logs).
+        fileName: app_level_recommendation_signals.csv
+        scope: per-app
+        columns:
+          - name: appId
+            dataType: String
+            description: >-
+              Application ID.
+          - name: numScanStagesWithGpuOom
+            dataType: Int
+            description: >-
+              Number of scan stages where failed tasks had GPU OOM errors
+              (GpuRetryOOM / GpuSplitAndRetryOOM / jni.GpuOOM). Profiling only.
+          - name: numGpuShuffleStagesWithContainerOom
+            dataType: Int
+            description: >-
+              Number of GPU shuffle stages (GpuShuffleExchangeExec) where YARN killed
+              the container due to container-level OOM (ExecutorLostFailure + exit 137).
+              Profiling only.
diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AggRawMetricsResult.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AggRawMetricsResult.scala
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
 
 package com.nvidia.spark.rapids.tool.analysis
 
-import com.nvidia.spark.rapids.tool.profiling.{IOAnalysisProfileResult, JobAggTaskMetricsProfileResult, ShuffleSkewProfileResult, SQLDurationExecutorTimeProfileResult, SQLMaxTaskInputSizes, SQLTaskAggMetricsProfileResult, StageAggTaskMetricsProfileResult, StageDiagnosticResult}
+import com.nvidia.spark.rapids.tool.profiling.{IOAnalysisProfileResult, JobAggTaskMetricsProfileResult, ShuffleSkewProfileResult, SQLDurationExecutorTimeProfileResult, SQLTaskAggMetricsProfileResult, StageAggTaskMetricsProfileResult, StageDiagnosticResult}
 
 /**
  * The result of the aggregation of the raw metrics. It contains the aggregated metrics for an
@@ -31,7 +31,6 @@ import com.nvidia.spark.rapids.tool.profiling.{IOAnalysisProfileResult, JobAggTa
  * @param sqlAggs           the aggregated Spark metrics for SQLs
  * @param ioAggs            lists the SQLs along their IO metrics
  * @param sqlDurAggs        the aggregated duration and CPU time for SQLs
- * @param maxTaskInputSizes a sequence of SQLMaxTaskInputSizes that contains the maximum input size
  * @param stageDiagnostics  the stage level Spark metrics for diagnostic purposes
  */
 case class AggRawMetricsResult(
@@ -41,5 +40,4 @@ case class AggRawMetricsResult(
     sqlAggs: Seq[SQLTaskAggMetricsProfileResult],
     ioAggs: Seq[IOAnalysisProfileResult],
     sqlDurAggs: Seq[SQLDurationExecutorTimeProfileResult],
-    maxTaskInputSizes: Seq[SQLMaxTaskInputSizes],
     stageDiagnostics: Seq[StageDiagnosticResult])
diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AppSparkMetricsAggTrait.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AppSparkMetricsAggTrait.scala
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -45,7 +45,6 @@ trait AppSparkMetricsAggTrait extends AppIndexMapperTrait {
       sqlMetricsAgg,
       analysisObj.aggregateIOMetricsBySql(sqlMetricsAgg),
       analysisObj.aggregateDurationAndCPUTimeBySql(index),
-      Seq(analysisObj.maxTaskInputSizeBytesPerSQL(index)),
       analysisObj.aggregateDiagnosticMetricsByStage(index, sqlAnalyzer))
   }
 
@@ -67,7 +66,6 @@ trait AppSparkMetricsAggTrait extends AppIndexMapperTrait {
         agg1.sqlAggs ++ agg2.sqlAggs,
         agg1.ioAggs ++ agg2.ioAggs,
         agg1.sqlDurAggs ++ agg2.sqlDurAggs,
-        agg1.maxTaskInputSizes ++ agg2.maxTaskInputSizes,
         agg1.stageDiagnostics ++ agg2.stageDiagnostics)
     }
   }

diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AppSparkMetricsAnalyzer.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AppSparkMetricsAnalyzer.scala
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -106,6 +106,7 @@ class AppSparkMetricsAnalyzer(app: AppBase) extends AppAnalysisBase(app) {
             perJobRec.executorDeserializeTimeSum,
             perJobRec.executorRunTimeSum,
             perJobRec.inputBytesReadSum,
+            perJobRec.inputBytesReadMax,
             perJobRec.inputRecordsReadSum,
             perJobRec.jvmGCTimeSum,
             perJobRec.memoryBytesSpilledSum,
@@ -203,6 +204,7 @@ class AppSparkMetricsAnalyzer(app: AppBase) extends AppAnalysisBase(app) {
             preSqlRec.executorDeserializeTimeSum,
             preSqlRec.executorRunTimeSum,
             preSqlRec.inputBytesReadSum,
+            preSqlRec.inputBytesReadMax,
             preSqlRec.inputBytesReadAvg,
             preSqlRec.inputRecordsReadSum,
             preSqlRec.jvmGCTimeSum,
@@ -252,32 +254,6 @@ class AppSparkMetricsAnalyzer(app: AppBase) extends AppAnalysisBase(app) {
     }.toSeq
   }
 
-  /**
-   * Find the maximum task input size
-   * @param index App index  (used by the profiler tool)
-   * @return a single SQLMaxTaskInputSizes record that contains the maximum value. If none, it will
-   *         be 0L
-   */
-  def maxTaskInputSizeBytesPerSQL(index: Int): SQLMaxTaskInputSizes = {
-    // TODO: We should keep maxInputSize as a field in the stageAggregate to avoid doing an
-    //       extra path on the tasks
-    val maxOfSqls = app.sqlIdToStages.map { case (_, stageIds) =>
-      // TODO: Should we only consider successful tasks?
-      val tasksInSQL = app.taskManager.getTasksByStageIds(stageIds)
-      if (tasksInSQL.isEmpty) {
-        0L
-      } else {
-        tasksInSQL.map(_.input_bytesRead).max
-      }
-    }
-    val maxVal = if (maxOfSqls.nonEmpty) {
-      maxOfSqls.max
-    } else {
-      0L
-    }
-    SQLMaxTaskInputSizes(app.appId, maxVal)
-  }
-
   /**
    * Aggregates the duration and CPU time (milliseconds) by SQL
    * @param index App index  (used by the profiler tool)
@@ -398,6 +374,7 @@ class AppSparkMetricsAnalyzer(app: AppBase) extends AppAnalysisBase(app) {
         perStageRec.executorDeserializeTimeSum,
         perStageRec.executorRunTimeSum,
         perStageRec.inputBytesReadSum,
+        perStageRec.inputBytesReadMax,
         perStageRec.inputRecordsReadSum,
         perStageRec.jvmGCTimeSum,
         perStageRec.memoryBytesSpilledSum,

diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/util/TaskMetricsAccumRec.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/util/TaskMetricsAccumRec.scala
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -38,6 +38,7 @@ class TaskMetricsAccumRec {
   var executorDeserializeTimeSum: Long = 0
   var executorRunTimeSum: Long = 0
   var inputBytesReadSum: Long = 0
+  var inputBytesReadMax: Long = Long.MinValue
   var inputRecordsReadSum: Long = 0
   var jvmGCTimeSum: Long = 0
   var memoryBytesSpilledSum: Long = 0
@@ -70,6 +71,7 @@ class TaskMetricsAccumRec {
   def resetFields(): Unit = {
     durationMax = 0
     durationMin = 0
+    inputBytesReadMax = 0
     peakExecutionMemoryMax = 0
     resultSizeMax = 0
   }
@@ -102,6 +104,7 @@ class TaskMetricsAccumRec {
     swWriteTimeSum += rec.sw_writeTime
     // Max fields
     durationMax = math.max(durationMax, rec.duration)
+    inputBytesReadMax = math.max(inputBytesReadMax, rec.input_bytesRead)
     peakExecutionMemoryMax = math.max(peakExecutionMemoryMax, rec.peakExecutionMemory)
     resultSizeMax = math.max(resultSizeMax, rec.resultSize)
     // Min Fields
@@ -136,6 +139,7 @@ class TaskMetricsAccumRec {
     swWriteTimeSum += rec.swWriteTimeSum
     // Max
     durationMax = math.max(durationMax, rec.durationMax)
+    inputBytesReadMax = math.max(inputBytesReadMax, rec.inputBytesReadMax)
     peakExecutionMemoryMax = math.max(peakExecutionMemoryMax, rec.peakExecutionMemoryMax)
     resultSizeMax = math.max(resultSizeMax, rec.resultSizeMax)
     // Min