From a3d9d75434b9f3d7d32c8f2bdcd3f5787075531a Mon Sep 17 00:00:00 2001 From: Partho Sarthi Date: Wed, 8 Apr 2026 10:22:15 -0700 Subject: [PATCH 01/16] Enhance profiling metrics with new OOM detection Signed-off-by: Partho Sarthi --- .../configs/reports/coreRawMetricsReport.yaml | 18 ++- .../profiling/ApplicationSummaryInfo.scala | 140 +++++++++++------- .../tool/profiling/CollectInformation.scala | 17 ++- .../rapids/tool/profiling/ProfileArgs.scala | 2 +- .../profiling/ProfileClassWarehouse.scala | 51 ++++++- .../rapids/tool/profiling/Profiler.scala | 32 ++-- .../spark/rapids/tool/tuning/AutoTuner.scala | 4 +- .../rapids/tool/views/OutHeaderRegistry.scala | 4 +- .../tool/views/QualRawReportGenerator.scala | 21 ++- .../tool/profiling/OomDetectionSuite.scala | 62 ++++++++ .../tool/tuning/BaseAutoTunerSuite.scala | 12 +- .../tool/tuning/ProfilingAutoTunerSuite.scala | 7 +- .../tuning/ProfilingAutoTunerSuiteV2.scala | 4 +- 13 files changed, 283 insertions(+), 91 deletions(-) create mode 100644 core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/OomDetectionSuite.scala diff --git a/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml b/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml index d9e4b1b6a..a89482665 100644 --- a/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml +++ b/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. +# Copyright (c) 2025-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -106,6 +106,22 @@ reportDefinitions: dataType: Long description: >- TBD + - name: maxTaskInputBytesRead + dataType: Double + description: >- + Maximum per-task input bytes read across all SQLs + - name: maxColumnarExchangeDataSizeBytes + dataType: Long + description: >- + Maximum data size from ColumnarExchange metrics (profiling only) + - name: scanStagesWithGpuOom + dataType: String + description: >- + Comma-separated stage IDs of scan stages with GPU OOM (profiling only) + - name: shuffleStagesWithOom + dataType: String + description: >- + Comma-separated stage IDs of shuffle stages with OOM (profiling only) # AppLogPathProfileResults - label: coreRawApplicationLogPathMappingCSV description: >- diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala index 783e2fc50..4adefb272 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2025, NVIDIA CORPORATION. + * Copyright (c) 2021-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -90,8 +90,8 @@ trait AppInfoReadMetrics { } trait AppInfoGpuOomCheck { - def hasScanStagesWithGpuOom: Boolean = false - def hasShuffleStagesWithOom: Boolean = false + def scanStagesWithGpuOom: Set[Long] = Set.empty + def shuffleStagesWithOom: Set[Long] = Set.empty } trait AppInfoColumnarExchangeMetrics { @@ -230,65 +230,112 @@ class SingleAppSummaryInfoProvider( } /** - * Check if there are any scan stages with failed tasks due to GPU OOM errors + * Returns stage IDs of scan stages with failed tasks due to GPU OOM errors * (GpuRetryOOM and GpuSplitAndRetryOOM). */ - override def hasScanStagesWithGpuOom: Boolean = { - // If the plugin is not enabled (i.e. non-GPU app), return false - if (!app.appInfo.exists(_.pluginEnabled)) { - return false + override def scanStagesWithGpuOom: Set[Long] = { + SingleAppSummaryInfoProvider.computeScanStagesWithGpuOom( + app.appInfo.exists(_.pluginEnabled), + app.failedTasks, app.stageMetrics, appInfo) + } + + /** + * Returns stage IDs of failed shuffle stages with OOM errors in the task's end reason. + * Note: This check is enabled only if the plugin is enabled (i.e. GPU app) and running on YARN. + */ + override def shuffleStagesWithOom: Set[Long] = { + SingleAppSummaryInfoProvider.computeShuffleStagesWithOom( + app.appInfo.exists(_.pluginEnabled), + getSparkProperty("spark.master"), + app.failedStages, app.failedTasks) + } + + /** + * Get the maximum data size from ColumnarExchange metrics. + * This method searches through SQLPlan metrics to find all ColumnarExchange nodes + * with "data size" metrics and returns the maximum total value. + * + * @return Option[Long] containing the maximum data size in bytes, or None if no + * ColumnarExchange "data size" metrics are found + */ + override def getMaxColumnarExchangeDataSizeBytes: Option[Long] = { + SingleAppSummaryInfoProvider.computeMaxColumnarExchangeDataSizeBytes(app.sqlMetrics) + } + + override def getClassPathEntries: Map[String, String] = { + appInfo.classpathEntries + } +} + +object SingleAppSummaryInfoProvider { + /** + * Computes the set of scan stage IDs that had GPU OOM failures. + * This static method enables computing the result before ApplicationSummaryInfo is assembled. + */ + def computeScanStagesWithGpuOom( + pluginEnabled: Boolean, + failedTasks: Seq[FailedTaskProfileResults], + stageMetrics: Seq[AccumProfileResults], + appInfo: ApplicationInfo): Set[Long] = { + if (!pluginEnabled) { + return Set.empty } // Find stages with failed tasks due to GPU OOM errors - val failedStagesWithGpuOom = app.failedTasks.collect { - case task if SparkRapidsOomExceptions.gpuExceptionClassNames - .exists(task.endReason.contains) => task.stageId + val failedStagesWithGpuOom = failedTasks.collect { + case task if SparkRapidsOomExceptions.isGpuOom(task.endReason) => task.stageId.toLong } if (failedStagesWithGpuOom.isEmpty) { - return false + return Set.empty } // Calculate stageIds of scan stages (i.e. stages with 'scan time' metrics) - val scanStages = app.stageMetrics.collect { + val scanStages = stageMetrics.collect { case metric if IoMetrics.getIoMetricsHelper(appInfo).isScanTimeMetric(metric) => - metric.stageId + metric.stageId.toLong }.toSet if (scanStages.isEmpty) { - return false + return Set.empty } - // Check if any failed GPU OOM stage is also a scan stage - failedStagesWithGpuOom.exists(scanStages.contains) + // Return scan stages that also had GPU OOM failures + failedStagesWithGpuOom.filter(scanStages.contains).toSet } /** - * This method checks for failed shuffle stages with OOM errors in the task's end reason. - * Note: This check is enabled only if the plugin is enabled (i.e. GPU app) and running on YARN. - * """ + * Computes the set of shuffle stage IDs that had container OOM failures (YARN only). + * Detects ExecutorLostFailure with exit code 137 (SIGKILL from container memory enforcement). + * See: https://github.com/NVIDIA/spark-rapids-tools/issues/1566 + * This static method enables computing the result before ApplicationSummaryInfo is assembled. */ - override def hasShuffleStagesWithOom: Boolean = { - // If the plugin is not enabled (i.e. non-GPU app) or not running on YARN, return false - val sparkMaster = SparkMaster(getSparkProperty("spark.master")) - if (!app.appInfo.exists(_.pluginEnabled) || !sparkMaster.contains(Yarn)) { - return false + def computeShuffleStagesWithOom( + pluginEnabled: Boolean, + sparkMasterStr: Option[String], + failedStages: Seq[FailedStagesProfileResults], + failedTasks: Seq[FailedTaskProfileResults]): Set[Long] = { + if (!pluginEnabled || !SparkMaster(sparkMasterStr).contains(Yarn)) { + return Set.empty } // Get stage IDs of failed shuffle stages // Sample stage name: "submitShuffleJob$ at GpuShuffleExchangeExec.scala:53" - val failedStagesWithShuffle = app.failedStages.collect { + val failedStagesWithShuffle = failedStages.collect { case stage if stage.name.contains(SparkRapidsOomExceptions.gpuShuffleClassName) => - stage.stageId + stage.stageId.toLong }.toSet if (failedStagesWithShuffle.isEmpty) { - return false + return Set.empty } // scalastyle:off line.size.limit // Check if the failed task's end reason contains OOM errors - // Sample end reason for failed tasks on YARN: "ExecutorLostFailure (executor 2 exited caused by one of the running tasks) Reason: Container from a bad node: container_e02_17xxx on host: test-cluster-w-0. Exit status: 137" + // Sample end reason for failed tasks on YARN: + // "ExecutorLostFailure (executor 2 exited caused by one of the running tasks) + // Reason: Container from a bad node: container_e02_17xxx on host: test-cluster-w-0. + // Exit status: 137" // Reference: https://github.com/apache/spark/blob/master/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala // scalastyle:on line.size.limit // Regular expressions to identify OOM failures in task's end reason @@ -299,30 +346,23 @@ class SingleAppSummaryInfoProvider( s"Exit status: ${UnixExitCode.FORCE_KILLED}" ).map(_.r) - // Check if any failed task in shuffle stages have OOM failures - app.failedTasks.exists { task => - if (failedStagesWithShuffle.contains(task.stageId)) { - // Check if the task failed due to OOM - oomFailurePatterns.forall(p => p.findFirstIn(task.endReason).isDefined) - } else { - // Ignore if the failed task is not in a shuffle stage - false - } - } + // Return shuffle stages that had tasks with OOM failures + failedTasks.collect { + case task if failedStagesWithShuffle.contains(task.stageId.toLong) && + oomFailurePatterns.forall(p => p.findFirstIn(task.endReason).isDefined) => + task.stageId.toLong + }.toSet } /** - * Get the maximum data size from ColumnarExchange metrics. - * This method searches through SQLPlan metrics to find all ColumnarExchange nodes - * with "data size" metrics and returns the maximum total value. - * - * @return Option[Long] containing the maximum data size in bytes, or None if no - * ColumnarExchange "data size" metrics are found + * Computes the maximum data size from ColumnarExchange metrics. + * This static method enables computing the result before ApplicationSummaryInfo is assembled. */ - override def getMaxColumnarExchangeDataSizeBytes: Option[Long] = { - val columnarExchangeDataSizesBytes = app.sqlMetrics.collect { + def computeMaxColumnarExchangeDataSizeBytes( + sqlMetrics: Seq[SQLAccumProfileResults]): Option[Long] = { + val columnarExchangeDataSizesBytes = sqlMetrics.collect { case metric if metric.nodeName.contains("ColumnarExchange") && - metric.name == "data size" => + metric.name == "data size" => metric.total } if (columnarExchangeDataSizesBytes.nonEmpty) { @@ -331,8 +371,4 @@ class SingleAppSummaryInfoProvider( None } } - - override def getClassPathEntries: Map[String, String] = { - appInfo.classpathEntries - } } diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/CollectInformation.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/CollectInformation.scala index 1addbbb48..8b165733f 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/CollectInformation.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/CollectInformation.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2025, NVIDIA CORPORATION. + * Copyright (c) 2021-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,6 +37,21 @@ class CollectInformation(apps: Seq[ApplicationInfo]) extends Logging { ProfInformationView.getRawView(apps) } + // Extends AppInfoProfileResults with fields that require aggregate metrics or + // cross-referencing (e.g., failed tasks with stage metrics). Fields like + // totalCoreSeconds are already populated at the view layer in InformationView. + def getExtendedAppInfo( + maxTaskInputBytesRead: Double, + maxColumnarExchangeDataSizeBytes: Option[Long], + scanStagesWithGpuOom: Set[Long], + shuffleStagesWithOom: Set[Long]): Seq[AppInfoProfileResults] = { + getAppInfo.map(_.copy( + maxTaskInputBytesRead = maxTaskInputBytesRead, + maxColumnarExchangeDataSizeBytes = maxColumnarExchangeDataSizeBytes, + scanStagesWithGpuOom = scanStagesWithGpuOom, + shuffleStagesWithOom = shuffleStagesWithOom)) + } + def getAppLogPath: Seq[AppLogPathProfileResults] = { ProfLogPathView.getRawView(apps) } diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala index 81a253ebf..a660c4ac6 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala @@ -89,7 +89,7 @@ Usage: java -cp rapids-4-spark-tools_2.12-.jar:$SPARK_HOME/jars/* descr = "Number of thread to use for parallel processing. The default is the " + "number of cores on host divided by 4.") val csv: ScallopOption[Boolean] = - opt[Boolean](required = false, + opt[Boolean](required = false, default = Some(true), descr = "Output each table to a CSV file as well creating the summary text file.") val timeout: ScallopOption[Long] = opt[Long](required = false, diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala index 7bde04197..8a0422d0e 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala @@ -463,7 +463,11 @@ case class AppInfoProfileResults( sparkRuntime: SparkRuntime.SparkRuntime, sparkVersion: String, pluginEnabled: Boolean, - totalCoreSeconds: Long) extends ProfileResult { + totalCoreSeconds: Long, + maxTaskInputBytesRead: Double = 0.0, + maxColumnarExchangeDataSizeBytes: Option[Long] = None, + scanStagesWithGpuOom: Set[Long] = Set.empty, + shuffleStagesWithOom: Set[Long] = Set.empty) extends ProfileResult { override def outputHeaders: Array[String] = { OutHeaderRegistry.outputHeaders("AppInfoProfileResults") } @@ -496,11 +500,23 @@ case class AppInfoProfileResults( } } + private def maxColumnarExchangeToStr: String = { + maxColumnarExchangeDataSizeBytes.map(_.toString).getOrElse("") + } + + private def stageIdsToStr(stageIds: Set[Long]): String = { + if (stageIds.isEmpty) "" else stageIds.toSeq.sorted.mkString(",") + } + override def convertToSeq(): Array[String] = { Array(appName, appIdToStr, attemptIdToStr, sparkUser, startTime.toString, endTimeToStr, durToStr, durationStr, sparkRuntime.toString, sparkVersion, pluginEnabled.toString, - totalCoreSeconds.toString) + totalCoreSeconds.toString, + f"$maxTaskInputBytesRead%.0f", + maxColumnarExchangeToStr, + stageIdsToStr(scanStagesWithGpuOom), + stageIdsToStr(shuffleStagesWithOom)) } override def convertToCSVSeq(): Array[String] = { @@ -512,7 +528,11 @@ case class AppInfoProfileResults( StringUtils.reformatCSVString(sparkRuntime.toString), StringUtils.reformatCSVString(sparkVersion), pluginEnabled.toString, - totalCoreSeconds.toString) + totalCoreSeconds.toString, + f"$maxTaskInputBytesRead%.0f", + maxColumnarExchangeToStr, + StringUtils.reformatCSVString(stageIdsToStr(scanStagesWithGpuOom)), + StringUtils.reformatCSVString(stageIdsToStr(shuffleStagesWithOom))) } } @@ -1499,15 +1519,32 @@ case class RecommendedCommentResult(comment: String) { override def toString: String = "- %s".format(comment) } +// scalastyle:off line.size.limit /** - * Helper object to store the list of SparkRapids OOM exceptions. + * Helper object to detect OOM exceptions from SparkRapids event logs. + * + * GPU OOM class names from spark-rapids-jni: + * - GpuOOM -> GpuRetryOOM, GpuSplitAndRetryOOM + * See: https://github.com/NVIDIA/spark-rapids-jni/blob/725cd64be2115cd072bf51d7d6c5281d6d08bf4f/src/main/cpp/src/SparkResourceAdaptorJni.cpp#L1313 + * See: https://github.com/NVIDIA/spark-rapids/blob/79922d62a1c5759963e969018322ad8e544629ff/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RmmRapidsRetryIterator.scala */ +// scalastyle:on line.size.limit object SparkRapidsOomExceptions { - val gpuExceptionClassNames: Set[String] = { - Set("GpuSplitAndRetryOOM", "GpuRetryOOM") - } + // Current JNI: GpuOOM -> GpuRetryOOM, GpuSplitAndRetryOOM + // Pre-24.02 JNI: jni.SplitAndRetryOOM, jni.RetryOOM (no Gpu prefix) + // Using "jni." prefix to avoid matching CpuSplitAndRetryOOM + val gpuExceptionClassNames: Set[String] = + Set("GpuSplitAndRetryOOM", "GpuRetryOOM", "GpuOOM", + "jni.SplitAndRetryOOM", "jni.RetryOOM") val gpuShuffleClassName: String = "GpuShuffleExchangeExec" + + /** + * Check if a failure reason indicates a GPU OOM error. + */ + def isGpuOom(failureReason: String): Boolean = { + gpuExceptionClassNames.exists(failureReason.contains) + } } /** diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala index 1534fbf2b..3b7bd878e 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala @@ -293,19 +293,33 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea } } val analysis = RawMetricProfilerView.getAggMetrics(analyzedApps) - val maxTaskInputInfo = if (useAutoTuner) { - analysis.maxTaskInputSizes - } else { - Seq.empty - } + val maxTaskInputInfo = analysis.maxTaskInputSizes val sqlIdAlign = if (outputAlignedSQLIds) { collect.getSQLCleanAndAligned } else { Seq.empty } val endTime = System.currentTimeMillis() - val appInfo = collect.getAppInfo val sqlMetrics = collect.getSQLPlanMetrics + val stageMetrics = collect.getStageLevelMetrics + val failedTasks = healthCheck.getFailedTasks + val failedStages = healthCheck.getFailedStages + + // Compute AutoTuner inputs to enrich application_information.csv + val singleApp = analyzedApps.head + val pluginEnabled = singleApp.gpuMode + val maxTaskInput = analysis.maxTaskInputSizes.headOption + .map(_.maxTaskInputBytesRead).getOrElse(0.0) + val maxColumnarExchange = + SingleAppSummaryInfoProvider.computeMaxColumnarExchangeDataSizeBytes(sqlMetrics) + val scanOomStages = SingleAppSummaryInfoProvider.computeScanStagesWithGpuOom( + pluginEnabled, failedTasks, stageMetrics, singleApp) + val shuffleOomStages = SingleAppSummaryInfoProvider.computeShuffleStagesWithOom( + pluginEnabled, singleApp.sparkProperties.get("spark.master"), + failedStages, failedTasks) + + val appInfo = collect.getExtendedAppInfo( + maxTaskInput, maxColumnarExchange, scanOomStages, shuffleOomStages) logDebug(s"Time to collect Profiling Info [${appInfo.head.appId}]: ${endTime - startTime}.") val appInfoSummary = ApplicationSummaryInfo( appInfo = appInfo, @@ -315,14 +329,14 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea rapidsProps = collect.getRapidsProperties, rapidsJar = collect.getRapidsJARInfo, sqlMetrics = sqlMetrics, - stageMetrics = collect.getStageLevelMetrics, + stageMetrics = stageMetrics, jobAggMetrics = analysis.jobAggs, stageAggMetrics = analysis.stageAggs, sqlTaskAggMetrics = analysis.sqlAggs, durAndCpuMet = analysis.sqlDurAggs, skewInfo = analysis.taskShuffleSkew, - failedTasks = healthCheck.getFailedTasks, - failedStages = healthCheck.getFailedStages, + failedTasks = failedTasks, + failedStages = failedStages, failedJobs = healthCheck.getFailedJobs, removedBMs = healthCheck.getRemovedBlockManager, removedExecutors = healthCheck.getRemovedExecutors, diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/tuning/AutoTuner.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/tuning/AutoTuner.scala index 90800289e..1eb8dd56e 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/tuning/AutoTuner.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/tuning/AutoTuner.scala @@ -2177,7 +2177,7 @@ class ProfilingAutoTuner( // First, calculate the recommendation based on input sizes val calculatedValueFromInputSize = super.calculateMaxPartitionBytesInMB(maxPartitionBytes) getPropertyValue("spark.sql.files.maxPartitionBytes") match { - case Some(currentValue) if appInfoProvider.hasScanStagesWithGpuOom => + case Some(currentValue) if appInfoProvider.scanStagesWithGpuOom.nonEmpty => // GPU OOM detected. We may want to reduce max partition size. val halvedValue = StringUtils.convertToMB(currentValue, Some(ByteUnit.BYTE)) / 2 // Choose the minimum between the calculated value and half of the current value. @@ -2198,7 +2198,7 @@ class ProfilingAutoTuner( */ override def recommendShufflePartitionsInternal(): Int = { val calculatedValue = super.recommendShufflePartitionsInternal() - if (appInfoProvider.hasShuffleStagesWithOom) { + if (appInfoProvider.shuffleStagesWithOom.nonEmpty) { // Shuffle Stages with Task OOM detected. We may want to increase shuffle partitions. val recShufflePartitions = shufflePartitionValue * configProvider.getEntry("SHUFFLE_PARTITION_MULTIPLIER").getDefault.toInt diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala index 795de4c6a..4a0bd37be 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala @@ -69,7 +69,9 @@ object OutHeaderRegistry { Array("sqlID", "nodeID", "nodeName", "nodeDescription", "reason"), "AppInfoProfileResults" -> Array("appName", "appId", "attemptId", "sparkUser", "startTime", "endTime", "duration", - "durationStr", "sparkRuntime", "sparkVersion", "pluginEnabled", "totalCoreSeconds"), + "durationStr", "sparkRuntime", "sparkVersion", "pluginEnabled", "totalCoreSeconds", + "maxTaskInputBytesRead", "maxColumnarExchangeDataSizeBytes", + "scanStagesWithGpuOom", "shuffleStagesWithOom"), "AppLogPathProfileResults" -> Array("appName", "appId", "eventLogPath"), "FailedTaskProfileResults" -> diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala index 604fbfa7b..a9e146ed6 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -71,9 +71,19 @@ object QualRawReportGenerator extends Logging { val pWriter = new ProfileOutputWriter(metricsDirectory, "profile", 10000000, outputCSV = true) try { + // Compute aggregate metrics early so maxTaskInputBytesRead is available for + // application_information.csv enrichment + val aggRawMetrics = QualSparkMetricsAggregator + .getAggRawMetrics(app, sqlAnalyzer = Some(sqlPlanAnalyzer)) + val maxTaskInput = aggRawMetrics.maxTaskInputSizes.headOption + .map(_.maxTaskInputBytesRead).getOrElse(0.0) + pWriter.writeText("### A. Information Collected ###") - pWriter.writeTable( - QualInformationView.getLabel, QualInformationView.getRawView(Seq(app))) + // Extend application info with maxTaskInputBytesRead from aggregate metrics. + // OOM and ColumnarExchange columns are empty for qualification (CPU event logs). + val extendedAppInfo = QualInformationView.getRawView(Seq(app)).map( + _.copy(maxTaskInputBytesRead = maxTaskInput)) + pWriter.writeTable(QualInformationView.getLabel, extendedAppInfo) pWriter.writeTable(QualLogPathView.getLabel, QualLogPathView.getRawView(Seq(app))) val sqlPlanMetricsResults = generateSQLProcessingView(pWriter, sqlPlanAnalyzer) pWriter.writeJsonL( @@ -96,9 +106,8 @@ object QualRawReportGenerator extends Logging { SystemQualPropertiesView.getRawView(Seq(app)), Some(SystemQualPropertiesView.getDescription)) pWriter.writeText("\n### B. Analysis ###\n") - constructLabelsMaps(QualSparkMetricsAggregator - .getAggRawMetrics( - app, sqlAnalyzer = Some(sqlPlanAnalyzer))).foreach { case (label, metrics) => + // Reuse already-computed aggRawMetrics instead of computing again + constructLabelsMaps(aggRawMetrics).foreach { case (label, metrics) => pWriter.writeCSVTable(label, metrics) } pWriter.writeText("\n### C. Health Check###\n") diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/OomDetectionSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/OomDetectionSuite.scala new file mode 100644 index 000000000..c5f01f22a --- /dev/null +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/OomDetectionSuite.scala @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.tool.profiling + +import org.scalatest.funsuite.AnyFunSuite + +class OomDetectionSuite extends AnyFunSuite { + + // (description, failureReason, expectedIsGpuOom) + private val gpuOomTestCases = Seq( + ("GpuSplitAndRetryOOM", + "com.nvidia.spark.rapids.jni.GpuSplitAndRetryOOM: " + + "GPU OutOfMemory: a batch of 1 cannot be split!", + true), + ("GpuRetryOOM", + "com.nvidia.spark.rapids.jni.GpuRetryOOM: GPU OutOfMemory", + true), + ("GpuOOM base class", + "com.nvidia.spark.rapids.jni.GpuOOM: GPU OutOfMemory", + true), + ("pre-24.02 jni.SplitAndRetryOOM (no Gpu prefix)", + "com.nvidia.spark.rapids.jni.SplitAndRetryOOM: " + + "GPU OutOfMemory: a batch of 1 cannot be split!", + true), + ("pre-24.02 jni.RetryOOM (no Gpu prefix)", + "com.nvidia.spark.rapids.jni.RetryOOM: GPU OutOfMemory", + true), + ("CpuSplitAndRetryOOM should not match", + "com.nvidia.spark.rapids.jni.CpuSplitAndRetryOOM: " + + "CPU OutOfMemory", + false), + ("CpuRetryOOM should not match", + "com.nvidia.spark.rapids.jni.CpuRetryOOM: CPU OutOfMemory", + false), + ("NullPointerException should not match", + "java.lang.NullPointerException: some error", + false), + ("ExecutorLostFailure should not match", + "ExecutorLostFailure (executor 5 exited) Exit status: 137", + false) + ) + + gpuOomTestCases.foreach { case (desc, reason, expected) => + test(s"isGpuOom: $desc") { + assert(SparkRapidsOomExceptions.isGpuOom(reason) === expected) + } + } +} diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/tuning/BaseAutoTunerSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/tuning/BaseAutoTunerSuite.scala index 9279625f4..ceffc08c7 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/tuning/BaseAutoTunerSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/tuning/BaseAutoTunerSuite.scala @@ -49,8 +49,8 @@ class AppInfoProviderMockTest(val maxInput: Double, val meanShuffleRead: Double, val shuffleStagesWithPosSpilling: Set[Long], val shuffleSkewStages: Set[Long], - val scanStagesWithGpuOom: Boolean, - val shuffleStagesWithOom: Boolean, + val scanStagesWithGpuOomSet: Set[Long], + val shuffleStagesWithOomSet: Set[Long], val maxColumnarExchangeDataSizeBytes: Option[Long] = None) extends BaseProfilingAppSummaryInfoProvider { override def isAppInfoAvailable = true @@ -69,8 +69,8 @@ class AppInfoProviderMockTest(val maxInput: Double, override def getRedundantReadSize: Long = redundantReadSize override def getShuffleStagesWithPosSpilling: Set[Long] = shuffleStagesWithPosSpilling override def getShuffleSkewStages: Set[Long] = shuffleSkewStages - override def hasScanStagesWithGpuOom: Boolean = scanStagesWithGpuOom - override def hasShuffleStagesWithOom: Boolean = shuffleStagesWithOom + override def scanStagesWithGpuOom: Set[Long] = scanStagesWithGpuOomSet + override def shuffleStagesWithOom: Set[Long] = shuffleStagesWithOomSet override def getMaxColumnarExchangeDataSizeBytes: Option[Long] = maxColumnarExchangeDataSizeBytes /** @@ -135,8 +135,8 @@ abstract class BaseAutoTunerSuite extends AnyFunSuite with BeforeAndAfterEach meanShuffleRead: Double = 0.0, shuffleStagesWithPosSpilling: Set[Long] = Set(), shuffleSkewStages: Set[Long] = Set(), - scanStagesWithGpuOom: Boolean = false, - shuffleStagesWithOom: Boolean = false, + scanStagesWithGpuOom: Set[Long] = Set(), + shuffleStagesWithOom: Set[Long] = Set(), maxColumnarExchangeDataSizeBytes: Option[Long] = None): AppInfoProviderMockTest = { new AppInfoProviderMockTest(maxInput, spilledMetrics, jvmGCFractions, propsFromLog, sparkVersion, rapidsJars, distinctLocationPct, redundantReadSize, meanInput, meanShuffleRead, diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/tuning/ProfilingAutoTunerSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/tuning/ProfilingAutoTunerSuite.scala index 140399825..9ed73e57d 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/tuning/ProfilingAutoTunerSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/tuning/ProfilingAutoTunerSuite.scala @@ -3136,7 +3136,8 @@ class ProfilingAutoTunerSuite extends ProfilingAutoTunerSuiteBase { "spark.plugins" -> "com.nvidia.spark.SQLPlugin", "spark.rapids.sql.concurrentGpuTasks" -> "4") val infoProvider = getMockInfoProvider(0, Seq(0), Seq(0.0), - logEventsProps, Some(testSparkVersion), scanStagesWithGpuOom = hasGpuOOm) + logEventsProps, Some(testSparkVersion), + scanStagesWithGpuOom = if (hasGpuOOm) Set(1L) else Set.empty) val platform = PlatformFactory.createInstance(PlatformNames.DATAPROC) // Configure cluster info using Platform's existing method @@ -3288,7 +3289,7 @@ class ProfilingAutoTunerSuite extends ProfilingAutoTunerSuiteBase { "spark.plugins" -> "com.nvidia.spark.SQLPlugin", "spark.rapids.sql.concurrentGpuTasks" -> "4") val infoProvider = getMockInfoProvider(0, Seq(0), Seq(0.0), - logEventsProps, Some(testSparkVersion), shuffleStagesWithOom = true) + logEventsProps, Some(testSparkVersion), shuffleStagesWithOom = Set(1L)) val platform = PlatformFactory.createInstance(PlatformNames.DATAPROC) // Configure cluster info using Platform's existing method @@ -3376,7 +3377,7 @@ class ProfilingAutoTunerSuite extends ProfilingAutoTunerSuiteBase { "spark.plugins" -> "com.nvidia.spark.SQLPlugin", "spark.rapids.sql.concurrentGpuTasks" -> "4") val infoProvider = getMockInfoProvider(0, Seq(0), Seq(0.0), - logEventsProps, Some(testSparkVersion), shuffleStagesWithOom = true, + logEventsProps, Some(testSparkVersion), shuffleStagesWithOom = Set(1L), meanInput = 50000, meanShuffleRead = 80000) val platform = PlatformFactory.createInstance(PlatformNames.DATAPROC) diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/tuning/ProfilingAutoTunerSuiteV2.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/tuning/ProfilingAutoTunerSuiteV2.scala index 834b95480..db47aefe1 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/tuning/ProfilingAutoTunerSuiteV2.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/tuning/ProfilingAutoTunerSuiteV2.scala @@ -1692,8 +1692,8 @@ class ProfilingAutoTunerSuiteV2 extends ProfilingAutoTunerSuiteBase { redundantReadSize = 0L, shuffleStagesWithPosSpilling = Set.empty, shuffleSkewStages = Set.empty, - scanStagesWithGpuOom = false, - shuffleStagesWithOom = false, + scanStagesWithGpuOom = Set.empty, + shuffleStagesWithOom = Set.empty, maxColumnarExchangeDataSizeBytes = Some(1000L * 1024 * 1024 * 1024) // 1000GB ) val platform = PlatformFactory.createInstance(PlatformNames.DATAPROC, Some(targetClusterInfo)) From 091ae29fa1cb7266fac88a930648a23ad7325544 Mon Sep 17 00:00:00 2001 From: Partho Sarthi Date: Wed, 8 Apr 2026 10:23:33 -0700 Subject: [PATCH 02/16] Update copyright years in ProfileArgs.scala and OutHeaderRegistry.scala to reflect 2026 --- .../com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala | 2 +- .../com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala index a660c4ac6..076a50dd3 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2025, NVIDIA CORPORATION. + * Copyright (c) 2021-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala index 4a0bd37be..cf9b0e2bb 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. + * Copyright (c) 2025-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From e8a6830603bd2acdba0f07bdfb0169f3c313e610 Mon Sep 17 00:00:00 2001 From: Partho Sarthi Date: Wed, 8 Apr 2026 14:58:38 -0700 Subject: [PATCH 03/16] Update data types and improve OOM error handling in profiling metrics - Changed data type of `maxTaskInputBytesRead` from Double to Long in `coreRawMetricsReport.yaml`. - Refactored methods in `ApplicationSummaryInfo.scala` to read pre-computed values for GPU OOM error handling, enhancing performance and reducing duplicate computations. - Updated comments in `ProfileClassWarehouse.scala` for clarity on GPU OOM exception handling. These changes aim to improve the accuracy and efficiency of profiling metrics related to GPU memory management. --- .../configs/reports/coreRawMetricsReport.yaml | 2 +- .../profiling/ApplicationSummaryInfo.scala | 26 +++++++------------ .../profiling/ProfileClassWarehouse.scala | 5 ++-- 3 files changed, 13 insertions(+), 20 deletions(-) diff --git a/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml b/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml index a89482665..eee4a919a 100644 --- a/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml +++ b/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml @@ -107,7 +107,7 @@ reportDefinitions: description: >- TBD - name: maxTaskInputBytesRead - dataType: Double + dataType: Long description: >- Maximum per-task input bytes read across all SQLs - name: maxColumnarExchangeDataSizeBytes diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala index 4adefb272..8be679802 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala @@ -230,36 +230,28 @@ class SingleAppSummaryInfoProvider( } /** - * Returns stage IDs of scan stages with failed tasks due to GPU OOM errors - * (GpuRetryOOM and GpuSplitAndRetryOOM). + * Returns stage IDs of scan stages with failed tasks due to GPU OOM errors. + * Reads from the pre-computed values in AppInfoProfileResults to avoid + * duplicate computation (Profiler.processApp already computes these). */ override def scanStagesWithGpuOom: Set[Long] = { - SingleAppSummaryInfoProvider.computeScanStagesWithGpuOom( - app.appInfo.exists(_.pluginEnabled), - app.failedTasks, app.stageMetrics, appInfo) + app.appInfo.headOption.map(_.scanStagesWithGpuOom).getOrElse(Set.empty) } /** * Returns stage IDs of failed shuffle stages with OOM errors in the task's end reason. - * Note: This check is enabled only if the plugin is enabled (i.e. GPU app) and running on YARN. + * Reads from the pre-computed values in AppInfoProfileResults. */ override def shuffleStagesWithOom: Set[Long] = { - SingleAppSummaryInfoProvider.computeShuffleStagesWithOom( - app.appInfo.exists(_.pluginEnabled), - getSparkProperty("spark.master"), - app.failedStages, app.failedTasks) + app.appInfo.headOption.map(_.shuffleStagesWithOom).getOrElse(Set.empty) } /** - * Get the maximum data size from ColumnarExchange metrics. - * This method searches through SQLPlan metrics to find all ColumnarExchange nodes - * with "data size" metrics and returns the maximum total value. - * - * @return Option[Long] containing the maximum data size in bytes, or None if no - * ColumnarExchange "data size" metrics are found + * Returns the maximum data size from ColumnarExchange metrics. + * Reads from the pre-computed value in AppInfoProfileResults. */ override def getMaxColumnarExchangeDataSizeBytes: Option[Long] = { - SingleAppSummaryInfoProvider.computeMaxColumnarExchangeDataSizeBytes(app.sqlMetrics) + app.appInfo.headOption.flatMap(_.maxColumnarExchangeDataSizeBytes) } override def getClassPathEntries: Map[String, String] = { diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala index 8a0422d0e..c990c2b88 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala @@ -1532,9 +1532,10 @@ case class RecommendedCommentResult(comment: String) { object SparkRapidsOomExceptions { // Current JNI: GpuOOM -> GpuRetryOOM, GpuSplitAndRetryOOM // Pre-24.02 JNI: jni.SplitAndRetryOOM, jni.RetryOOM (no Gpu prefix) - // Using "jni." prefix to avoid matching CpuSplitAndRetryOOM + // Using "jni." prefix to avoid matching CpuSplitAndRetryOOM / CpuRetryOOM + // Using "jni.GpuOOM" (anchored) for the base class to avoid partial matches val gpuExceptionClassNames: Set[String] = - Set("GpuSplitAndRetryOOM", "GpuRetryOOM", "GpuOOM", + Set("GpuSplitAndRetryOOM", "GpuRetryOOM", "jni.GpuOOM", "jni.SplitAndRetryOOM", "jni.RetryOOM") val gpuShuffleClassName: String = "GpuShuffleExchangeExec" From 412b67984a98f15ef8bddfd7660b627c57d7c489 Mon Sep 17 00:00:00 2001 From: Partho Sarthi Date: Thu, 16 Apr 2026 07:11:06 -0700 Subject: [PATCH 04/16] Extract tuning metrics into separate app_tuning_metrics.csv Move 4 AutoTuner input fields (maxTaskInputBytesRead, maxColumnarExchangeDataSizeBytes, scanStagesWithGpuOom, shuffleStagesWithOom) from application_information.csv into a new app_tuning_metrics.csv file. These are tuning-specific signals that don't belong in the core application info table. Also addresses review feedback: - Anchor GpuOOM pattern as jni.GpuOOM to avoid partial matches - Change maxTaskInputBytesRead YAML dataType from Double to Long Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Partho Sarthi --- .../configs/reports/coreRawMetricsReport.yaml | 43 ++++++----- .../profiling/ApplicationSummaryInfo.scala | 27 +++---- .../tool/profiling/CollectInformation.scala | 15 ---- .../profiling/ProfileClassWarehouse.scala | 71 +++++++++++++------ .../rapids/tool/profiling/Profiler.scala | 15 ++-- .../rapids/tool/views/OutHeaderRegistry.scala | 5 +- .../tool/views/QualRawReportGenerator.scala | 15 ++-- .../spark/rapids/tool/views/package.scala | 1 + .../tool/profiling/ApplicationInfoSuite.scala | 8 +-- 9 files changed, 111 insertions(+), 89 deletions(-) diff --git a/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml b/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml index eee4a919a..16d35afed 100644 --- a/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml +++ b/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml @@ -106,22 +106,6 @@ reportDefinitions: dataType: Long description: >- TBD - - name: maxTaskInputBytesRead - dataType: Long - description: >- - Maximum per-task input bytes read across all SQLs - - name: maxColumnarExchangeDataSizeBytes - dataType: Long - description: >- - Maximum data size from ColumnarExchange metrics (profiling only) - - name: scanStagesWithGpuOom - dataType: String - description: >- - Comma-separated stage IDs of scan stages with GPU OOM (profiling only) - - name: shuffleStagesWithOom - dataType: String - description: >- - Comma-separated stage IDs of shuffle stages with OOM (profiling only) # AppLogPathProfileResults - label: coreRawApplicationLogPathMappingCSV description: >- @@ -1442,3 +1426,30 @@ reportDefinitions: fileName: profile.log fileFormat: TXT scope: per-app + # AppTuningMetricsProfileResult + - label: coreRawAppTuningMetricsCSV + description: >- + Per-app metrics used by the AutoTuner for partition sizing and cluster sizing. + fileName: app_tuning_metrics.csv + scope: per-app + columns: + - name: appId + dataType: String + description: >- + Application ID + - name: maxTaskInputBytesRead + dataType: Long + description: >- + Maximum per-task input bytes read across all SQLs + - name: maxColumnarExchangeDataSizeBytes + dataType: Long + description: >- + Maximum data size from ColumnarExchange metrics (profiling only) + - name: scanStagesWithGpuOom + dataType: String + description: >- + Comma-separated stage IDs of scan stages with GPU OOM (profiling only) + - name: shuffleStagesWithOom + dataType: String + description: >- + Comma-separated stage IDs of shuffle stages with OOM (profiling only) diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala index 8be679802..3c50ee477 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala @@ -54,7 +54,8 @@ case class ApplicationSummaryInfo( sqlCleanedAlignedIds: Seq[SQLCleanAndAlignIdsProfileResult], sparkRapidsBuildInfo: Seq[SparkRapidsBuildInfoEvent], writeOpsInfo: Seq[WriteOpProfileResult], - sqlPlanInfo: Seq[SQLPlanInfoProfileResult]) + sqlPlanInfo: Seq[SQLPlanInfoProfileResult], + appTuningMetrics: Seq[AppTuningMetricsProfileResult] = Seq.empty) trait AppInfoPropertyGetter { // returns all the properties (i.e., spark) @@ -229,29 +230,21 @@ class SingleAppSummaryInfoProvider( } } - /** - * Returns stage IDs of scan stages with failed tasks due to GPU OOM errors. - * Reads from the pre-computed values in AppInfoProfileResults to avoid - * duplicate computation (Profiler.processApp already computes these). - */ override def scanStagesWithGpuOom: Set[Long] = { - app.appInfo.headOption.map(_.scanStagesWithGpuOom).getOrElse(Set.empty) + SingleAppSummaryInfoProvider.computeScanStagesWithGpuOom( + app.appInfo.exists(_.pluginEnabled), + app.failedTasks, app.stageMetrics, appInfo) } - /** - * Returns stage IDs of failed shuffle stages with OOM errors in the task's end reason. - * Reads from the pre-computed values in AppInfoProfileResults. - */ override def shuffleStagesWithOom: Set[Long] = { - app.appInfo.headOption.map(_.shuffleStagesWithOom).getOrElse(Set.empty) + SingleAppSummaryInfoProvider.computeShuffleStagesWithOom( + app.appInfo.exists(_.pluginEnabled), + getSparkProperty("spark.master"), + app.failedStages, app.failedTasks) } - /** - * Returns the maximum data size from ColumnarExchange metrics. - * Reads from the pre-computed value in AppInfoProfileResults. - */ override def getMaxColumnarExchangeDataSizeBytes: Option[Long] = { - app.appInfo.headOption.flatMap(_.maxColumnarExchangeDataSizeBytes) + SingleAppSummaryInfoProvider.computeMaxColumnarExchangeDataSizeBytes(app.sqlMetrics) } override def getClassPathEntries: Map[String, String] = { diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/CollectInformation.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/CollectInformation.scala index 8b165733f..e04a74ea0 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/CollectInformation.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/CollectInformation.scala @@ -37,21 +37,6 @@ class CollectInformation(apps: Seq[ApplicationInfo]) extends Logging { ProfInformationView.getRawView(apps) } - // Extends AppInfoProfileResults with fields that require aggregate metrics or - // cross-referencing (e.g., failed tasks with stage metrics). Fields like - // totalCoreSeconds are already populated at the view layer in InformationView. - def getExtendedAppInfo( - maxTaskInputBytesRead: Double, - maxColumnarExchangeDataSizeBytes: Option[Long], - scanStagesWithGpuOom: Set[Long], - shuffleStagesWithOom: Set[Long]): Seq[AppInfoProfileResults] = { - getAppInfo.map(_.copy( - maxTaskInputBytesRead = maxTaskInputBytesRead, - maxColumnarExchangeDataSizeBytes = maxColumnarExchangeDataSizeBytes, - scanStagesWithGpuOom = scanStagesWithGpuOom, - shuffleStagesWithOom = shuffleStagesWithOom)) - } - def getAppLogPath: Seq[AppLogPathProfileResults] = { ProfLogPathView.getRawView(apps) } diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala index c990c2b88..7079220af 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala @@ -463,11 +463,7 @@ case class AppInfoProfileResults( sparkRuntime: SparkRuntime.SparkRuntime, sparkVersion: String, pluginEnabled: Boolean, - totalCoreSeconds: Long, - maxTaskInputBytesRead: Double = 0.0, - maxColumnarExchangeDataSizeBytes: Option[Long] = None, - scanStagesWithGpuOom: Set[Long] = Set.empty, - shuffleStagesWithOom: Set[Long] = Set.empty) extends ProfileResult { + totalCoreSeconds: Long) extends ProfileResult { override def outputHeaders: Array[String] = { OutHeaderRegistry.outputHeaders("AppInfoProfileResults") } @@ -500,23 +496,11 @@ case class AppInfoProfileResults( } } - private def maxColumnarExchangeToStr: String = { - maxColumnarExchangeDataSizeBytes.map(_.toString).getOrElse("") - } - - private def stageIdsToStr(stageIds: Set[Long]): String = { - if (stageIds.isEmpty) "" else stageIds.toSeq.sorted.mkString(",") - } - override def convertToSeq(): Array[String] = { Array(appName, appIdToStr, attemptIdToStr, sparkUser, startTime.toString, endTimeToStr, durToStr, durationStr, sparkRuntime.toString, sparkVersion, pluginEnabled.toString, - totalCoreSeconds.toString, - f"$maxTaskInputBytesRead%.0f", - maxColumnarExchangeToStr, - stageIdsToStr(scanStagesWithGpuOom), - stageIdsToStr(shuffleStagesWithOom)) + totalCoreSeconds.toString) } override def convertToCSVSeq(): Array[String] = { @@ -528,11 +512,52 @@ case class AppInfoProfileResults( StringUtils.reformatCSVString(sparkRuntime.toString), StringUtils.reformatCSVString(sparkVersion), pluginEnabled.toString, - totalCoreSeconds.toString, - f"$maxTaskInputBytesRead%.0f", - maxColumnarExchangeToStr, - StringUtils.reformatCSVString(stageIdsToStr(scanStagesWithGpuOom)), - StringUtils.reformatCSVString(stageIdsToStr(shuffleStagesWithOom))) + totalCoreSeconds.toString) + } +} + +case class AppTuningMetricsProfileResult( + appId: String, + maxTaskInputBytesRead: String, + maxColumnarExchangeDataSizeBytes: String, + scanStagesWithGpuOom: String, + shuffleStagesWithOom: String) extends ProfileResult { + override def outputHeaders: Array[String] = { + OutHeaderRegistry.outputHeaders("AppTuningMetricsProfileResult") + } + + override def convertToSeq(): Array[String] = { + Array(appId, maxTaskInputBytesRead, maxColumnarExchangeDataSizeBytes, + scanStagesWithGpuOom, shuffleStagesWithOom) + } + + override def convertToCSVSeq(): Array[String] = { + Array(StringUtils.reformatCSVString(appId), + maxTaskInputBytesRead, + maxColumnarExchangeDataSizeBytes, + StringUtils.reformatCSVString(scanStagesWithGpuOom), + StringUtils.reformatCSVString(shuffleStagesWithOom)) + } +} + +object AppTuningMetricsProfileResult { + private def stageIdsToStr(stageIds: Set[Long]): String = { + if (stageIds.isEmpty) "" else stageIds.toSeq.sorted.mkString(",") + } + + def apply( + appId: String, + maxTaskInputBytesRead: Double, + maxColumnarExchangeDataSizeBytes: Option[Long], + scanStagesWithGpuOom: Set[Long], + shuffleStagesWithOom: Set[Long]): AppTuningMetricsProfileResult = { + AppTuningMetricsProfileResult( + appId = appId, + maxTaskInputBytesRead = f"$maxTaskInputBytesRead%.0f", + maxColumnarExchangeDataSizeBytes = + maxColumnarExchangeDataSizeBytes.map(_.toString).getOrElse(""), + scanStagesWithGpuOom = stageIdsToStr(scanStagesWithGpuOom), + shuffleStagesWithOom = stageIdsToStr(shuffleStagesWithOom)) } } diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala index 3b7bd878e..c794b9703 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala @@ -305,7 +305,7 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea val failedTasks = healthCheck.getFailedTasks val failedStages = healthCheck.getFailedStages - // Compute AutoTuner inputs to enrich application_information.csv + // Compute AutoTuner inputs for app_tuning_metrics.csv val singleApp = analyzedApps.head val pluginEnabled = singleApp.gpuMode val maxTaskInput = analysis.maxTaskInputSizes.headOption @@ -318,9 +318,12 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea pluginEnabled, singleApp.sparkProperties.get("spark.master"), failedStages, failedTasks) - val appInfo = collect.getExtendedAppInfo( - maxTaskInput, maxColumnarExchange, scanOomStages, shuffleOomStages) - logDebug(s"Time to collect Profiling Info [${appInfo.head.appId}]: ${endTime - startTime}.") + val appInfo = collect.getAppInfo + val appId = appInfo.headOption.flatMap(_.appId).getOrElse("") + val tuningMetrics = Seq(AppTuningMetricsProfileResult( + appId, maxTaskInput, maxColumnarExchange, scanOomStages, shuffleOomStages)) + + logDebug(s"Time to collect Profiling Info [$appId]: ${endTime - startTime}.") val appInfoSummary = ApplicationSummaryInfo( appInfo = appInfo, dsInfo = collect.getDataSourceInfo(sqlMetrics), @@ -351,7 +354,8 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea sqlCleanedAlignedIds = sqlIdAlign, sparkRapidsBuildInfo = collect.getSparkRapidsInfo, writeOpsInfo = collect.getWriteOperationInfo, - sqlPlanInfo = collect.getSQLPlanInfoTruncated) + sqlPlanInfo = collect.getSQLPlanInfoTruncated, + appTuningMetrics = tuningMetrics) (appInfoSummary, DiagnosticSummaryInfo(analysis.stageDiagnostics, collect.getIODiagnosticMetrics)) } @@ -424,6 +428,7 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea // writeOps are generated in only CSV format profileOutputWriter.writeCSVTable(ProfWriteOpsView.getLabel, app.writeOpsInfo) profileOutputWriter.writeCSVTable(TASK_SHUFFLE_SKEW, app.skewInfo) + profileOutputWriter.writeCSVTable(APP_TUNING_METRICS, app.appTuningMetrics) profileOutputWriter.writeText("\n### C. Health Check###\n") profileOutputWriter.writeCSVTable(ProfFailedTaskView.getLabel, app.failedTasks) profileOutputWriter.writeTable(ProfFailedStageView.getLabel, app.failedStages) diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala index cf9b0e2bb..9e7c48408 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala @@ -69,8 +69,9 @@ object OutHeaderRegistry { Array("sqlID", "nodeID", "nodeName", "nodeDescription", "reason"), "AppInfoProfileResults" -> Array("appName", "appId", "attemptId", "sparkUser", "startTime", "endTime", "duration", - "durationStr", "sparkRuntime", "sparkVersion", "pluginEnabled", "totalCoreSeconds", - "maxTaskInputBytesRead", "maxColumnarExchangeDataSizeBytes", + "durationStr", "sparkRuntime", "sparkVersion", "pluginEnabled", "totalCoreSeconds"), + "AppTuningMetricsProfileResult" -> + Array("appId", "maxTaskInputBytesRead", "maxColumnarExchangeDataSizeBytes", "scanStagesWithGpuOom", "shuffleStagesWithOom"), "AppLogPathProfileResults" -> Array("appName", "appId", "eventLogPath"), diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala index a9e146ed6..3f2a4cd22 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala @@ -17,7 +17,7 @@ package com.nvidia.spark.rapids.tool.views import com.nvidia.spark.rapids.tool.analysis.{AggRawMetricsResult, AppSQLPlanAnalyzer, QualSparkMetricsAggregator} -import com.nvidia.spark.rapids.tool.profiling.{DataSourceProfileResult, ProfileOutputWriter, ProfileResult, SQLAccumProfileResults} +import com.nvidia.spark.rapids.tool.profiling.{AppTuningMetricsProfileResult, DataSourceProfileResult, ProfileOutputWriter, ProfileResult, SQLAccumProfileResults} import org.apache.spark.internal.Logging import org.apache.spark.sql.rapids.tool.qualification.QualificationAppInfo @@ -72,18 +72,15 @@ object QualRawReportGenerator extends Logging { new ProfileOutputWriter(metricsDirectory, "profile", 10000000, outputCSV = true) try { // Compute aggregate metrics early so maxTaskInputBytesRead is available for - // application_information.csv enrichment + // app_tuning_metrics.csv val aggRawMetrics = QualSparkMetricsAggregator .getAggRawMetrics(app, sqlAnalyzer = Some(sqlPlanAnalyzer)) val maxTaskInput = aggRawMetrics.maxTaskInputSizes.headOption .map(_.maxTaskInputBytesRead).getOrElse(0.0) pWriter.writeText("### A. Information Collected ###") - // Extend application info with maxTaskInputBytesRead from aggregate metrics. - // OOM and ColumnarExchange columns are empty for qualification (CPU event logs). - val extendedAppInfo = QualInformationView.getRawView(Seq(app)).map( - _.copy(maxTaskInputBytesRead = maxTaskInput)) - pWriter.writeTable(QualInformationView.getLabel, extendedAppInfo) + pWriter.writeTable( + QualInformationView.getLabel, QualInformationView.getRawView(Seq(app))) pWriter.writeTable(QualLogPathView.getLabel, QualLogPathView.getRawView(Seq(app))) val sqlPlanMetricsResults = generateSQLProcessingView(pWriter, sqlPlanAnalyzer) pWriter.writeJsonL( @@ -110,6 +107,10 @@ object QualRawReportGenerator extends Logging { constructLabelsMaps(aggRawMetrics).foreach { case (label, metrics) => pWriter.writeCSVTable(label, metrics) } + // Write tuning metrics (GPU-only fields are empty for qualification) + val tuningMetrics = Seq(AppTuningMetricsProfileResult( + app.appId, maxTaskInput, Option.empty[Long], Set.empty[Long], Set.empty[Long])) + pWriter.writeCSVTable(APP_TUNING_METRICS, tuningMetrics) pWriter.writeText("\n### C. Health Check###\n") pWriter.writeCSVTable(QualFailedTaskView.getLabel, QualFailedTaskView.getRawView(Seq(app))) pWriter.writeTable( diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/package.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/package.scala index bab4b8a2b..60a5f28cf 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/package.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/package.scala @@ -29,6 +29,7 @@ package object views { val SQL_DUR_LABEL = "SQL Duration and Executor CPU Time Percent" val SQL_MAX_INPUT_SIZE = "SQL Max Task Input Size" val STAGE_DIAGNOSTICS_LABEL = "Stage Level Diagnostic Metrics" + val APP_TUNING_METRICS = "App Tuning Metrics" val CLUSTER_INFORMATION_LABEL = "Cluster Information" val AGG_DESCRIPTION = Map( diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationInfoSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationInfoSuite.scala index 8af32d73c..60e2572cb 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationInfoSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationInfoSuite.scala @@ -901,22 +901,22 @@ class ApplicationInfoSuite extends AnyFunSuite with Logging { ("failures without diagnostic views", s"$logDir/tasks_executors_fail_compressed_eventlog.zstd", "application_1603128018386_7846", - 20, + 21, false), ("failures with diagnostic views", s"$logDir/tasks_executors_fail_compressed_eventlog.zstd", "application_1603128018386_7846", - 22, + 23, true), ("gpu without diagnostic views", s"$qualLogDir/udf_dataset_eventlog", "local-1651188809790", - 16, + 17, false), ("gpu with diagnostic views", s"$qualLogDir/udf_dataset_eventlog", "local-1651188809790", - 18, + 19, true) ) From 7531a6bd2bdedac40a1adbd98bac2313668f2c85 Mon Sep 17 00:00:00 2001 From: Partho Sarthi Date: Tue, 21 Apr 2026 22:32:36 -0700 Subject: [PATCH 05/16] Rename output file to application_tuning_metrics.csv, default values to 0 - Rename file from app_tuning_metrics.csv to application_tuning_metrics.csv (constant APP_TUNING_METRICS -> APPLICATION_TUNING_METRICS). - Change maxTaskInputBytesRead and maxColumnarExchangeDataSizeBytes in AppTuningMetricsProfileResult to Long with default 0, so missing values render as "0" symmetrically instead of mixing empty strings and numbers. - Add application_tuning_metrics.csv to the per-app output tree comment in coreRawMetricsReport.yaml. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Partho Sarthi --- .../configs/reports/coreRawMetricsReport.yaml | 3 ++- .../rapids/tool/profiling/ProfileClassWarehouse.scala | 9 ++++----- .../nvidia/spark/rapids/tool/profiling/Profiler.scala | 9 +++++---- .../rapids/tool/views/QualRawReportGenerator.scala | 10 +++++----- .../com/nvidia/spark/rapids/tool/views/package.scala | 2 +- 5 files changed, 17 insertions(+), 16 deletions(-) diff --git a/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml b/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml index 16d35afed..d6ebb0765 100644 --- a/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml +++ b/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml @@ -18,6 +18,7 @@ # / # ├── application_information.csv # ├── application_log_path_mapping.csv +# ├── application_tuning_metrics.csv # ├── cluster_information.json # ├── data_source_information.csv # ├── executor_information.csv @@ -1430,7 +1431,7 @@ reportDefinitions: - label: coreRawAppTuningMetricsCSV description: >- Per-app metrics used by the AutoTuner for partition sizing and cluster sizing. - fileName: app_tuning_metrics.csv + fileName: application_tuning_metrics.csv scope: per-app columns: - name: appId diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala index 7079220af..6bfe64554 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala @@ -547,15 +547,14 @@ object AppTuningMetricsProfileResult { def apply( appId: String, - maxTaskInputBytesRead: Double, - maxColumnarExchangeDataSizeBytes: Option[Long], + maxTaskInputBytesRead: Long, + maxColumnarExchangeDataSizeBytes: Long, scanStagesWithGpuOom: Set[Long], shuffleStagesWithOom: Set[Long]): AppTuningMetricsProfileResult = { AppTuningMetricsProfileResult( appId = appId, - maxTaskInputBytesRead = f"$maxTaskInputBytesRead%.0f", - maxColumnarExchangeDataSizeBytes = - maxColumnarExchangeDataSizeBytes.map(_.toString).getOrElse(""), + maxTaskInputBytesRead = maxTaskInputBytesRead.toString, + maxColumnarExchangeDataSizeBytes = maxColumnarExchangeDataSizeBytes.toString, scanStagesWithGpuOom = stageIdsToStr(scanStagesWithGpuOom), shuffleStagesWithOom = stageIdsToStr(shuffleStagesWithOom)) } diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala index c794b9703..734cb2393 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala @@ -305,11 +305,11 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea val failedTasks = healthCheck.getFailedTasks val failedStages = healthCheck.getFailedStages - // Compute AutoTuner inputs for app_tuning_metrics.csv + // Compute AutoTuner inputs for application_tuning_metrics.csv val singleApp = analyzedApps.head val pluginEnabled = singleApp.gpuMode val maxTaskInput = analysis.maxTaskInputSizes.headOption - .map(_.maxTaskInputBytesRead).getOrElse(0.0) + .map(_.maxTaskInputBytesRead.toLong).getOrElse(0L) val maxColumnarExchange = SingleAppSummaryInfoProvider.computeMaxColumnarExchangeDataSizeBytes(sqlMetrics) val scanOomStages = SingleAppSummaryInfoProvider.computeScanStagesWithGpuOom( @@ -321,7 +321,8 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea val appInfo = collect.getAppInfo val appId = appInfo.headOption.flatMap(_.appId).getOrElse("") val tuningMetrics = Seq(AppTuningMetricsProfileResult( - appId, maxTaskInput, maxColumnarExchange, scanOomStages, shuffleOomStages)) + appId, maxTaskInput, maxColumnarExchange.getOrElse(0L), + scanOomStages, shuffleOomStages)) logDebug(s"Time to collect Profiling Info [$appId]: ${endTime - startTime}.") val appInfoSummary = ApplicationSummaryInfo( @@ -428,7 +429,7 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea // writeOps are generated in only CSV format profileOutputWriter.writeCSVTable(ProfWriteOpsView.getLabel, app.writeOpsInfo) profileOutputWriter.writeCSVTable(TASK_SHUFFLE_SKEW, app.skewInfo) - profileOutputWriter.writeCSVTable(APP_TUNING_METRICS, app.appTuningMetrics) + profileOutputWriter.writeCSVTable(APPLICATION_TUNING_METRICS, app.appTuningMetrics) profileOutputWriter.writeText("\n### C. Health Check###\n") profileOutputWriter.writeCSVTable(ProfFailedTaskView.getLabel, app.failedTasks) profileOutputWriter.writeTable(ProfFailedStageView.getLabel, app.failedStages) diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala index 3f2a4cd22..97a8ca40b 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala @@ -72,11 +72,11 @@ object QualRawReportGenerator extends Logging { new ProfileOutputWriter(metricsDirectory, "profile", 10000000, outputCSV = true) try { // Compute aggregate metrics early so maxTaskInputBytesRead is available for - // app_tuning_metrics.csv + // application_tuning_metrics.csv val aggRawMetrics = QualSparkMetricsAggregator .getAggRawMetrics(app, sqlAnalyzer = Some(sqlPlanAnalyzer)) val maxTaskInput = aggRawMetrics.maxTaskInputSizes.headOption - .map(_.maxTaskInputBytesRead).getOrElse(0.0) + .map(_.maxTaskInputBytesRead.toLong).getOrElse(0L) pWriter.writeText("### A. Information Collected ###") pWriter.writeTable( @@ -107,10 +107,10 @@ object QualRawReportGenerator extends Logging { constructLabelsMaps(aggRawMetrics).foreach { case (label, metrics) => pWriter.writeCSVTable(label, metrics) } - // Write tuning metrics (GPU-only fields are empty for qualification) + // Write tuning metrics (GPU-only fields default to 0/empty for qualification) val tuningMetrics = Seq(AppTuningMetricsProfileResult( - app.appId, maxTaskInput, Option.empty[Long], Set.empty[Long], Set.empty[Long])) - pWriter.writeCSVTable(APP_TUNING_METRICS, tuningMetrics) + app.appId, maxTaskInput, 0L, Set.empty[Long], Set.empty[Long])) + pWriter.writeCSVTable(APPLICATION_TUNING_METRICS, tuningMetrics) pWriter.writeText("\n### C. Health Check###\n") pWriter.writeCSVTable(QualFailedTaskView.getLabel, QualFailedTaskView.getRawView(Seq(app))) pWriter.writeTable( diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/package.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/package.scala index 60a5f28cf..9a0a22725 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/package.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/package.scala @@ -29,7 +29,7 @@ package object views { val SQL_DUR_LABEL = "SQL Duration and Executor CPU Time Percent" val SQL_MAX_INPUT_SIZE = "SQL Max Task Input Size" val STAGE_DIAGNOSTICS_LABEL = "Stage Level Diagnostic Metrics" - val APP_TUNING_METRICS = "App Tuning Metrics" + val APPLICATION_TUNING_METRICS = "Application Tuning Metrics" val CLUSTER_INFORMATION_LABEL = "Cluster Information" val AGG_DESCRIPTION = Map( From e41bd1708ef9cb7b4a298271fcd5e58875fa8971 Mon Sep 17 00:00:00 2001 From: Partho Sarthi Date: Wed, 22 Apr 2026 02:44:04 -0700 Subject: [PATCH 06/16] Remove redundant comments in ApplicationSummaryInfo.scala and QualRawReportGenerator.scala to enhance code clarity and maintainability. --- .../spark/rapids/tool/profiling/ApplicationSummaryInfo.scala | 3 --- .../spark/rapids/tool/views/QualRawReportGenerator.scala | 1 - 2 files changed, 4 deletions(-) diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala index 3c50ee477..5965e28c0 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala @@ -255,7 +255,6 @@ class SingleAppSummaryInfoProvider( object SingleAppSummaryInfoProvider { /** * Computes the set of scan stage IDs that had GPU OOM failures. - * This static method enables computing the result before ApplicationSummaryInfo is assembled. */ def computeScanStagesWithGpuOom( pluginEnabled: Boolean, @@ -293,7 +292,6 @@ object SingleAppSummaryInfoProvider { * Computes the set of shuffle stage IDs that had container OOM failures (YARN only). * Detects ExecutorLostFailure with exit code 137 (SIGKILL from container memory enforcement). * See: https://github.com/NVIDIA/spark-rapids-tools/issues/1566 - * This static method enables computing the result before ApplicationSummaryInfo is assembled. */ def computeShuffleStagesWithOom( pluginEnabled: Boolean, @@ -341,7 +339,6 @@ object SingleAppSummaryInfoProvider { /** * Computes the maximum data size from ColumnarExchange metrics. - * This static method enables computing the result before ApplicationSummaryInfo is assembled. */ def computeMaxColumnarExchangeDataSizeBytes( sqlMetrics: Seq[SQLAccumProfileResults]): Option[Long] = { diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala index 97a8ca40b..ac0cb2afa 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala @@ -103,7 +103,6 @@ object QualRawReportGenerator extends Logging { SystemQualPropertiesView.getRawView(Seq(app)), Some(SystemQualPropertiesView.getDescription)) pWriter.writeText("\n### B. Analysis ###\n") - // Reuse already-computed aggRawMetrics instead of computing again constructLabelsMaps(aggRawMetrics).foreach { case (label, metrics) => pWriter.writeCSVTable(label, metrics) } From 9652fc3494951d52fa4dec06551644cb9ce31e7b Mon Sep 17 00:00:00 2001 From: Partho Sarthi Date: Wed, 22 Apr 2026 02:53:27 -0700 Subject: [PATCH 07/16] Update copyright year in package.scala to 2024-2026 --- .../main/scala/com/nvidia/spark/rapids/tool/views/package.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/package.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/package.scala index 9a0a22725..33a812eac 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/package.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/package.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 30ccfc3bb3c3aab41a579acb9294721937b64dd2 Mon Sep 17 00:00:00 2001 From: Partho Sarthi Date: Wed, 22 Apr 2026 08:27:24 -0700 Subject: [PATCH 08/16] Revert --csv default to opt-in Keep the original Scallop default (false) to avoid changing CLI behavior. Qualification always writes CSVs via ProfileOutputWriter(outputCSV = true) so application_tuning_metrics.csv is still produced there unconditionally. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Partho Sarthi --- .../com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala index 076a50dd3..84349f148 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala @@ -89,7 +89,7 @@ Usage: java -cp rapids-4-spark-tools_2.12-.jar:$SPARK_HOME/jars/* descr = "Number of thread to use for parallel processing. The default is the " + "number of cores on host divided by 4.") val csv: ScallopOption[Boolean] = - opt[Boolean](required = false, default = Some(true), + opt[Boolean](required = false, descr = "Output each table to a CSV file as well creating the summary text file.") val timeout: ScallopOption[Long] = opt[Long](required = false, From baa8a80e009fc6e56544ebfedc2ab711a788eff3 Mon Sep 17 00:00:00 2001 From: Partho Sarthi Date: Wed, 22 Apr 2026 08:30:13 -0700 Subject: [PATCH 09/16] Restore ProfileArgs.scala to origin/dev (no copyright bump) File is unchanged functionally vs dev; skip the auto-copyrighter hook to keep the copyright year at 2025 since this PR does not modify the file. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Partho Sarthi --- .../com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala index 84349f148..81a253ebf 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileArgs.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2026, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From aec9fd21694ecbde408295e6003716a79f1297df Mon Sep 17 00:00:00 2001 From: Partho Sarthi Date: Wed, 22 Apr 2026 11:02:18 -0700 Subject: [PATCH 10/16] Restructure tuning metrics as vertical tuning_signals.csv - Rename output file application_tuning_metrics.csv -> tuning_signals.csv and switch to vertical metricName,value layout (matches spark_properties.csv). appId is dropped since the file lives in a per-app directory. - Replace AppTuningMetricsProfileResult with a row-shaped TuningSignalProfileResult plus a builder that emits one row per metric. Simplifies adding new signals. - Rename trait method shuffleStagesWithOom -> gpuShuffleStagesWithContainerOom to make the GPU-only gating explicit and distinguish container-level (YARN SIGKILL) OOM from device-level GPU OOM. - Drop Gpu prefix on the static compute helper (computeShuffleStagesWithContainerOom) since it lives in the profiling-only SingleAppSummaryInfoProvider companion. - Update YAML schema, OutHeaderRegistry, Profiler/QualRawReportGenerator call sites, AutoTuner consumer, and test mocks accordingly. - ToolsAPI auto-discovers the new coreRawTuningSignalsCSV label via the YAML; no Python changes needed. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Partho Sarthi --- .../configs/reports/coreRawMetricsReport.yaml | 34 +++++-------- .../profiling/ApplicationSummaryInfo.scala | 10 ++-- .../profiling/ProfileClassWarehouse.scala | 48 ++++++++----------- .../rapids/tool/profiling/Profiler.scala | 17 +++---- .../spark/rapids/tool/tuning/AutoTuner.scala | 2 +- .../rapids/tool/views/OutHeaderRegistry.scala | 5 +- .../tool/views/QualRawReportGenerator.scala | 10 ++-- .../spark/rapids/tool/views/package.scala | 2 +- .../tool/tuning/BaseAutoTunerSuite.scala | 8 ++-- .../tool/tuning/ProfilingAutoTunerSuite.scala | 4 +- .../tuning/ProfilingAutoTunerSuiteV2.scala | 2 +- 11 files changed, 62 insertions(+), 80 deletions(-) diff --git a/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml b/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml index d6ebb0765..4d324ed23 100644 --- a/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml +++ b/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml @@ -18,7 +18,6 @@ # / # ├── application_information.csv # ├── application_log_path_mapping.csv -# ├── application_tuning_metrics.csv # ├── cluster_information.json # ├── data_source_information.csv # ├── executor_information.csv @@ -41,6 +40,7 @@ # ├── stage_level_aggregated_task_metrics.csv # ├── stage_level_all_metrics.csv # ├── system_properties.csv +# ├── tuning_signals.csv # ├── unsupported_sql_plan.csv # ├── wholestagecodegen_mapping.csv # ├── write_operations.csv @@ -1427,30 +1427,22 @@ reportDefinitions: fileName: profile.log fileFormat: TXT scope: per-app - # AppTuningMetricsProfileResult - - label: coreRawAppTuningMetricsCSV + # TuningSignalProfileResult + - label: coreRawTuningSignalsCSV description: >- - Per-app metrics used by the AutoTuner for partition sizing and cluster sizing. - fileName: application_tuning_metrics.csv + Per-app diagnostic signals used as AutoTuner inputs (partition sizing, cluster + sizing). Vertical key-value layout; each emitted metric is one row. Current + rows: maxTaskInputBytesRead, maxColumnarExchangeDataSizeBytes (profiling only), + scanStagesWithGpuOom (profiling only), gpuShuffleStagesWithContainerOom (profiling only). + fileName: tuning_signals.csv scope: per-app columns: - - name: appId - dataType: String - description: >- - Application ID - - name: maxTaskInputBytesRead - dataType: Long - description: >- - Maximum per-task input bytes read across all SQLs - - name: maxColumnarExchangeDataSizeBytes - dataType: Long - description: >- - Maximum data size from ColumnarExchange metrics (profiling only) - - name: scanStagesWithGpuOom + - name: metricName dataType: String description: >- - Comma-separated stage IDs of scan stages with GPU OOM (profiling only) - - name: shuffleStagesWithOom + Name of the tuning signal + - name: value dataType: String description: >- - Comma-separated stage IDs of shuffle stages with OOM (profiling only) + Signal value. Numeric signals are stored as their integer string form; + stage-ID signals are comma-separated lists (empty string when none). diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala index 5965e28c0..eba1ff24b 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala @@ -55,7 +55,7 @@ case class ApplicationSummaryInfo( sparkRapidsBuildInfo: Seq[SparkRapidsBuildInfoEvent], writeOpsInfo: Seq[WriteOpProfileResult], sqlPlanInfo: Seq[SQLPlanInfoProfileResult], - appTuningMetrics: Seq[AppTuningMetricsProfileResult] = Seq.empty) + tuningSignals: Seq[TuningSignalProfileResult] = Seq.empty) trait AppInfoPropertyGetter { // returns all the properties (i.e., spark) @@ -92,7 +92,7 @@ trait AppInfoReadMetrics { trait AppInfoGpuOomCheck { def scanStagesWithGpuOom: Set[Long] = Set.empty - def shuffleStagesWithOom: Set[Long] = Set.empty + def gpuShuffleStagesWithContainerOom: Set[Long] = Set.empty } trait AppInfoColumnarExchangeMetrics { @@ -236,8 +236,8 @@ class SingleAppSummaryInfoProvider( app.failedTasks, app.stageMetrics, appInfo) } - override def shuffleStagesWithOom: Set[Long] = { - SingleAppSummaryInfoProvider.computeShuffleStagesWithOom( + override def gpuShuffleStagesWithContainerOom: Set[Long] = { + SingleAppSummaryInfoProvider.computeShuffleStagesWithContainerOom( app.appInfo.exists(_.pluginEnabled), getSparkProperty("spark.master"), app.failedStages, app.failedTasks) @@ -293,7 +293,7 @@ object SingleAppSummaryInfoProvider { * Detects ExecutorLostFailure with exit code 137 (SIGKILL from container memory enforcement). * See: https://github.com/NVIDIA/spark-rapids-tools/issues/1566 */ - def computeShuffleStagesWithOom( + def computeShuffleStagesWithContainerOom( pluginEnabled: Boolean, sparkMasterStr: Option[String], failedStages: Seq[FailedStagesProfileResults], diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala index 6bfe64554..4af4676f0 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala @@ -516,48 +516,38 @@ case class AppInfoProfileResults( } } -case class AppTuningMetricsProfileResult( - appId: String, - maxTaskInputBytesRead: String, - maxColumnarExchangeDataSizeBytes: String, - scanStagesWithGpuOom: String, - shuffleStagesWithOom: String) extends ProfileResult { +case class TuningSignalProfileResult( + metricName: String, + value: String) extends ProfileResult { override def outputHeaders: Array[String] = { - OutHeaderRegistry.outputHeaders("AppTuningMetricsProfileResult") + OutHeaderRegistry.outputHeaders("TuningSignalProfileResult") } - override def convertToSeq(): Array[String] = { - Array(appId, maxTaskInputBytesRead, maxColumnarExchangeDataSizeBytes, - scanStagesWithGpuOom, shuffleStagesWithOom) - } + override def convertToSeq(): Array[String] = Array(metricName, value) - override def convertToCSVSeq(): Array[String] = { - Array(StringUtils.reformatCSVString(appId), - maxTaskInputBytesRead, - maxColumnarExchangeDataSizeBytes, - StringUtils.reformatCSVString(scanStagesWithGpuOom), - StringUtils.reformatCSVString(shuffleStagesWithOom)) - } + override def convertToCSVSeq(): Array[String] = Array( + StringUtils.reformatCSVString(metricName), + StringUtils.reformatCSVString(value)) } -object AppTuningMetricsProfileResult { +object TuningSignalProfileResult { private def stageIdsToStr(stageIds: Set[Long]): String = { if (stageIds.isEmpty) "" else stageIds.toSeq.sorted.mkString(",") } - def apply( - appId: String, + def build( maxTaskInputBytesRead: Long, maxColumnarExchangeDataSizeBytes: Long, scanStagesWithGpuOom: Set[Long], - shuffleStagesWithOom: Set[Long]): AppTuningMetricsProfileResult = { - AppTuningMetricsProfileResult( - appId = appId, - maxTaskInputBytesRead = maxTaskInputBytesRead.toString, - maxColumnarExchangeDataSizeBytes = maxColumnarExchangeDataSizeBytes.toString, - scanStagesWithGpuOom = stageIdsToStr(scanStagesWithGpuOom), - shuffleStagesWithOom = stageIdsToStr(shuffleStagesWithOom)) - } + gpuShuffleStagesWithContainerOom: Set[Long]): Seq[TuningSignalProfileResult] = Seq( + TuningSignalProfileResult("maxTaskInputBytesRead", + maxTaskInputBytesRead.toString), + TuningSignalProfileResult("maxColumnarExchangeDataSizeBytes", + maxColumnarExchangeDataSizeBytes.toString), + TuningSignalProfileResult("scanStagesWithGpuOom", + stageIdsToStr(scanStagesWithGpuOom)), + TuningSignalProfileResult("gpuShuffleStagesWithContainerOom", + stageIdsToStr(gpuShuffleStagesWithContainerOom))) } case class AppLogPathProfileResults( diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala index 734cb2393..1863cd60b 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala @@ -314,15 +314,16 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea SingleAppSummaryInfoProvider.computeMaxColumnarExchangeDataSizeBytes(sqlMetrics) val scanOomStages = SingleAppSummaryInfoProvider.computeScanStagesWithGpuOom( pluginEnabled, failedTasks, stageMetrics, singleApp) - val shuffleOomStages = SingleAppSummaryInfoProvider.computeShuffleStagesWithOom( - pluginEnabled, singleApp.sparkProperties.get("spark.master"), - failedStages, failedTasks) + val gpuShuffleContainerOomStages = + SingleAppSummaryInfoProvider.computeShuffleStagesWithContainerOom( + pluginEnabled, singleApp.sparkProperties.get("spark.master"), + failedStages, failedTasks) val appInfo = collect.getAppInfo val appId = appInfo.headOption.flatMap(_.appId).getOrElse("") - val tuningMetrics = Seq(AppTuningMetricsProfileResult( - appId, maxTaskInput, maxColumnarExchange.getOrElse(0L), - scanOomStages, shuffleOomStages)) + val tuningSignals = TuningSignalProfileResult.build( + maxTaskInput, maxColumnarExchange.getOrElse(0L), + scanOomStages, gpuShuffleContainerOomStages) logDebug(s"Time to collect Profiling Info [$appId]: ${endTime - startTime}.") val appInfoSummary = ApplicationSummaryInfo( @@ -356,7 +357,7 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea sparkRapidsBuildInfo = collect.getSparkRapidsInfo, writeOpsInfo = collect.getWriteOperationInfo, sqlPlanInfo = collect.getSQLPlanInfoTruncated, - appTuningMetrics = tuningMetrics) + tuningSignals = tuningSignals) (appInfoSummary, DiagnosticSummaryInfo(analysis.stageDiagnostics, collect.getIODiagnosticMetrics)) } @@ -429,7 +430,7 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea // writeOps are generated in only CSV format profileOutputWriter.writeCSVTable(ProfWriteOpsView.getLabel, app.writeOpsInfo) profileOutputWriter.writeCSVTable(TASK_SHUFFLE_SKEW, app.skewInfo) - profileOutputWriter.writeCSVTable(APPLICATION_TUNING_METRICS, app.appTuningMetrics) + profileOutputWriter.writeCSVTable(TUNING_SIGNALS, app.tuningSignals) profileOutputWriter.writeText("\n### C. Health Check###\n") profileOutputWriter.writeCSVTable(ProfFailedTaskView.getLabel, app.failedTasks) profileOutputWriter.writeTable(ProfFailedStageView.getLabel, app.failedStages) diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/tuning/AutoTuner.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/tuning/AutoTuner.scala index 3996cfd34..9cf961620 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/tuning/AutoTuner.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/tuning/AutoTuner.scala @@ -2203,7 +2203,7 @@ class ProfilingAutoTuner( */ override def recommendShufflePartitionsInternal(): Int = { val calculatedValue = super.recommendShufflePartitionsInternal() - if (appInfoProvider.shuffleStagesWithOom.nonEmpty) { + if (appInfoProvider.gpuShuffleStagesWithContainerOom.nonEmpty) { // Shuffle Stages with Task OOM detected. We may want to increase shuffle partitions. val recShufflePartitions = shufflePartitionValue * configProvider.getEntry("SHUFFLE_PARTITION_MULTIPLIER").getDefault.toInt diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala index 9e7c48408..560c8731e 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala @@ -70,9 +70,8 @@ object OutHeaderRegistry { "AppInfoProfileResults" -> Array("appName", "appId", "attemptId", "sparkUser", "startTime", "endTime", "duration", "durationStr", "sparkRuntime", "sparkVersion", "pluginEnabled", "totalCoreSeconds"), - "AppTuningMetricsProfileResult" -> - Array("appId", "maxTaskInputBytesRead", "maxColumnarExchangeDataSizeBytes", - "scanStagesWithGpuOom", "shuffleStagesWithOom"), + "TuningSignalProfileResult" -> + Array("metricName", "value"), "AppLogPathProfileResults" -> Array("appName", "appId", "eventLogPath"), "FailedTaskProfileResults" -> diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala index ac0cb2afa..c6f9a0cc5 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala @@ -17,7 +17,7 @@ package com.nvidia.spark.rapids.tool.views import com.nvidia.spark.rapids.tool.analysis.{AggRawMetricsResult, AppSQLPlanAnalyzer, QualSparkMetricsAggregator} -import com.nvidia.spark.rapids.tool.profiling.{AppTuningMetricsProfileResult, DataSourceProfileResult, ProfileOutputWriter, ProfileResult, SQLAccumProfileResults} +import com.nvidia.spark.rapids.tool.profiling.{DataSourceProfileResult, ProfileOutputWriter, ProfileResult, SQLAccumProfileResults, TuningSignalProfileResult} import org.apache.spark.internal.Logging import org.apache.spark.sql.rapids.tool.qualification.QualificationAppInfo @@ -106,10 +106,10 @@ object QualRawReportGenerator extends Logging { constructLabelsMaps(aggRawMetrics).foreach { case (label, metrics) => pWriter.writeCSVTable(label, metrics) } - // Write tuning metrics (GPU-only fields default to 0/empty for qualification) - val tuningMetrics = Seq(AppTuningMetricsProfileResult( - app.appId, maxTaskInput, 0L, Set.empty[Long], Set.empty[Long])) - pWriter.writeCSVTable(APPLICATION_TUNING_METRICS, tuningMetrics) + // GPU-only signals default to 0/empty for qualification (CPU event logs) + val tuningSignals = TuningSignalProfileResult.build( + maxTaskInput, 0L, Set.empty[Long], Set.empty[Long]) + pWriter.writeCSVTable(TUNING_SIGNALS, tuningSignals) pWriter.writeText("\n### C. Health Check###\n") pWriter.writeCSVTable(QualFailedTaskView.getLabel, QualFailedTaskView.getRawView(Seq(app))) pWriter.writeTable( diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/package.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/package.scala index 33a812eac..972bc6fd5 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/package.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/package.scala @@ -29,7 +29,7 @@ package object views { val SQL_DUR_LABEL = "SQL Duration and Executor CPU Time Percent" val SQL_MAX_INPUT_SIZE = "SQL Max Task Input Size" val STAGE_DIAGNOSTICS_LABEL = "Stage Level Diagnostic Metrics" - val APPLICATION_TUNING_METRICS = "Application Tuning Metrics" + val TUNING_SIGNALS = "Tuning Signals" val CLUSTER_INFORMATION_LABEL = "Cluster Information" val AGG_DESCRIPTION = Map( diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/tuning/BaseAutoTunerSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/tuning/BaseAutoTunerSuite.scala index ceffc08c7..d76ca0b38 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/tuning/BaseAutoTunerSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/tuning/BaseAutoTunerSuite.scala @@ -50,7 +50,7 @@ class AppInfoProviderMockTest(val maxInput: Double, val shuffleStagesWithPosSpilling: Set[Long], val shuffleSkewStages: Set[Long], val scanStagesWithGpuOomSet: Set[Long], - val shuffleStagesWithOomSet: Set[Long], + val gpuShuffleStagesWithContainerOomSet: Set[Long], val maxColumnarExchangeDataSizeBytes: Option[Long] = None) extends BaseProfilingAppSummaryInfoProvider { override def isAppInfoAvailable = true @@ -70,7 +70,7 @@ class AppInfoProviderMockTest(val maxInput: Double, override def getShuffleStagesWithPosSpilling: Set[Long] = shuffleStagesWithPosSpilling override def getShuffleSkewStages: Set[Long] = shuffleSkewStages override def scanStagesWithGpuOom: Set[Long] = scanStagesWithGpuOomSet - override def shuffleStagesWithOom: Set[Long] = shuffleStagesWithOomSet + override def gpuShuffleStagesWithContainerOom: Set[Long] = gpuShuffleStagesWithContainerOomSet override def getMaxColumnarExchangeDataSizeBytes: Option[Long] = maxColumnarExchangeDataSizeBytes /** @@ -136,12 +136,12 @@ abstract class BaseAutoTunerSuite extends AnyFunSuite with BeforeAndAfterEach shuffleStagesWithPosSpilling: Set[Long] = Set(), shuffleSkewStages: Set[Long] = Set(), scanStagesWithGpuOom: Set[Long] = Set(), - shuffleStagesWithOom: Set[Long] = Set(), + gpuShuffleStagesWithContainerOom: Set[Long] = Set(), maxColumnarExchangeDataSizeBytes: Option[Long] = None): AppInfoProviderMockTest = { new AppInfoProviderMockTest(maxInput, spilledMetrics, jvmGCFractions, propsFromLog, sparkVersion, rapidsJars, distinctLocationPct, redundantReadSize, meanInput, meanShuffleRead, shuffleStagesWithPosSpilling, shuffleSkewStages, scanStagesWithGpuOom, - shuffleStagesWithOom, maxColumnarExchangeDataSizeBytes) + gpuShuffleStagesWithContainerOom, maxColumnarExchangeDataSizeBytes) } /** diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/tuning/ProfilingAutoTunerSuite.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/tuning/ProfilingAutoTunerSuite.scala index 19ccb32cf..24583ea60 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/tuning/ProfilingAutoTunerSuite.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/tuning/ProfilingAutoTunerSuite.scala @@ -3292,7 +3292,7 @@ class ProfilingAutoTunerSuite extends ProfilingAutoTunerSuiteBase { "spark.plugins" -> "com.nvidia.spark.SQLPlugin", "spark.rapids.sql.concurrentGpuTasks" -> "4") val infoProvider = getMockInfoProvider(0, Seq(0), Seq(0.0), - logEventsProps, Some(testSparkVersion), shuffleStagesWithOom = Set(1L)) + logEventsProps, Some(testSparkVersion), gpuShuffleStagesWithContainerOom = Set(1L)) val platform = PlatformFactory.createInstance(PlatformNames.DATAPROC) // Configure cluster info using Platform's existing method @@ -3380,7 +3380,7 @@ class ProfilingAutoTunerSuite extends ProfilingAutoTunerSuiteBase { "spark.plugins" -> "com.nvidia.spark.SQLPlugin", "spark.rapids.sql.concurrentGpuTasks" -> "4") val infoProvider = getMockInfoProvider(0, Seq(0), Seq(0.0), - logEventsProps, Some(testSparkVersion), shuffleStagesWithOom = Set(1L), + logEventsProps, Some(testSparkVersion), gpuShuffleStagesWithContainerOom = Set(1L), meanInput = 50000, meanShuffleRead = 80000) val platform = PlatformFactory.createInstance(PlatformNames.DATAPROC) diff --git a/core/src/test/scala/com/nvidia/spark/rapids/tool/tuning/ProfilingAutoTunerSuiteV2.scala b/core/src/test/scala/com/nvidia/spark/rapids/tool/tuning/ProfilingAutoTunerSuiteV2.scala index 03205bc24..612ad4a62 100644 --- a/core/src/test/scala/com/nvidia/spark/rapids/tool/tuning/ProfilingAutoTunerSuiteV2.scala +++ b/core/src/test/scala/com/nvidia/spark/rapids/tool/tuning/ProfilingAutoTunerSuiteV2.scala @@ -1693,7 +1693,7 @@ class ProfilingAutoTunerSuiteV2 extends ProfilingAutoTunerSuiteBase { shuffleStagesWithPosSpilling = Set.empty, shuffleSkewStages = Set.empty, scanStagesWithGpuOom = Set.empty, - shuffleStagesWithOom = Set.empty, + gpuShuffleStagesWithContainerOom = Set.empty, maxColumnarExchangeDataSizeBytes = Some(1000L * 1024 * 1024 * 1024) // 1000GB ) val platform = PlatformFactory.createInstance(PlatformNames.DATAPROC, Some(targetClusterInfo)) From 59b1167b9067c154159714cf2440bae8eef99528 Mon Sep 17 00:00:00 2001 From: Partho Sarthi Date: Wed, 22 Apr 2026 11:18:38 -0700 Subject: [PATCH 11/16] Rename tuning_signals column metricName -> name Shorter header matches the spark_properties.csv (propertyName, propertyValue) pattern but with 'name' since the containing CSV already conveys the 'metric/signal' context via its filename. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Partho Sarthi --- .../resources/configs/reports/coreRawMetricsReport.yaml | 2 +- .../spark/rapids/tool/profiling/ProfileClassWarehouse.scala | 6 +++--- .../nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml b/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml index 4d324ed23..6cfb7d023 100644 --- a/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml +++ b/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml @@ -1437,7 +1437,7 @@ reportDefinitions: fileName: tuning_signals.csv scope: per-app columns: - - name: metricName + - name: name dataType: String description: >- Name of the tuning signal diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala index 4af4676f0..d3a382a7c 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala @@ -517,16 +517,16 @@ case class AppInfoProfileResults( } case class TuningSignalProfileResult( - metricName: String, + name: String, value: String) extends ProfileResult { override def outputHeaders: Array[String] = { OutHeaderRegistry.outputHeaders("TuningSignalProfileResult") } - override def convertToSeq(): Array[String] = Array(metricName, value) + override def convertToSeq(): Array[String] = Array(name, value) override def convertToCSVSeq(): Array[String] = Array( - StringUtils.reformatCSVString(metricName), + StringUtils.reformatCSVString(name), StringUtils.reformatCSVString(value)) } diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala index 560c8731e..297665444 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala @@ -71,7 +71,7 @@ object OutHeaderRegistry { Array("appName", "appId", "attemptId", "sparkUser", "startTime", "endTime", "duration", "durationStr", "sparkRuntime", "sparkVersion", "pluginEnabled", "totalCoreSeconds"), "TuningSignalProfileResult" -> - Array("metricName", "value"), + Array("name", "value"), "AppLogPathProfileResults" -> Array("appName", "appId", "eventLogPath"), "FailedTaskProfileResults" -> From 8ddd520f68a25b0c252ad44aad252bfd3b34e527 Mon Sep 17 00:00:00 2001 From: Partho Sarthi Date: Wed, 22 Apr 2026 16:09:21 -0700 Subject: [PATCH 12/16] Relocate size signals to aggregated CSVs; keep OOM-only tuning_signals.csv MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add input_bytesRead_max column to stage/SQL/job aggregated task metrics CSVs alongside existing _max columns (duration_max, peakExecutionMemory_max, resultSize_max). Tracked in TaskMetricsAccumRec and flows through StageAggAccum / SQLAggAccum / JobAggAccum to the three result case classes. - Drop maxTaskInputBytesRead and maxColumnarExchangeDataSizeBytes rows from tuning_signals.csv. The file now contains only the two GPU OOM signals. - Remove SQLMaxTaskInputSizes case class, maxTaskInputSizeBytesPerSQL() helper, and the maxTaskInputSizes field from AggRawMetricsResult and ApplicationSummaryInfo — consumers (getMaxInput on both providers) now derive the value from sqlTaskAggMetrics/sqlAggs via inputBytesReadMax. - AutoTuner's AQE ColumnarExchange adjustment is unchanged — it reads from in-memory sqlMetrics via SingleAppSummaryInfoProvider, not from CSV. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Partho Sarthi --- .../configs/reports/coreRawMetricsReport.yaml | 18 +++++++++--- .../tool/analysis/AggRawMetricsResult.scala | 4 +-- .../analysis/AppSparkMetricsAggTrait.scala | 2 -- .../analysis/AppSparkMetricsAnalyzer.scala | 29 ++----------------- .../analysis/util/TaskMetricsAccumRec.scala | 4 +++ .../profiling/ApplicationSummaryInfo.scala | 5 ++-- .../profiling/ProfileClassWarehouse.scala | 20 +++++-------- .../rapids/tool/profiling/Profiler.scala | 9 +----- .../tuning/QualAppSummaryInfoProvider.scala | 4 +-- .../rapids/tool/views/OutHeaderRegistry.scala | 3 ++ .../tool/views/QualRawReportGenerator.scala | 9 ++---- .../rapids/tool/views/RawMetricProfView.scala | 4 +-- 12 files changed, 41 insertions(+), 70 deletions(-) diff --git a/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml b/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml index 6cfb7d023..4fadf8a3f 100644 --- a/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml +++ b/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml @@ -373,6 +373,10 @@ reportDefinitions: dataType: Long description: >- TBD + - name: input_bytesRead_max + dataType: Long + description: >- + Maximum per-task input bytes read within the aggregation unit - name: input_recordsRead_sum dataType: Long description: >- @@ -570,6 +574,10 @@ reportDefinitions: dataType: Long description: >- TBD + - name: input_bytesRead_max + dataType: Long + description: >- + Maximum per-task input bytes read within the aggregation unit - name: input_recordsRead_sum dataType: Long description: >- @@ -783,6 +791,10 @@ reportDefinitions: dataType: Long description: >- TBD + - name: input_bytesRead_max + dataType: Long + description: >- + Maximum per-task input bytes read within the aggregation unit - name: input_recordsRead_sum dataType: Long description: >- @@ -1430,10 +1442,8 @@ reportDefinitions: # TuningSignalProfileResult - label: coreRawTuningSignalsCSV description: >- - Per-app diagnostic signals used as AutoTuner inputs (partition sizing, cluster - sizing). Vertical key-value layout; each emitted metric is one row. Current - rows: maxTaskInputBytesRead, maxColumnarExchangeDataSizeBytes (profiling only), - scanStagesWithGpuOom (profiling only), gpuShuffleStagesWithContainerOom (profiling only). + Per-app GPU-OOM diagnostic signals used as AutoTuner inputs. Vertical key-value + layout; each emitted metric is one row. fileName: tuning_signals.csv scope: per-app columns: diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AggRawMetricsResult.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AggRawMetricsResult.scala index 2584d702f..8a5f2d7ac 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AggRawMetricsResult.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AggRawMetricsResult.scala @@ -16,7 +16,7 @@ package com.nvidia.spark.rapids.tool.analysis -import com.nvidia.spark.rapids.tool.profiling.{IOAnalysisProfileResult, JobAggTaskMetricsProfileResult, ShuffleSkewProfileResult, SQLDurationExecutorTimeProfileResult, SQLMaxTaskInputSizes, SQLTaskAggMetricsProfileResult, StageAggTaskMetricsProfileResult, StageDiagnosticResult} +import com.nvidia.spark.rapids.tool.profiling.{IOAnalysisProfileResult, JobAggTaskMetricsProfileResult, ShuffleSkewProfileResult, SQLDurationExecutorTimeProfileResult, SQLTaskAggMetricsProfileResult, StageAggTaskMetricsProfileResult, StageDiagnosticResult} /** * The result of the aggregation of the raw metrics. It contains the aggregated metrics for an @@ -31,7 +31,6 @@ import com.nvidia.spark.rapids.tool.profiling.{IOAnalysisProfileResult, JobAggTa * @param sqlAggs the aggregated Spark metrics for SQLs * @param ioAggs lists the SQLs along their IO metrics * @param sqlDurAggs the aggregated duration and CPU time for SQLs - * @param maxTaskInputSizes a sequence of SQLMaxTaskInputSizes that contains the maximum input size * @param stageDiagnostics the stage level Spark metrics for diagnostic purposes */ case class AggRawMetricsResult( @@ -41,5 +40,4 @@ case class AggRawMetricsResult( sqlAggs: Seq[SQLTaskAggMetricsProfileResult], ioAggs: Seq[IOAnalysisProfileResult], sqlDurAggs: Seq[SQLDurationExecutorTimeProfileResult], - maxTaskInputSizes: Seq[SQLMaxTaskInputSizes], stageDiagnostics: Seq[StageDiagnosticResult]) diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AppSparkMetricsAggTrait.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AppSparkMetricsAggTrait.scala index cec8cc9b8..a8f39a168 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AppSparkMetricsAggTrait.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AppSparkMetricsAggTrait.scala @@ -45,7 +45,6 @@ trait AppSparkMetricsAggTrait extends AppIndexMapperTrait { sqlMetricsAgg, analysisObj.aggregateIOMetricsBySql(sqlMetricsAgg), analysisObj.aggregateDurationAndCPUTimeBySql(index), - Seq(analysisObj.maxTaskInputSizeBytesPerSQL(index)), analysisObj.aggregateDiagnosticMetricsByStage(index, sqlAnalyzer)) } @@ -67,7 +66,6 @@ trait AppSparkMetricsAggTrait extends AppIndexMapperTrait { agg1.sqlAggs ++ agg2.sqlAggs, agg1.ioAggs ++ agg2.ioAggs, agg1.sqlDurAggs ++ agg2.sqlDurAggs, - agg1.maxTaskInputSizes ++ agg2.maxTaskInputSizes, agg1.stageDiagnostics ++ agg2.stageDiagnostics) } } diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AppSparkMetricsAnalyzer.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AppSparkMetricsAnalyzer.scala index ee7332cb6..1a4c3f12e 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AppSparkMetricsAnalyzer.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AppSparkMetricsAnalyzer.scala @@ -106,6 +106,7 @@ class AppSparkMetricsAnalyzer(app: AppBase) extends AppAnalysisBase(app) { perJobRec.executorDeserializeTimeSum, perJobRec.executorRunTimeSum, perJobRec.inputBytesReadSum, + perJobRec.inputBytesReadMax, perJobRec.inputRecordsReadSum, perJobRec.jvmGCTimeSum, perJobRec.memoryBytesSpilledSum, @@ -203,6 +204,7 @@ class AppSparkMetricsAnalyzer(app: AppBase) extends AppAnalysisBase(app) { preSqlRec.executorDeserializeTimeSum, preSqlRec.executorRunTimeSum, preSqlRec.inputBytesReadSum, + preSqlRec.inputBytesReadMax, preSqlRec.inputBytesReadAvg, preSqlRec.inputRecordsReadSum, preSqlRec.jvmGCTimeSum, @@ -252,32 +254,6 @@ class AppSparkMetricsAnalyzer(app: AppBase) extends AppAnalysisBase(app) { }.toSeq } - /** - * Find the maximum task input size - * @param index App index (used by the profiler tool) - * @return a single SQLMaxTaskInputSizes record that contains the maximum value. If none, it will - * be 0L - */ - def maxTaskInputSizeBytesPerSQL(index: Int): SQLMaxTaskInputSizes = { - // TODO: We should keep maxInputSize as a field in the stageAggregate to avoid doing an - // extra path on the tasks - val maxOfSqls = app.sqlIdToStages.map { case (_, stageIds) => - // TODO: Should we only consider successful tasks? - val tasksInSQL = app.taskManager.getTasksByStageIds(stageIds) - if (tasksInSQL.isEmpty) { - 0L - } else { - tasksInSQL.map(_.input_bytesRead).max - } - } - val maxVal = if (maxOfSqls.nonEmpty) { - maxOfSqls.max - } else { - 0L - } - SQLMaxTaskInputSizes(app.appId, maxVal) - } - /** * Aggregates the duration and CPU time (milliseconds) by SQL * @param index App index (used by the profiler tool) @@ -398,6 +374,7 @@ class AppSparkMetricsAnalyzer(app: AppBase) extends AppAnalysisBase(app) { perStageRec.executorDeserializeTimeSum, perStageRec.executorRunTimeSum, perStageRec.inputBytesReadSum, + perStageRec.inputBytesReadMax, perStageRec.inputRecordsReadSum, perStageRec.jvmGCTimeSum, perStageRec.memoryBytesSpilledSum, diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/util/TaskMetricsAccumRec.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/util/TaskMetricsAccumRec.scala index b5d98b9ac..79174e3b0 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/util/TaskMetricsAccumRec.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/util/TaskMetricsAccumRec.scala @@ -38,6 +38,7 @@ class TaskMetricsAccumRec { var executorDeserializeTimeSum: Long = 0 var executorRunTimeSum: Long = 0 var inputBytesReadSum: Long = 0 + var inputBytesReadMax: Long = Long.MinValue var inputRecordsReadSum: Long = 0 var jvmGCTimeSum: Long = 0 var memoryBytesSpilledSum: Long = 0 @@ -70,6 +71,7 @@ class TaskMetricsAccumRec { def resetFields(): Unit = { durationMax = 0 durationMin = 0 + inputBytesReadMax = 0 peakExecutionMemoryMax = 0 resultSizeMax = 0 } @@ -102,6 +104,7 @@ class TaskMetricsAccumRec { swWriteTimeSum += rec.sw_writeTime // Max fields durationMax = math.max(durationMax, rec.duration) + inputBytesReadMax = math.max(inputBytesReadMax, rec.input_bytesRead) peakExecutionMemoryMax = math.max(peakExecutionMemoryMax, rec.peakExecutionMemory) resultSizeMax = math.max(resultSizeMax, rec.resultSize) // Min Fields @@ -136,6 +139,7 @@ class TaskMetricsAccumRec { swWriteTimeSum += rec.swWriteTimeSum // Max durationMax = math.max(durationMax, rec.durationMax) + inputBytesReadMax = math.max(inputBytesReadMax, rec.inputBytesReadMax) peakExecutionMemoryMax = math.max(peakExecutionMemoryMax, rec.peakExecutionMemoryMax) resultSizeMax = math.max(resultSizeMax, rec.resultSizeMax) // Min diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala index eba1ff24b..c8db4cced 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala @@ -47,7 +47,6 @@ case class ApplicationSummaryInfo( sparkProps: Seq[RapidsPropertyProfileResult], sqlStageInfo: Seq[SQLStageInfoProfileResult], wholeStage: Seq[WholeStageCodeGenResults], - maxTaskInputBytesRead: Seq[SQLMaxTaskInputSizes], appLogPath: Seq[AppLogPathProfileResults], ioMetrics: Seq[IOAnalysisProfileResult], sysProps: Seq[RapidsPropertyProfileResult], @@ -187,8 +186,8 @@ class SingleAppSummaryInfoProvider( } override def getMaxInput: Double = { - if (app.maxTaskInputBytesRead.nonEmpty) { - app.maxTaskInputBytesRead.head.maxTaskInputBytesRead + if (app.sqlTaskAggMetrics.nonEmpty) { + app.sqlTaskAggMetrics.map(_.inputBytesReadMax).max.toDouble } else { 0.0 } diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala index d3a382a7c..bd4f74677 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala @@ -536,14 +536,8 @@ object TuningSignalProfileResult { } def build( - maxTaskInputBytesRead: Long, - maxColumnarExchangeDataSizeBytes: Long, scanStagesWithGpuOom: Set[Long], gpuShuffleStagesWithContainerOom: Set[Long]): Seq[TuningSignalProfileResult] = Seq( - TuningSignalProfileResult("maxTaskInputBytesRead", - maxTaskInputBytesRead.toString), - TuningSignalProfileResult("maxColumnarExchangeDataSizeBytes", - maxColumnarExchangeDataSizeBytes.toString), TuningSignalProfileResult("scanStagesWithGpuOom", stageIdsToStr(scanStagesWithGpuOom)), TuningSignalProfileResult("gpuShuffleStagesWithContainerOom", @@ -785,6 +779,7 @@ trait BaseJobStageAggTaskMetricsProfileResult extends ProfileResult { def executorDeserializeTimeSum: Long def executorRunTimeSum: Long def inputBytesReadSum: Long + def inputBytesReadMax: Long def inputRecordsReadSum: Long def jvmGCTimeSum: Long def memoryBytesSpilledSum: Long @@ -825,6 +820,7 @@ trait BaseJobStageAggTaskMetricsProfileResult extends ProfileResult { executorDeserializeTimeSum.toString, executorRunTimeSum.toString, inputBytesReadSum.toString, + inputBytesReadMax.toString, inputRecordsReadSum.toString, jvmGCTimeSum.toString, memoryBytesSpilledSum.toString, @@ -862,6 +858,7 @@ case class JobAggTaskMetricsProfileResult( executorDeserializeTimeSum: Long, executorRunTimeSum: Long, inputBytesReadSum: Long, + inputBytesReadMax: Long, inputRecordsReadSum: Long, jvmGCTimeSum: Long, memoryBytesSpilledSum: Long, @@ -902,6 +899,7 @@ case class StageAggTaskMetricsProfileResult( executorDeserializeTimeSum: Long, executorRunTimeSum: Long, inputBytesReadSum: Long, + inputBytesReadMax: Long, inputRecordsReadSum: Long, jvmGCTimeSum: Long, memoryBytesSpilledSum: Long, @@ -956,6 +954,7 @@ case class StageAggTaskMetricsProfileResult( other.executorDeserializeTimeSum, executorRunTimeSum = this.executorRunTimeSum + other.executorRunTimeSum, inputBytesReadSum = this.inputBytesReadSum + other.inputBytesReadSum, + inputBytesReadMax = Math.max(this.inputBytesReadMax, other.inputBytesReadMax), inputRecordsReadSum = this.inputRecordsReadSum + other.inputRecordsReadSum, jvmGCTimeSum = this.jvmGCTimeSum + other.jvmGCTimeSum, memoryBytesSpilledSum = this.memoryBytesSpilledSum + other.memoryBytesSpilledSum, @@ -1118,12 +1117,6 @@ case class StageDiagnosticResult( } } -case class SQLMaxTaskInputSizes( - appId: String, - // Not added to the output since it is used only by the AutoTuner - maxTaskInputBytesRead: Double -) - case class SQLTaskAggMetricsProfileResult( appId: String, sqlId: Long, @@ -1141,6 +1134,7 @@ case class SQLTaskAggMetricsProfileResult( executorDeserializeTimeSum: Long, executorRunTimeSum: Long, inputBytesReadSum: Long, + inputBytesReadMax: Long, // Not added to the output since it is used only by the AutoTuner inputBytesReadAvg: Double, inputRecordsReadSum: Long, @@ -1189,6 +1183,7 @@ case class SQLTaskAggMetricsProfileResult( executorDeserializeTimeSum.toString, executorRunTimeSum.toString, inputBytesReadSum.toString, + inputBytesReadMax.toString, inputRecordsReadSum.toString, jvmGCTimeSum.toString, memoryBytesSpilledSum.toString, @@ -1225,6 +1220,7 @@ case class SQLTaskAggMetricsProfileResult( executorDeserializeTimeSum.toString, executorRunTimeSum.toString, inputBytesReadSum.toString, + inputBytesReadMax.toString, inputRecordsReadSum.toString, jvmGCTimeSum.toString, memoryBytesSpilledSum.toString, diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala index 1863cd60b..915232f45 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala @@ -293,7 +293,6 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea } } val analysis = RawMetricProfilerView.getAggMetrics(analyzedApps) - val maxTaskInputInfo = analysis.maxTaskInputSizes val sqlIdAlign = if (outputAlignedSQLIds) { collect.getSQLCleanAndAligned } else { @@ -305,13 +304,9 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea val failedTasks = healthCheck.getFailedTasks val failedStages = healthCheck.getFailedStages - // Compute AutoTuner inputs for application_tuning_metrics.csv + // Compute GPU OOM signals for tuning_signals.csv val singleApp = analyzedApps.head val pluginEnabled = singleApp.gpuMode - val maxTaskInput = analysis.maxTaskInputSizes.headOption - .map(_.maxTaskInputBytesRead.toLong).getOrElse(0L) - val maxColumnarExchange = - SingleAppSummaryInfoProvider.computeMaxColumnarExchangeDataSizeBytes(sqlMetrics) val scanOomStages = SingleAppSummaryInfoProvider.computeScanStagesWithGpuOom( pluginEnabled, failedTasks, stageMetrics, singleApp) val gpuShuffleContainerOomStages = @@ -322,7 +317,6 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea val appInfo = collect.getAppInfo val appId = appInfo.headOption.flatMap(_.appId).getOrElse("") val tuningSignals = TuningSignalProfileResult.build( - maxTaskInput, maxColumnarExchange.getOrElse(0L), scanOomStages, gpuShuffleContainerOomStages) logDebug(s"Time to collect Profiling Info [$appId]: ${endTime - startTime}.") @@ -349,7 +343,6 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea sparkProps = collect.getSparkProperties, sqlStageInfo = collect.getSQLToStage, wholeStage = collect.getWholeStageCodeGenMapping, - maxTaskInputBytesRead = maxTaskInputInfo, appLogPath = collect.getAppLogPath, ioMetrics = analysis.ioAggs, sysProps = collect.getSystemProperties, diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/tuning/QualAppSummaryInfoProvider.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/tuning/QualAppSummaryInfoProvider.scala index 8e681e70e..d6d478167 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/tuning/QualAppSummaryInfoProvider.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/tuning/QualAppSummaryInfoProvider.scala @@ -98,8 +98,8 @@ class QualAppSummaryInfoProvider( } override def getMaxInput: Double = { - if (rawAggMetrics.maxTaskInputSizes.nonEmpty) { - rawAggMetrics.maxTaskInputSizes.head.maxTaskInputBytesRead + if (rawAggMetrics.sqlAggs.nonEmpty) { + rawAggMetrics.sqlAggs.map(_.inputBytesReadMax).max.toDouble } else { 0.0 } diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala index 297665444..f09b4eec6 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala @@ -126,6 +126,7 @@ object OutHeaderRegistry { "executorDeserializeTime_sum", "executorRunTime_sum", "input_bytesRead_sum", + "input_bytesRead_max", "input_recordsRead_sum", "jvmGCTime_sum", "memoryBytesSpilled_sum", @@ -158,6 +159,7 @@ object OutHeaderRegistry { "executorDeserializeTime_sum", "executorRunTime_sum", "input_bytesRead_sum", + "input_bytesRead_max", "input_recordsRead_sum", "jvmGCTime_sum", "memoryBytesSpilled_sum", @@ -233,6 +235,7 @@ object OutHeaderRegistry { "executorDeserializeTime_sum", "executorRunTime_sum", "input_bytesRead_sum", + "input_bytesRead_max", "input_recordsRead_sum", "jvmGCTime_sum", "memoryBytesSpilled_sum", diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala index c6f9a0cc5..e7002d198 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala @@ -37,7 +37,6 @@ object QualRawReportGenerator extends Logging { AggMetricsResultSorter.sortSqlAgg(aggRawResult.sqlAggs), AggMetricsResultSorter.sortIO(aggRawResult.ioAggs), AggMetricsResultSorter.sortSqlDurationAgg(aggRawResult.sqlDurAggs), - aggRawResult.maxTaskInputSizes, AggMetricsResultSorter.sortStageDiagnostics(aggRawResult.stageDiagnostics)) Map( STAGE_AGG_LABEL -> sortedRes.stageAggs, @@ -71,12 +70,8 @@ object QualRawReportGenerator extends Logging { val pWriter = new ProfileOutputWriter(metricsDirectory, "profile", 10000000, outputCSV = true) try { - // Compute aggregate metrics early so maxTaskInputBytesRead is available for - // application_tuning_metrics.csv val aggRawMetrics = QualSparkMetricsAggregator .getAggRawMetrics(app, sqlAnalyzer = Some(sqlPlanAnalyzer)) - val maxTaskInput = aggRawMetrics.maxTaskInputSizes.headOption - .map(_.maxTaskInputBytesRead.toLong).getOrElse(0L) pWriter.writeText("### A. Information Collected ###") pWriter.writeTable( @@ -106,9 +101,9 @@ object QualRawReportGenerator extends Logging { constructLabelsMaps(aggRawMetrics).foreach { case (label, metrics) => pWriter.writeCSVTable(label, metrics) } - // GPU-only signals default to 0/empty for qualification (CPU event logs) + // GPU-only signals default to empty for qualification (CPU event logs) val tuningSignals = TuningSignalProfileResult.build( - maxTaskInput, 0L, Set.empty[Long], Set.empty[Long]) + Set.empty[Long], Set.empty[Long]) pWriter.writeCSVTable(TUNING_SIGNALS, tuningSignals) pWriter.writeText("\n### C. Health Check###\n") pWriter.writeCSVTable(QualFailedTaskView.getLabel, QualFailedTaskView.getRawView(Seq(app))) diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/RawMetricProfView.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/RawMetricProfView.scala index 2d10c8f36..a55caaf9b 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/RawMetricProfView.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/RawMetricProfView.scala @@ -17,7 +17,7 @@ package com.nvidia.spark.rapids.tool.views import com.nvidia.spark.rapids.tool.analysis.ProfSparkMetricsAggregator -import com.nvidia.spark.rapids.tool.profiling.{IOAnalysisProfileResult, JobAggTaskMetricsProfileResult, ShuffleSkewProfileResult, SQLDurationExecutorTimeProfileResult, SQLMaxTaskInputSizes, SQLTaskAggMetricsProfileResult, StageAggTaskMetricsProfileResult, StageDiagnosticResult} +import com.nvidia.spark.rapids.tool.profiling.{IOAnalysisProfileResult, JobAggTaskMetricsProfileResult, ShuffleSkewProfileResult, SQLDurationExecutorTimeProfileResult, SQLTaskAggMetricsProfileResult, StageAggTaskMetricsProfileResult, StageDiagnosticResult} import org.apache.spark.sql.rapids.tool.profiling.ApplicationInfo @@ -30,7 +30,6 @@ case class ProfilerAggregatedView( sqlAggs: Seq[SQLTaskAggMetricsProfileResult], ioAggs: Seq[IOAnalysisProfileResult], sqlDurAggs: Seq[SQLDurationExecutorTimeProfileResult], - maxTaskInputSizes: Seq[SQLMaxTaskInputSizes], stageDiagnostics: Seq[StageDiagnosticResult]) object RawMetricProfilerView { @@ -43,7 +42,6 @@ object RawMetricProfilerView { AggMetricsResultSorter.sortSqlAgg(aggMetricsResults.sqlAggs), AggMetricsResultSorter.sortIO(aggMetricsResults.ioAggs), AggMetricsResultSorter.sortSqlDurationAgg(aggMetricsResults.sqlDurAggs), - aggMetricsResults.maxTaskInputSizes, AggMetricsResultSorter.sortStageDiagnostics(aggMetricsResults.stageDiagnostics)) } } From cf9083c68b9469280f3b7f7d4bd56deaa0d8e0b0 Mon Sep 17 00:00:00 2001 From: Partho Sarthi Date: Wed, 22 Apr 2026 16:36:38 -0700 Subject: [PATCH 13/16] Regenerate AnalysisSuite golden CSVs for new input_bytesRead_max column Updated the 9 expectation CSVs under ProfilingExpectations/ to include the new inputBytesReadMax column produced by the stage/SQL/job task metric aggregates. Values were regenerated by running AnalysisSuite against the existing event logs and writing the actual DataFrames back to the expectation files. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Partho Sarthi --- ...on_db_13_3_job_metrics_agg_expectation.csv | 116 +++++++++--------- ...on_db_13_3_sql_metrics_agg_expectation.csv | 4 +- ..._db_13_3_stage_metrics_agg_expectation.csv | 116 +++++++++--------- ...in_eventlog_jobmetricsagg2_expectation.csv | 4 +- ...oin_eventlog_jobmetricsagg_expectation.csv | 4 +- ...in_eventlog_sqlmetricsagg2_expectation.csv | 4 +- ...oin_eventlog_sqlmetricsagg_expectation.csv | 4 +- ..._eventlog_stagemetricsagg2_expectation.csv | 10 +- ...n_eventlog_stagemetricsagg_expectation.csv | 10 +- 9 files changed, 136 insertions(+), 136 deletions(-) diff --git a/core/src/test/resources/ProfilingExpectations/nds_q88_photon_db_13_3_job_metrics_agg_expectation.csv b/core/src/test/resources/ProfilingExpectations/nds_q88_photon_db_13_3_job_metrics_agg_expectation.csv index 26d62a885..a400bd298 100644 --- a/core/src/test/resources/ProfilingExpectations/nds_q88_photon_db_13_3_job_metrics_agg_expectation.csv +++ b/core/src/test/resources/ProfilingExpectations/nds_q88_photon_db_13_3_job_metrics_agg_expectation.csv @@ -1,58 +1,58 @@ -jobId,numTasks,Duration,diskBytesSpilled_sum,duration_sum,duration_max,duration_min,duration_avg,executorCPUTime_sum,executorDeserializeCPUTime_sum,executorDeserializeTime_sum,executorRunTime_sum,input_bytesRead_sum,input_recordsRead_sum,jvmGCTime_sum,memoryBytesSpilled_sum,output_bytesWritten_sum,output_recordsWritten_sum,peakExecutionMemory_max,resultSerializationTime_sum,resultSize_max,sr_fetchWaitTime_sum,sr_localBlocksFetched_sum,sr_localBytesRead_sum,sr_remoteBlocksFetched_sum,sr_remoteBytesRead_sum,sr_remoteBytesReadToDisk_sum,sr_totalBytesRead_sum,sw_bytesWritten_sum,sw_recordsWritten_sum,sw_writeTime_sum -48,431,237976,0,371230,1032,333,861.3,343507,1365,1286,367785,190131859,8639936081,160,0,0,0,169667947,21,15767,8,3,15657,3,15657,0,31314,18964,431,16 -47,432,214657,0,376777,1133,499,872.2,346447,1388,1310,373271,14007280,8639936081,144,0,0,0,159530057,11,15577,4,3,15657,3,15657,0,31314,19008,432,20 -46,433,191440,0,457364,3323,391,1056.3,352977,1639,2509,451358,5242628040,8639936081,1912,0,0,0,250840493,9,16203,551,3,15657,3,15657,0,31314,19052,433,16 -49,1,186241,0,266,266,266,266.0,86,1,1,261,0,0,0,0,0,0,138414192,0,5344,10,209,9196,222,9768,0,18964,44,1,0 -45,433,166081,0,415849,1448,339,960.4,350015,1415,1375,412139,2276478144,8639936081,568,0,0,0,195992906,2,15780,7,3,15657,3,15657,0,31314,19052,433,34 -44,431,139667,0,398973,1403,365,925.7,354332,1420,1327,395265,1075691986,8639936081,328,0,0,0,188587155,0,15767,10,3,15657,3,15657,0,31314,18964,431,17 -50,1,122711,0,267,267,267,267.0,71,1,1,262,0,0,0,0,0,0,138414192,0,5343,58,219,9636,213,9372,0,19008,44,1,0 -43,432,114755,0,403652,1369,329,934.4,353529,1424,1326,399766,1395949742,8639936081,624,0,0,0,201771890,13,15767,14,3,15657,3,15657,0,31314,19008,432,16 -51,1,97958,0,386,386,386,386.0,60,1,1,381,0,0,0,0,0,0,138414192,0,5343,154,221,9724,210,9240,0,18964,44,1,0 -42,431,89634,0,616500,1899,589,1430.4,378287,1521,1515,612098,16461920726,8639936081,4132,0,0,0,216740322,23,15805,10,3,15657,3,15657,0,31314,18964,431,16 -52,1,71718,0,384,384,384,384.0,54,1,1,379,0,0,0,0,0,0,138414192,0,5343,170,223,9812,210,9240,0,19052,44,1,0 -41,431,51085,0,759623,2321,918,1762.5,395214,1706,2027,754015,26337468742,8639936081,7772,0,0,0,250648581,87,16157,170,3,15657,3,15657,0,31314,18964,431,19 -53,1,46297,0,136,136,136,136.0,57,1,1,131,0,0,0,0,0,0,138414192,0,5344,0,214,9416,219,9636,0,19052,44,1,0 -54,1,23051,0,340,340,340,340.0,36,1,1,334,0,0,0,0,0,0,138414192,0,5343,223,215,9460,217,9548,0,19008,44,1,0 -31,1,6979,0,6738,6738,6738,6738.0,5104,128,688,6035,349526,86400,53,0,0,0,155563380,1,10759,0,0,0,0,0,0,0,7239,1800,0 -34,1,6953,0,6725,6725,6725,6725.0,479,185,677,6036,349526,86400,53,0,0,0,155563380,0,9814,0,0,0,0,0,0,0,7239,1800,0 -33,1,6940,0,6729,6729,6729,6729.0,206,216,679,6035,349526,86400,53,0,0,0,155563380,1,9896,0,0,0,0,0,0,0,7239,1800,0 -35,1,6925,0,6729,6729,6729,6729.0,157,136,681,6035,12261,1350,53,0,0,0,155199546,1,9839,0,0,0,0,0,0,0,699,165,0 -38,1,6855,0,6743,6743,6743,6743.0,187,256,688,6035,349526,86400,53,0,0,0,155563380,1,9927,0,0,0,0,0,0,0,7239,1800,0 -0,1,6033,0,5699,5699,5699,5699.0,422,948,1114,4382,0,0,37,0,0,0,0,8,2794,0,0,0,0,0,0,0,0,0,0 -13,200,5707,0,87661,966,349,438.3,9924,528,951,84265,0,0,144,0,0,0,0,9,6258,0,0,0,0,0,0,0,0,0,0 -23,200,5479,0,84240,490,355,421.2,5394,292,214,82784,0,0,136,0,0,0,0,0,6214,0,0,0,0,0,0,0,0,0,0 -21,200,5271,0,80904,485,353,404.5,6100,304,220,79384,0,0,136,0,0,0,0,1,6302,0,0,0,0,0,0,0,0,0,0 -27,200,4728,0,70760,442,309,353.8,4145,287,209,69494,0,0,152,0,0,0,0,10,5788,0,0,0,0,0,0,0,0,0,0 -3,1,4708,0,4693,4693,4693,4693.0,280,701,804,3796,0,0,26,0,0,0,0,7,2834,0,0,0,0,0,0,0,0,0,0 -25,200,4603,0,70379,569,314,351.9,4200,294,216,69040,0,0,168,0,0,0,0,14,5708,0,0,0,0,0,0,0,0,0,0 -36,1,4556,0,4332,4332,4332,4332.0,3359,95,401,3907,30328,7200,39,0,0,0,155245068,1,10552,0,0,0,0,0,0,0,7719,1920,0 -29,200,4555,0,69682,423,310,348.4,3830,272,218,68521,0,0,168,0,0,0,0,9,5748,0,0,0,0,0,0,0,0,0,0 -32,1,4515,0,4334,4334,4334,4334.0,260,130,404,3907,349526,86400,39,0,0,0,155563380,1,9851,0,0,0,0,0,0,0,7239,1800,0 -39,1,4488,0,4322,4322,4322,4322.0,112,124,392,3907,349526,86400,39,0,0,0,155563380,1,9926,0,0,0,0,0,0,0,7239,1800,0 -37,1,4481,0,4334,4334,4334,4334.0,136,144,405,3907,349526,86400,39,0,0,0,155563380,1,9851,0,0,0,0,0,0,0,7239,1800,0 -40,1,4476,0,4327,4327,4327,4327.0,98,147,394,3907,349526,86400,39,0,0,0,155563380,1,9895,0,0,0,0,0,0,0,7239,1800,0 -56,1,1055,0,1022,1022,1022,1022.0,758,77,95,901,0,0,0,0,0,0,134218344,6,10091,5,218,9592,220,9680,0,19272,0,0,0 -19,200,803,0,11895,145,38,59.5,1050,321,252,10017,0,0,56,0,0,0,0,22,2739,0,0,0,0,0,0,0,0,0,0 -26,1,316,0,312,312,312,312.0,2,1,1,306,0,0,0,0,0,0,0,0,3777,0,0,0,0,0,0,0,0,0,0 -2,1,280,0,267,267,267,267.0,6,4,4,124,0,0,0,0,0,0,0,1,3342,0,0,0,0,0,0,0,0,0,0 -11,1,264,0,254,254,254,254.0,5,3,3,241,0,0,0,0,0,0,0,0,2913,0,0,0,0,0,0,0,0,0,0 -7,1,240,0,227,227,227,227.0,5,4,4,213,0,0,114,0,0,0,0,0,2206,0,0,0,0,0,0,0,0,0,0 -1,1,209,0,173,173,173,173.0,28,5,6,152,0,0,0,0,0,0,0,0,2506,0,0,0,0,0,0,0,0,0,0 -5,1,179,0,165,165,165,165.0,4,4,4,151,0,0,0,0,0,0,0,0,2475,0,0,0,0,0,0,0,0,0,0 -14,1,151,0,143,143,143,143.0,3,1,1,132,0,0,0,0,0,0,0,1,3120,0,0,0,0,0,0,0,0,0,0 -4,1,147,0,139,139,139,139.0,22,5,6,121,0,0,0,0,0,0,0,0,2334,0,0,0,0,0,0,0,0,0,0 -20,1,141,0,137,137,137,137.0,1,1,1,130,0,0,0,0,0,0,0,0,2170,0,0,0,0,0,0,0,0,0,0 -28,1,140,0,136,136,136,136.0,2,1,1,130,0,0,0,0,0,0,0,0,3784,0,0,0,0,0,0,0,0,0,0 -18,1,129,0,124,124,124,124.0,2,1,1,116,0,0,0,0,0,0,0,0,2501,0,0,0,0,0,0,0,0,0,0 -16,1,125,0,117,117,117,117.0,2,1,1,108,0,0,0,0,0,0,0,0,2758,0,0,0,0,0,0,0,0,0,0 -6,1,123,0,113,113,113,113.0,4,3,3,100,0,0,0,0,0,0,0,0,2208,0,0,0,0,0,0,0,0,0,0 -10,1,120,0,110,110,110,110.0,6,3,3,98,0,0,0,0,0,0,0,0,3565,0,0,0,0,0,0,0,0,0,0 -9,1,114,0,104,104,104,104.0,5,3,3,90,0,0,0,0,0,0,0,1,3514,0,0,0,0,0,0,0,0,0,0 -12,1,105,0,85,85,85,85.0,4,3,3,72,0,0,0,0,0,0,0,0,3369,0,0,0,0,0,0,0,0,0,0 -17,1,103,0,97,97,97,97.0,2,2,2,89,0,0,0,0,0,0,0,0,3003,0,0,0,0,0,0,0,0,0,0 -8,1,102,0,95,95,95,95.0,4,3,3,82,0,0,0,0,0,0,0,0,3142,0,0,0,0,0,0,0,0,0,0 -30,1,73,0,67,67,67,67.0,2,1,1,62,0,0,0,0,0,0,0,0,3199,0,0,0,0,0,0,0,0,0,0 -24,1,72,0,59,59,59,59.0,2,1,1,51,0,0,0,0,0,0,0,0,3288,0,0,0,0,0,0,0,0,0,0 -22,1,70,0,65,65,65,65.0,2,1,1,59,0,0,0,0,0,0,0,0,3436,0,0,0,0,0,0,0,0,0,0 -55,1,65,0,54,54,54,54.0,27,1,1,49,0,0,0,0,0,0,138414192,0,5343,0,216,9504,215,9460,0,18964,44,1,0 -15,1,64,0,58,58,58,58.0,2,1,1,50,0,0,0,0,0,0,0,0,2306,0,0,0,0,0,0,0,0,0,0 +id,numTasks,duration,diskBytesSpilledSum,durationSum,durationMax,durationMin,durationAvg,executorCPUTimeSum,executorDeserializeCpuTimeSum,executorDeserializeTimeSum,executorRunTimeSum,inputBytesReadSum,inputBytesReadMax,inputRecordsReadSum,jvmGCTimeSum,memoryBytesSpilledSum,outputBytesWrittenSum,outputRecordsWrittenSum,peakExecutionMemoryMax,resultSerializationTimeSum,resultSizeMax,srFetchWaitTimeSum,srLocalBlocksFetchedSum,srcLocalBytesReadSum,srRemoteBlocksFetchSum,srRemoteBytesReadSum,srRemoteBytesReadToDiskSum,srTotalBytesReadSum,swBytesWrittenSum,swRecordsWrittenSum,swWriteTimeSum +48,431,237976,0,371230,1032,333,861.3,343507,1365,1286,367785,190131859,18373883,8639936081,160,0,0,0,169667947,21,15767,8,3,15657,3,15657,0,31314,18964,431,16 +47,432,214657,0,376777,1133,499,872.2,346447,1388,1310,373271,14007280,8235993,8639936081,144,0,0,0,159530057,11,15577,4,3,15657,3,15657,0,31314,19008,432,20 +46,433,191440,0,457364,3323,391,1056.3,352977,1639,2509,451358,5242628040,101021117,8639936081,1912,0,0,0,250840493,9,16203,551,3,15657,3,15657,0,31314,19052,433,16 +49,1,186241,0,266,266,266,266.0,86,1,1,261,0,0,0,0,0,0,0,138414192,0,5344,10,209,9196,222,9768,0,18964,44,1,0 +45,433,166081,0,415849,1448,339,960.4,350015,1415,1375,412139,2276478144,44711594,8639936081,568,0,0,0,195992906,2,15780,7,3,15657,3,15657,0,31314,19052,433,34 +44,431,139667,0,398973,1403,365,925.7,354332,1420,1327,395265,1075691986,37555235,8639936081,328,0,0,0,188587155,0,15767,10,3,15657,3,15657,0,31314,18964,431,17 +50,1,122711,0,267,267,267,267.0,71,1,1,262,0,0,0,0,0,0,0,138414192,0,5343,58,219,9636,213,9372,0,19008,44,1,0 +43,432,114755,0,403652,1369,329,934.4,353529,1424,1326,399766,1395949742,50513106,8639936081,624,0,0,0,201771890,13,15767,14,3,15657,3,15657,0,31314,19008,432,16 +51,1,97958,0,386,386,386,386.0,60,1,1,381,0,0,0,0,0,0,0,138414192,0,5343,154,221,9724,210,9240,0,18964,44,1,0 +42,431,89634,0,616500,1899,589,1430.4,378287,1521,1515,612098,16461920726,65476865,8639936081,4132,0,0,0,216740322,23,15805,10,3,15657,3,15657,0,31314,18964,431,16 +52,1,71718,0,384,384,384,384.0,54,1,1,379,0,0,0,0,0,0,0,138414192,0,5343,170,223,9812,210,9240,0,19052,44,1,0 +41,431,51085,0,759623,2321,918,1762.5,395214,1706,2027,754015,26337468742,99616661,8639936081,7772,0,0,0,250648581,87,16157,170,3,15657,3,15657,0,31314,18964,431,19 +53,1,46297,0,136,136,136,136.0,57,1,1,131,0,0,0,0,0,0,0,138414192,0,5344,0,214,9416,219,9636,0,19052,44,1,0 +54,1,23051,0,340,340,340,340.0,36,1,1,334,0,0,0,0,0,0,0,138414192,0,5343,223,215,9460,217,9548,0,19008,44,1,0 +31,1,6979,0,6738,6738,6738,6738.0,5104,128,688,6035,349526,349526,86400,53,0,0,0,155563380,1,10759,0,0,0,0,0,0,0,7239,1800,0 +34,1,6953,0,6725,6725,6725,6725.0,479,185,677,6036,349526,349526,86400,53,0,0,0,155563380,0,9814,0,0,0,0,0,0,0,7239,1800,0 +33,1,6940,0,6729,6729,6729,6729.0,206,216,679,6035,349526,349526,86400,53,0,0,0,155563380,1,9896,0,0,0,0,0,0,0,7239,1800,0 +35,1,6925,0,6729,6729,6729,6729.0,157,136,681,6035,12261,12261,1350,53,0,0,0,155199546,1,9839,0,0,0,0,0,0,0,699,165,0 +38,1,6855,0,6743,6743,6743,6743.0,187,256,688,6035,349526,349526,86400,53,0,0,0,155563380,1,9927,0,0,0,0,0,0,0,7239,1800,0 +0,1,6033,0,5699,5699,5699,5699.0,422,948,1114,4382,0,0,0,37,0,0,0,0,8,2794,0,0,0,0,0,0,0,0,0,0 +13,200,5707,0,87661,966,349,438.3,9924,528,951,84265,0,0,0,144,0,0,0,0,9,6258,0,0,0,0,0,0,0,0,0,0 +23,200,5479,0,84240,490,355,421.2,5394,292,214,82784,0,0,0,136,0,0,0,0,0,6214,0,0,0,0,0,0,0,0,0,0 +21,200,5271,0,80904,485,353,404.5,6100,304,220,79384,0,0,0,136,0,0,0,0,1,6302,0,0,0,0,0,0,0,0,0,0 +27,200,4728,0,70760,442,309,353.8,4145,287,209,69494,0,0,0,152,0,0,0,0,10,5788,0,0,0,0,0,0,0,0,0,0 +3,1,4708,0,4693,4693,4693,4693.0,280,701,804,3796,0,0,0,26,0,0,0,0,7,2834,0,0,0,0,0,0,0,0,0,0 +25,200,4603,0,70379,569,314,351.9,4200,294,216,69040,0,0,0,168,0,0,0,0,14,5708,0,0,0,0,0,0,0,0,0,0 +36,1,4556,0,4332,4332,4332,4332.0,3359,95,401,3907,30328,30328,7200,39,0,0,0,155245068,1,10552,0,0,0,0,0,0,0,7719,1920,0 +29,200,4555,0,69682,423,310,348.4,3830,272,218,68521,0,0,0,168,0,0,0,0,9,5748,0,0,0,0,0,0,0,0,0,0 +32,1,4515,0,4334,4334,4334,4334.0,260,130,404,3907,349526,349526,86400,39,0,0,0,155563380,1,9851,0,0,0,0,0,0,0,7239,1800,0 +39,1,4488,0,4322,4322,4322,4322.0,112,124,392,3907,349526,349526,86400,39,0,0,0,155563380,1,9926,0,0,0,0,0,0,0,7239,1800,0 +37,1,4481,0,4334,4334,4334,4334.0,136,144,405,3907,349526,349526,86400,39,0,0,0,155563380,1,9851,0,0,0,0,0,0,0,7239,1800,0 +40,1,4476,0,4327,4327,4327,4327.0,98,147,394,3907,349526,349526,86400,39,0,0,0,155563380,1,9895,0,0,0,0,0,0,0,7239,1800,0 +56,1,1055,0,1022,1022,1022,1022.0,758,77,95,901,0,0,0,0,0,0,0,134218344,6,10091,5,218,9592,220,9680,0,19272,0,0,0 +19,200,803,0,11895,145,38,59.5,1050,321,252,10017,0,0,0,56,0,0,0,0,22,2739,0,0,0,0,0,0,0,0,0,0 +26,1,316,0,312,312,312,312.0,2,1,1,306,0,0,0,0,0,0,0,0,0,3777,0,0,0,0,0,0,0,0,0,0 +2,1,280,0,267,267,267,267.0,6,4,4,124,0,0,0,0,0,0,0,0,1,3342,0,0,0,0,0,0,0,0,0,0 +11,1,264,0,254,254,254,254.0,5,3,3,241,0,0,0,0,0,0,0,0,0,2913,0,0,0,0,0,0,0,0,0,0 +7,1,240,0,227,227,227,227.0,5,4,4,213,0,0,0,114,0,0,0,0,0,2206,0,0,0,0,0,0,0,0,0,0 +1,1,209,0,173,173,173,173.0,28,5,6,152,0,0,0,0,0,0,0,0,0,2506,0,0,0,0,0,0,0,0,0,0 +5,1,179,0,165,165,165,165.0,4,4,4,151,0,0,0,0,0,0,0,0,0,2475,0,0,0,0,0,0,0,0,0,0 +14,1,151,0,143,143,143,143.0,3,1,1,132,0,0,0,0,0,0,0,0,1,3120,0,0,0,0,0,0,0,0,0,0 +4,1,147,0,139,139,139,139.0,22,5,6,121,0,0,0,0,0,0,0,0,0,2334,0,0,0,0,0,0,0,0,0,0 +20,1,141,0,137,137,137,137.0,1,1,1,130,0,0,0,0,0,0,0,0,0,2170,0,0,0,0,0,0,0,0,0,0 +28,1,140,0,136,136,136,136.0,2,1,1,130,0,0,0,0,0,0,0,0,0,3784,0,0,0,0,0,0,0,0,0,0 +18,1,129,0,124,124,124,124.0,2,1,1,116,0,0,0,0,0,0,0,0,0,2501,0,0,0,0,0,0,0,0,0,0 +16,1,125,0,117,117,117,117.0,2,1,1,108,0,0,0,0,0,0,0,0,0,2758,0,0,0,0,0,0,0,0,0,0 +6,1,123,0,113,113,113,113.0,4,3,3,100,0,0,0,0,0,0,0,0,0,2208,0,0,0,0,0,0,0,0,0,0 +10,1,120,0,110,110,110,110.0,6,3,3,98,0,0,0,0,0,0,0,0,0,3565,0,0,0,0,0,0,0,0,0,0 +9,1,114,0,104,104,104,104.0,5,3,3,90,0,0,0,0,0,0,0,0,1,3514,0,0,0,0,0,0,0,0,0,0 +12,1,105,0,85,85,85,85.0,4,3,3,72,0,0,0,0,0,0,0,0,0,3369,0,0,0,0,0,0,0,0,0,0 +17,1,103,0,97,97,97,97.0,2,2,2,89,0,0,0,0,0,0,0,0,0,3003,0,0,0,0,0,0,0,0,0,0 +8,1,102,0,95,95,95,95.0,4,3,3,82,0,0,0,0,0,0,0,0,0,3142,0,0,0,0,0,0,0,0,0,0 +30,1,73,0,67,67,67,67.0,2,1,1,62,0,0,0,0,0,0,0,0,0,3199,0,0,0,0,0,0,0,0,0,0 +24,1,72,0,59,59,59,59.0,2,1,1,51,0,0,0,0,0,0,0,0,0,3288,0,0,0,0,0,0,0,0,0,0 +22,1,70,0,65,65,65,65.0,2,1,1,59,0,0,0,0,0,0,0,0,0,3436,0,0,0,0,0,0,0,0,0,0 +55,1,65,0,54,54,54,54.0,27,1,1,49,0,0,0,0,0,0,0,138414192,0,5343,0,216,9504,215,9460,0,18964,44,1,0 +15,1,64,0,58,58,58,58.0,2,1,1,50,0,0,0,0,0,0,0,0,0,2306,0,0,0,0,0,0,0,0,0,0 diff --git a/core/src/test/resources/ProfilingExpectations/nds_q88_photon_db_13_3_sql_metrics_agg_expectation.csv b/core/src/test/resources/ProfilingExpectations/nds_q88_photon_db_13_3_sql_metrics_agg_expectation.csv index 36f5fe11d..26517710f 100644 --- a/core/src/test/resources/ProfilingExpectations/nds_q88_photon_db_13_3_sql_metrics_agg_expectation.csv +++ b/core/src/test/resources/ProfilingExpectations/nds_q88_photon_db_13_3_sql_metrics_agg_expectation.csv @@ -1,2 +1,2 @@ -appID,sqlID,description,numTasks,Duration,executorCPURatio,diskBytesSpilled_sum,duration_sum,duration_max,duration_min,duration_avg,executorCPUTime_sum,executorDeserializeCPUTime_sum,executorDeserializeTime_sum,executorRunTime_sum,input_bytesRead_sum,input_recordsRead_sum,jvmGCTime_sum,memoryBytesSpilled_sum,output_bytesWritten_sum,output_recordsWritten_sum,peakExecutionMemory_max,resultSerializationTime_sum,resultSize_max,sr_fetchWaitTime_sum,sr_localBlocksFetched_sum,sr_localBytesRead_sum,sr_remoteBlocksFetched_sum,sr_remoteBytesRead_sum,sr_remoteBytesReadToDisk_sum,sr_totalBytesRead_sum,sw_bytesWritten_sum,sw_recordsWritten_sum,sw_writeTime_sum -"app-20240919162642-0000",26,"query88",3472,250542,75.58,0,3858136,6743,54,1111.2,2885555,13523,18186,3818106,52997115316,69120188398,16100,0,0,0,250840493,181,16203,1394,1759,201596,1750,201200,0,402796,218614,19946,154 +appId,sqlId,description,numTasks,duration,executorCpuRatio,diskBytesSpilledSum,durationSum,durationMax,durationMin,durationAvg,executorCPUTimeSum,executorDeserializeCpuTimeSum,executorDeserializeTimeSum,executorRunTimeSum,inputBytesReadSum,inputBytesReadMax,inputRecordsReadSum,jvmGCTimeSum,memoryBytesSpilledSum,outputBytesWrittenSum,outputRecordsWrittenSum,peakExecutionMemoryMax,resultSerializationTimeSum,resultSizeMax,srFetchWaitTimeSum,srLocalBlocksFetchedSum,srcLocalBytesReadSum,srRemoteBlocksFetchSum,srRemoteBytesReadSum,srRemoteBytesReadToDiskSum,srTotalBytesReadSum,swBytesWrittenSum,swRecordsWrittenSum,swWriteTimeSum +app-20240919162642-0000,26,query88,3472,250542,75.58,0,3858136,6743,54,1111.2,2885555,13523,18186,3818106,52997115316,101021117,69120188398,16100,0,0,0,250840493,181,16203,1394,1759,201596,1750,201200,0,402796,218614,19946,154 diff --git a/core/src/test/resources/ProfilingExpectations/nds_q88_photon_db_13_3_stage_metrics_agg_expectation.csv b/core/src/test/resources/ProfilingExpectations/nds_q88_photon_db_13_3_stage_metrics_agg_expectation.csv index 0b7fd2182..7ff8071d8 100644 --- a/core/src/test/resources/ProfilingExpectations/nds_q88_photon_db_13_3_stage_metrics_agg_expectation.csv +++ b/core/src/test/resources/ProfilingExpectations/nds_q88_photon_db_13_3_stage_metrics_agg_expectation.csv @@ -1,58 +1,58 @@ -stageId,numTasks,Duration,diskBytesSpilled_sum,duration_sum,duration_max,duration_min,duration_avg,executorCPUTime_sum,executorDeserializeCPUTime_sum,executorDeserializeTime_sum,executorRunTime_sum,input_bytesRead_sum,input_recordsRead_sum,jvmGCTime_sum,memoryBytesSpilled_sum,output_bytesWritten_sum,output_recordsWritten_sum,peakExecutionMemory_max,resultSerializationTime_sum,resultSize_max,sr_fetchWaitTime_sum,sr_localBlocksFetched_sum,sr_localBytesRead_sum,sr_remoteBlocksFetched_sum,sr_remoteBytesRead_sum,sr_remoteBytesReadToDisk_sum,sr_totalBytesRead_sum,sw_bytesWritten_sum,sw_recordsWritten_sum,sw_writeTime_sum -58,431,237799,0,371230,1032,333,861.3,343507,1365,1286,367785,190131859,8639936081,160,0,0,0,169667947,21,15767,8,3,15657,3,15657,0,31314,18964,431,16 -54,432,214633,0,376777,1133,499,872.2,346447,1388,1310,373271,14007280,8639936081,144,0,0,0,159530057,11,15577,4,3,15657,3,15657,0,31314,19008,432,20 -44,433,191384,0,457364,3323,391,1056.3,352977,1639,2509,451358,5242628040,8639936081,1912,0,0,0,250840493,9,16203,551,3,15657,3,15657,0,31314,19052,433,16 -61,1,186240,0,266,266,266,266.0,86,1,1,261,0,0,0,0,0,0,138414192,0,5344,10,209,9196,222,9768,0,18964,44,1,0 -46,433,166015,0,415849,1448,339,960.4,350015,1415,1375,412139,2276478144,8639936081,568,0,0,0,195992906,2,15780,7,3,15657,3,15657,0,31314,19052,433,34 -50,431,139628,0,398973,1403,365,925.7,354332,1420,1327,395265,1075691986,8639936081,328,0,0,0,188587155,0,15767,10,3,15657,3,15657,0,31314,18964,431,17 -64,1,122708,0,267,267,267,267.0,71,1,1,262,0,0,0,0,0,0,138414192,0,5343,58,219,9636,213,9372,0,19008,44,1,0 -48,432,114722,0,403652,1369,329,934.4,353529,1424,1326,399766,1395949742,8639936081,624,0,0,0,201771890,13,15767,14,3,15657,3,15657,0,31314,19008,432,16 -67,1,97957,0,386,386,386,386.0,60,1,1,381,0,0,0,0,0,0,138414192,0,5343,154,221,9724,210,9240,0,18964,44,1,0 -56,431,89600,0,616500,1899,589,1430.4,378287,1521,1515,612098,16461920726,8639936081,4132,0,0,0,216740322,23,15805,10,3,15657,3,15657,0,31314,18964,431,16 -70,1,71716,0,384,384,384,384.0,54,1,1,379,0,0,0,0,0,0,138414192,0,5343,170,223,9812,210,9240,0,19052,44,1,0 -52,431,51060,0,759623,2321,918,1762.5,395214,1706,2027,754015,26337468742,8639936081,7772,0,0,0,250648581,87,16157,170,3,15657,3,15657,0,31314,18964,431,19 -73,1,46297,0,136,136,136,136.0,57,1,1,131,0,0,0,0,0,0,138414192,0,5344,0,214,9416,219,9636,0,19052,44,1,0 -76,1,23048,0,340,340,340,340.0,36,1,1,334,0,0,0,0,0,0,138414192,0,5343,223,215,9460,217,9548,0,19008,44,1,0 -31,1,6956,0,6738,6738,6738,6738.0,5104,128,688,6035,349526,86400,53,0,0,0,155563380,1,10759,0,0,0,0,0,0,0,7239,1800,0 -32,1,6945,0,6725,6725,6725,6725.0,479,185,677,6036,349526,86400,53,0,0,0,155563380,0,9814,0,0,0,0,0,0,0,7239,1800,0 -33,1,6930,0,6729,6729,6729,6729.0,206,216,679,6035,349526,86400,53,0,0,0,155563380,1,9896,0,0,0,0,0,0,0,7239,1800,0 -34,1,6907,0,6729,6729,6729,6729.0,157,136,681,6035,12261,1350,53,0,0,0,155199546,1,9839,0,0,0,0,0,0,0,699,165,0 -38,1,6842,0,6743,6743,6743,6743.0,187,256,688,6035,349526,86400,53,0,0,0,155563380,1,9927,0,0,0,0,0,0,0,7239,1800,0 -0,1,5904,0,5699,5699,5699,5699.0,422,948,1114,4382,0,0,37,0,0,0,0,8,2794,0,0,0,0,0,0,0,0,0,0 -13,200,5697,0,87661,966,349,438.3,9924,528,951,84265,0,0,144,0,0,0,0,9,6258,0,0,0,0,0,0,0,0,0,0 -23,200,5476,0,84240,490,355,421.2,5394,292,214,82784,0,0,136,0,0,0,0,0,6214,0,0,0,0,0,0,0,0,0,0 -21,200,5265,0,80904,485,353,404.5,6100,304,220,79384,0,0,136,0,0,0,0,1,6302,0,0,0,0,0,0,0,0,0,0 -27,200,4719,0,70760,442,309,353.8,4145,287,209,69494,0,0,152,0,0,0,0,10,5788,0,0,0,0,0,0,0,0,0,0 -3,1,4701,0,4693,4693,4693,4693.0,280,701,804,3796,0,0,26,0,0,0,0,7,2834,0,0,0,0,0,0,0,0,0,0 -25,200,4599,0,70379,569,314,351.9,4200,294,216,69040,0,0,168,0,0,0,0,14,5708,0,0,0,0,0,0,0,0,0,0 -29,200,4552,0,69682,423,310,348.4,3830,272,218,68521,0,0,168,0,0,0,0,9,5748,0,0,0,0,0,0,0,0,0,0 -35,1,4525,0,4332,4332,4332,4332.0,3359,95,401,3907,30328,7200,39,0,0,0,155245068,1,10552,0,0,0,0,0,0,0,7719,1920,0 -36,1,4509,0,4334,4334,4334,4334.0,260,130,404,3907,349526,86400,39,0,0,0,155563380,1,9851,0,0,0,0,0,0,0,7239,1800,0 -39,1,4474,0,4322,4322,4322,4322.0,112,124,392,3907,349526,86400,39,0,0,0,155563380,1,9926,0,0,0,0,0,0,0,7239,1800,0 -40,1,4469,0,4327,4327,4327,4327.0,98,147,394,3907,349526,86400,39,0,0,0,155563380,1,9895,0,0,0,0,0,0,0,7239,1800,0 -37,1,4464,0,4334,4334,4334,4334.0,136,144,405,3907,349526,86400,39,0,0,0,155563380,1,9851,0,0,0,0,0,0,0,7239,1800,0 -107,1,1052,0,1022,1022,1022,1022.0,758,77,95,901,0,0,0,0,0,0,134218344,6,10091,5,218,9592,220,9680,0,19272,0,0,0 -19,200,794,0,11895,145,38,59.5,1050,321,252,10017,0,0,56,0,0,0,0,22,2739,0,0,0,0,0,0,0,0,0,0 -26,1,315,0,312,312,312,312.0,2,1,1,306,0,0,0,0,0,0,0,0,3777,0,0,0,0,0,0,0,0,0,0 -2,1,276,0,267,267,267,267.0,6,4,4,124,0,0,0,0,0,0,0,1,3342,0,0,0,0,0,0,0,0,0,0 -11,1,260,0,254,254,254,254.0,5,3,3,241,0,0,0,0,0,0,0,0,2913,0,0,0,0,0,0,0,0,0,0 -7,1,238,0,227,227,227,227.0,5,4,4,213,0,0,114,0,0,0,0,0,2206,0,0,0,0,0,0,0,0,0,0 -1,1,202,0,173,173,173,173.0,28,5,6,152,0,0,0,0,0,0,0,0,2506,0,0,0,0,0,0,0,0,0,0 -5,1,177,0,165,165,165,165.0,4,4,4,151,0,0,0,0,0,0,0,0,2475,0,0,0,0,0,0,0,0,0,0 -14,1,148,0,143,143,143,143.0,3,1,1,132,0,0,0,0,0,0,0,1,3120,0,0,0,0,0,0,0,0,0,0 -4,1,145,0,139,139,139,139.0,22,5,6,121,0,0,0,0,0,0,0,0,2334,0,0,0,0,0,0,0,0,0,0 -20,1,140,0,137,137,137,137.0,1,1,1,130,0,0,0,0,0,0,0,0,2170,0,0,0,0,0,0,0,0,0,0 -28,1,139,0,136,136,136,136.0,2,1,1,130,0,0,0,0,0,0,0,0,3784,0,0,0,0,0,0,0,0,0,0 -18,1,127,0,124,124,124,124.0,2,1,1,116,0,0,0,0,0,0,0,0,2501,0,0,0,0,0,0,0,0,0,0 -16,1,122,0,117,117,117,117.0,2,1,1,108,0,0,0,0,0,0,0,0,2758,0,0,0,0,0,0,0,0,0,0 -6,1,120,0,113,113,113,113.0,4,3,3,100,0,0,0,0,0,0,0,0,2208,0,0,0,0,0,0,0,0,0,0 -10,1,118,0,110,110,110,110.0,6,3,3,98,0,0,0,0,0,0,0,0,3565,0,0,0,0,0,0,0,0,0,0 -9,1,110,0,104,104,104,104.0,5,3,3,90,0,0,0,0,0,0,0,1,3514,0,0,0,0,0,0,0,0,0,0 -17,1,102,0,97,97,97,97.0,2,2,2,89,0,0,0,0,0,0,0,0,3003,0,0,0,0,0,0,0,0,0,0 -8,1,100,0,95,95,95,95.0,4,3,3,82,0,0,0,0,0,0,0,0,3142,0,0,0,0,0,0,0,0,0,0 -12,1,98,0,85,85,85,85.0,4,3,3,72,0,0,0,0,0,0,0,0,3369,0,0,0,0,0,0,0,0,0,0 -30,1,71,0,67,67,67,67.0,2,1,1,62,0,0,0,0,0,0,0,0,3199,0,0,0,0,0,0,0,0,0,0 -22,1,68,0,65,65,65,65.0,2,1,1,59,0,0,0,0,0,0,0,0,3436,0,0,0,0,0,0,0,0,0,0 -24,1,68,0,59,59,59,59.0,2,1,1,51,0,0,0,0,0,0,0,0,3288,0,0,0,0,0,0,0,0,0,0 -81,1,63,0,54,54,54,54.0,27,1,1,49,0,0,0,0,0,0,138414192,0,5343,0,216,9504,215,9460,0,18964,44,1,0 -15,1,62,0,58,58,58,58.0,2,1,1,50,0,0,0,0,0,0,0,0,2306,0,0,0,0,0,0,0,0,0,0 +id,numTasks,duration,diskBytesSpilledSum,durationSum,durationMax,durationMin,durationAvg,executorCPUTimeSum,executorDeserializeCpuTimeSum,executorDeserializeTimeSum,executorRunTimeSum,inputBytesReadSum,inputBytesReadMax,inputRecordsReadSum,jvmGCTimeSum,memoryBytesSpilledSum,outputBytesWrittenSum,outputRecordsWrittenSum,peakExecutionMemoryMax,resultSerializationTimeSum,resultSizeMax,srFetchWaitTimeSum,srLocalBlocksFetchedSum,srcLocalBytesReadSum,srRemoteBlocksFetchSum,srRemoteBytesReadSum,srRemoteBytesReadToDiskSum,srTotalBytesReadSum,swBytesWrittenSum,swRecordsWrittenSum,swWriteTimeSum +58,431,237799,0,371230,1032,333,861.3,343507,1365,1286,367785,190131859,18373883,8639936081,160,0,0,0,169667947,21,15767,8,3,15657,3,15657,0,31314,18964,431,16 +54,432,214633,0,376777,1133,499,872.2,346447,1388,1310,373271,14007280,8235993,8639936081,144,0,0,0,159530057,11,15577,4,3,15657,3,15657,0,31314,19008,432,20 +44,433,191384,0,457364,3323,391,1056.3,352977,1639,2509,451358,5242628040,101021117,8639936081,1912,0,0,0,250840493,9,16203,551,3,15657,3,15657,0,31314,19052,433,16 +61,1,186240,0,266,266,266,266.0,86,1,1,261,0,0,0,0,0,0,0,138414192,0,5344,10,209,9196,222,9768,0,18964,44,1,0 +46,433,166015,0,415849,1448,339,960.4,350015,1415,1375,412139,2276478144,44711594,8639936081,568,0,0,0,195992906,2,15780,7,3,15657,3,15657,0,31314,19052,433,34 +50,431,139628,0,398973,1403,365,925.7,354332,1420,1327,395265,1075691986,37555235,8639936081,328,0,0,0,188587155,0,15767,10,3,15657,3,15657,0,31314,18964,431,17 +64,1,122708,0,267,267,267,267.0,71,1,1,262,0,0,0,0,0,0,0,138414192,0,5343,58,219,9636,213,9372,0,19008,44,1,0 +48,432,114722,0,403652,1369,329,934.4,353529,1424,1326,399766,1395949742,50513106,8639936081,624,0,0,0,201771890,13,15767,14,3,15657,3,15657,0,31314,19008,432,16 +67,1,97957,0,386,386,386,386.0,60,1,1,381,0,0,0,0,0,0,0,138414192,0,5343,154,221,9724,210,9240,0,18964,44,1,0 +56,431,89600,0,616500,1899,589,1430.4,378287,1521,1515,612098,16461920726,65476865,8639936081,4132,0,0,0,216740322,23,15805,10,3,15657,3,15657,0,31314,18964,431,16 +70,1,71716,0,384,384,384,384.0,54,1,1,379,0,0,0,0,0,0,0,138414192,0,5343,170,223,9812,210,9240,0,19052,44,1,0 +52,431,51060,0,759623,2321,918,1762.5,395214,1706,2027,754015,26337468742,99616661,8639936081,7772,0,0,0,250648581,87,16157,170,3,15657,3,15657,0,31314,18964,431,19 +73,1,46297,0,136,136,136,136.0,57,1,1,131,0,0,0,0,0,0,0,138414192,0,5344,0,214,9416,219,9636,0,19052,44,1,0 +76,1,23048,0,340,340,340,340.0,36,1,1,334,0,0,0,0,0,0,0,138414192,0,5343,223,215,9460,217,9548,0,19008,44,1,0 +31,1,6956,0,6738,6738,6738,6738.0,5104,128,688,6035,349526,349526,86400,53,0,0,0,155563380,1,10759,0,0,0,0,0,0,0,7239,1800,0 +32,1,6945,0,6725,6725,6725,6725.0,479,185,677,6036,349526,349526,86400,53,0,0,0,155563380,0,9814,0,0,0,0,0,0,0,7239,1800,0 +33,1,6930,0,6729,6729,6729,6729.0,206,216,679,6035,349526,349526,86400,53,0,0,0,155563380,1,9896,0,0,0,0,0,0,0,7239,1800,0 +34,1,6907,0,6729,6729,6729,6729.0,157,136,681,6035,12261,12261,1350,53,0,0,0,155199546,1,9839,0,0,0,0,0,0,0,699,165,0 +38,1,6842,0,6743,6743,6743,6743.0,187,256,688,6035,349526,349526,86400,53,0,0,0,155563380,1,9927,0,0,0,0,0,0,0,7239,1800,0 +0,1,5904,0,5699,5699,5699,5699.0,422,948,1114,4382,0,0,0,37,0,0,0,0,8,2794,0,0,0,0,0,0,0,0,0,0 +13,200,5697,0,87661,966,349,438.3,9924,528,951,84265,0,0,0,144,0,0,0,0,9,6258,0,0,0,0,0,0,0,0,0,0 +23,200,5476,0,84240,490,355,421.2,5394,292,214,82784,0,0,0,136,0,0,0,0,0,6214,0,0,0,0,0,0,0,0,0,0 +21,200,5265,0,80904,485,353,404.5,6100,304,220,79384,0,0,0,136,0,0,0,0,1,6302,0,0,0,0,0,0,0,0,0,0 +27,200,4719,0,70760,442,309,353.8,4145,287,209,69494,0,0,0,152,0,0,0,0,10,5788,0,0,0,0,0,0,0,0,0,0 +3,1,4701,0,4693,4693,4693,4693.0,280,701,804,3796,0,0,0,26,0,0,0,0,7,2834,0,0,0,0,0,0,0,0,0,0 +25,200,4599,0,70379,569,314,351.9,4200,294,216,69040,0,0,0,168,0,0,0,0,14,5708,0,0,0,0,0,0,0,0,0,0 +29,200,4552,0,69682,423,310,348.4,3830,272,218,68521,0,0,0,168,0,0,0,0,9,5748,0,0,0,0,0,0,0,0,0,0 +35,1,4525,0,4332,4332,4332,4332.0,3359,95,401,3907,30328,30328,7200,39,0,0,0,155245068,1,10552,0,0,0,0,0,0,0,7719,1920,0 +36,1,4509,0,4334,4334,4334,4334.0,260,130,404,3907,349526,349526,86400,39,0,0,0,155563380,1,9851,0,0,0,0,0,0,0,7239,1800,0 +39,1,4474,0,4322,4322,4322,4322.0,112,124,392,3907,349526,349526,86400,39,0,0,0,155563380,1,9926,0,0,0,0,0,0,0,7239,1800,0 +40,1,4469,0,4327,4327,4327,4327.0,98,147,394,3907,349526,349526,86400,39,0,0,0,155563380,1,9895,0,0,0,0,0,0,0,7239,1800,0 +37,1,4464,0,4334,4334,4334,4334.0,136,144,405,3907,349526,349526,86400,39,0,0,0,155563380,1,9851,0,0,0,0,0,0,0,7239,1800,0 +107,1,1052,0,1022,1022,1022,1022.0,758,77,95,901,0,0,0,0,0,0,0,134218344,6,10091,5,218,9592,220,9680,0,19272,0,0,0 +19,200,794,0,11895,145,38,59.5,1050,321,252,10017,0,0,0,56,0,0,0,0,22,2739,0,0,0,0,0,0,0,0,0,0 +26,1,315,0,312,312,312,312.0,2,1,1,306,0,0,0,0,0,0,0,0,0,3777,0,0,0,0,0,0,0,0,0,0 +2,1,276,0,267,267,267,267.0,6,4,4,124,0,0,0,0,0,0,0,0,1,3342,0,0,0,0,0,0,0,0,0,0 +11,1,260,0,254,254,254,254.0,5,3,3,241,0,0,0,0,0,0,0,0,0,2913,0,0,0,0,0,0,0,0,0,0 +7,1,238,0,227,227,227,227.0,5,4,4,213,0,0,0,114,0,0,0,0,0,2206,0,0,0,0,0,0,0,0,0,0 +1,1,202,0,173,173,173,173.0,28,5,6,152,0,0,0,0,0,0,0,0,0,2506,0,0,0,0,0,0,0,0,0,0 +5,1,177,0,165,165,165,165.0,4,4,4,151,0,0,0,0,0,0,0,0,0,2475,0,0,0,0,0,0,0,0,0,0 +14,1,148,0,143,143,143,143.0,3,1,1,132,0,0,0,0,0,0,0,0,1,3120,0,0,0,0,0,0,0,0,0,0 +4,1,145,0,139,139,139,139.0,22,5,6,121,0,0,0,0,0,0,0,0,0,2334,0,0,0,0,0,0,0,0,0,0 +20,1,140,0,137,137,137,137.0,1,1,1,130,0,0,0,0,0,0,0,0,0,2170,0,0,0,0,0,0,0,0,0,0 +28,1,139,0,136,136,136,136.0,2,1,1,130,0,0,0,0,0,0,0,0,0,3784,0,0,0,0,0,0,0,0,0,0 +18,1,127,0,124,124,124,124.0,2,1,1,116,0,0,0,0,0,0,0,0,0,2501,0,0,0,0,0,0,0,0,0,0 +16,1,122,0,117,117,117,117.0,2,1,1,108,0,0,0,0,0,0,0,0,0,2758,0,0,0,0,0,0,0,0,0,0 +6,1,120,0,113,113,113,113.0,4,3,3,100,0,0,0,0,0,0,0,0,0,2208,0,0,0,0,0,0,0,0,0,0 +10,1,118,0,110,110,110,110.0,6,3,3,98,0,0,0,0,0,0,0,0,0,3565,0,0,0,0,0,0,0,0,0,0 +9,1,110,0,104,104,104,104.0,5,3,3,90,0,0,0,0,0,0,0,0,1,3514,0,0,0,0,0,0,0,0,0,0 +17,1,102,0,97,97,97,97.0,2,2,2,89,0,0,0,0,0,0,0,0,0,3003,0,0,0,0,0,0,0,0,0,0 +8,1,100,0,95,95,95,95.0,4,3,3,82,0,0,0,0,0,0,0,0,0,3142,0,0,0,0,0,0,0,0,0,0 +12,1,98,0,85,85,85,85.0,4,3,3,72,0,0,0,0,0,0,0,0,0,3369,0,0,0,0,0,0,0,0,0,0 +30,1,71,0,67,67,67,67.0,2,1,1,62,0,0,0,0,0,0,0,0,0,3199,0,0,0,0,0,0,0,0,0,0 +22,1,68,0,65,65,65,65.0,2,1,1,59,0,0,0,0,0,0,0,0,0,3436,0,0,0,0,0,0,0,0,0,0 +24,1,68,0,59,59,59,59.0,2,1,1,51,0,0,0,0,0,0,0,0,0,3288,0,0,0,0,0,0,0,0,0,0 +81,1,63,0,54,54,54,54.0,27,1,1,49,0,0,0,0,0,0,0,138414192,0,5343,0,216,9504,215,9460,0,18964,44,1,0 +15,1,62,0,58,58,58,58.0,2,1,1,50,0,0,0,0,0,0,0,0,0,2306,0,0,0,0,0,0,0,0,0,0 diff --git a/core/src/test/resources/ProfilingExpectations/rapids_join_eventlog_jobmetricsagg2_expectation.csv b/core/src/test/resources/ProfilingExpectations/rapids_join_eventlog_jobmetricsagg2_expectation.csv index 83eaa2b17..4c53a33c8 100644 --- a/core/src/test/resources/ProfilingExpectations/rapids_join_eventlog_jobmetricsagg2_expectation.csv +++ b/core/src/test/resources/ProfilingExpectations/rapids_join_eventlog_jobmetricsagg2_expectation.csv @@ -1,2 +1,2 @@ -jobId,numTasks,Duration,diskBytesSpilled_sum,duration_sum,duration_max,duration_min,duration_avg,executorCPUTime_sum,executorDeserializeCPUTime_sum,executorDeserializeTime_sum,executorRunTime_sum,input_bytesRead_sum,input_recordsRead_sum,jvmGCTime_sum,memoryBytesSpilled_sum,output_bytesWritten_sum,output_recordsWritten_sum,peakExecutionMemory_max,resultSerializationTime_sum,resultSize_max,sr_fetchWaitTime_sum,sr_localBlocksFetched_sum,sr_localBytesRead_sum,sr_remoteBlocksFetched_sum,sr_remoteBytesRead_sum,sr_remoteBytesReadToDisk_sum,sr_totalBytesRead_sum,sw_bytesWritten_sum,sw_recordsWritten_sum,sw_writeTime_sum -0,213,2515,0,25761,1624,9,120.9,7151,3134,11178,13522,0,0,424,0,0,0,0,10,8075,0,2600,80279920,0,0,0,80279920,80279920,2600,901 +id,numTasks,duration,diskBytesSpilledSum,durationSum,durationMax,durationMin,durationAvg,executorCPUTimeSum,executorDeserializeCpuTimeSum,executorDeserializeTimeSum,executorRunTimeSum,inputBytesReadSum,inputBytesReadMax,inputRecordsReadSum,jvmGCTimeSum,memoryBytesSpilledSum,outputBytesWrittenSum,outputRecordsWrittenSum,peakExecutionMemoryMax,resultSerializationTimeSum,resultSizeMax,srFetchWaitTimeSum,srLocalBlocksFetchedSum,srcLocalBytesReadSum,srRemoteBlocksFetchSum,srRemoteBytesReadSum,srRemoteBytesReadToDiskSum,srTotalBytesReadSum,swBytesWrittenSum,swRecordsWrittenSum,swWriteTimeSum +0,213,2515,0,25761,1624,9,120.9,7151,3134,11178,13522,0,0,0,424,0,0,0,0,10,8075,0,2600,80279920,0,0,0,80279920,80279920,2600,901 diff --git a/core/src/test/resources/ProfilingExpectations/rapids_join_eventlog_jobmetricsagg_expectation.csv b/core/src/test/resources/ProfilingExpectations/rapids_join_eventlog_jobmetricsagg_expectation.csv index 956c547a8..15772ddfb 100644 --- a/core/src/test/resources/ProfilingExpectations/rapids_join_eventlog_jobmetricsagg_expectation.csv +++ b/core/src/test/resources/ProfilingExpectations/rapids_join_eventlog_jobmetricsagg_expectation.csv @@ -1,2 +1,2 @@ -jobId,numTasks,Duration,diskBytesSpilled_sum,duration_sum,duration_max,duration_min,duration_avg,executorCPUTime_sum,executorDeserializeCPUTime_sum,executorDeserializeTime_sum,executorRunTime_sum,input_bytesRead_sum,input_recordsRead_sum,jvmGCTime_sum,memoryBytesSpilled_sum,output_bytesWritten_sum,output_recordsWritten_sum,peakExecutionMemory_max,resultSerializationTime_sum,resultSize_max,sr_fetchWaitTime_sum,sr_localBlocksFetched_sum,sr_localBytesRead_sum,sr_remoteBlocksFetched_sum,sr_remoteBytesRead_sum,sr_remoteBytesReadToDisk_sum,sr_totalBytesRead_sum,sw_bytesWritten_sum,sw_recordsWritten_sum,sw_writeTime_sum -0,213,2569,0,26735,1598,10,125.5,6608,3531,12095,13414,0,0,336,0,0,0,0,8,8075,0,2600,80279908,0,0,0,80279908,80279908,2600,1001 +id,numTasks,duration,diskBytesSpilledSum,durationSum,durationMax,durationMin,durationAvg,executorCPUTimeSum,executorDeserializeCpuTimeSum,executorDeserializeTimeSum,executorRunTimeSum,inputBytesReadSum,inputBytesReadMax,inputRecordsReadSum,jvmGCTimeSum,memoryBytesSpilledSum,outputBytesWrittenSum,outputRecordsWrittenSum,peakExecutionMemoryMax,resultSerializationTimeSum,resultSizeMax,srFetchWaitTimeSum,srLocalBlocksFetchedSum,srcLocalBytesReadSum,srRemoteBlocksFetchSum,srRemoteBytesReadSum,srRemoteBytesReadToDiskSum,srTotalBytesReadSum,swBytesWrittenSum,swRecordsWrittenSum,swWriteTimeSum +0,213,2569,0,26735,1598,10,125.5,6608,3531,12095,13414,0,0,0,336,0,0,0,0,8,8075,0,2600,80279908,0,0,0,80279908,80279908,2600,1001 diff --git a/core/src/test/resources/ProfilingExpectations/rapids_join_eventlog_sqlmetricsagg2_expectation.csv b/core/src/test/resources/ProfilingExpectations/rapids_join_eventlog_sqlmetricsagg2_expectation.csv index 0efee3efd..6085ba94c 100644 --- a/core/src/test/resources/ProfilingExpectations/rapids_join_eventlog_sqlmetricsagg2_expectation.csv +++ b/core/src/test/resources/ProfilingExpectations/rapids_join_eventlog_sqlmetricsagg2_expectation.csv @@ -1,2 +1,2 @@ -appID,sqlID,description,numTasks,Duration,executorCPURatio,diskBytesSpilled_sum,duration_sum,duration_max,duration_min,duration_avg,executorCPUTime_sum,executorDeserializeCPUTime_sum,executorDeserializeTime_sum,executorRunTime_sum,input_bytesRead_sum,input_recordsRead_sum,jvmGCTime_sum,memoryBytesSpilled_sum,output_bytesWritten_sum,output_recordsWritten_sum,peakExecutionMemory_max,resultSerializationTime_sum,resultSize_max,sr_fetchWaitTime_sum,sr_localBlocksFetched_sum,sr_localBytesRead_sum,sr_remoteBlocksFetched_sum,sr_remoteBytesRead_sum,sr_remoteBytesReadToDisk_sum,sr_totalBytesRead_sum,sw_bytesWritten_sum,sw_recordsWritten_sum,sw_writeTime_sum -"local-1622821994212",0,"count at :28",213,3041,52.88,0,25761,1624,9,120.9,7151,3134,11178,13522,0,0,424,0,0,0,0,10,8075,0,2600,80279920,0,0,0,80279920,80279920,2600,901 +appId,sqlId,description,numTasks,duration,executorCpuRatio,diskBytesSpilledSum,durationSum,durationMax,durationMin,durationAvg,executorCPUTimeSum,executorDeserializeCpuTimeSum,executorDeserializeTimeSum,executorRunTimeSum,inputBytesReadSum,inputBytesReadMax,inputRecordsReadSum,jvmGCTimeSum,memoryBytesSpilledSum,outputBytesWrittenSum,outputRecordsWrittenSum,peakExecutionMemoryMax,resultSerializationTimeSum,resultSizeMax,srFetchWaitTimeSum,srLocalBlocksFetchedSum,srcLocalBytesReadSum,srRemoteBlocksFetchSum,srRemoteBytesReadSum,srRemoteBytesReadToDiskSum,srTotalBytesReadSum,swBytesWrittenSum,swRecordsWrittenSum,swWriteTimeSum +local-1622821994212,0,count at :28,213,3041,52.88,0,25761,1624,9,120.9,7151,3134,11178,13522,0,0,0,424,0,0,0,0,10,8075,0,2600,80279920,0,0,0,80279920,80279920,2600,901 diff --git a/core/src/test/resources/ProfilingExpectations/rapids_join_eventlog_sqlmetricsagg_expectation.csv b/core/src/test/resources/ProfilingExpectations/rapids_join_eventlog_sqlmetricsagg_expectation.csv index 8b52ac3cb..066b4c184 100644 --- a/core/src/test/resources/ProfilingExpectations/rapids_join_eventlog_sqlmetricsagg_expectation.csv +++ b/core/src/test/resources/ProfilingExpectations/rapids_join_eventlog_sqlmetricsagg_expectation.csv @@ -1,2 +1,2 @@ -appID,sqlID,description,numTasks,Duration,executorCPURatio,diskBytesSpilled_sum,duration_sum,duration_max,duration_min,duration_avg,executorCPUTime_sum,executorDeserializeCPUTime_sum,executorDeserializeTime_sum,executorRunTime_sum,input_bytesRead_sum,input_recordsRead_sum,jvmGCTime_sum,memoryBytesSpilled_sum,output_bytesWritten_sum,output_recordsWritten_sum,peakExecutionMemory_max,resultSerializationTime_sum,resultSize_max,sr_fetchWaitTime_sum,sr_localBlocksFetched_sum,sr_localBytesRead_sum,sr_remoteBlocksFetched_sum,sr_remoteBytesRead_sum,sr_remoteBytesReadToDisk_sum,sr_totalBytesRead_sum,sw_bytesWritten_sum,sw_recordsWritten_sum,sw_writeTime_sum -"local-1622814619968",0,"count at :28",213,3087,49.26,0,26735,1598,10,125.5,6608,3531,12095,13414,0,0,336,0,0,0,0,8,8075,0,2600,80279908,0,0,0,80279908,80279908,2600,1001 +appId,sqlId,description,numTasks,duration,executorCpuRatio,diskBytesSpilledSum,durationSum,durationMax,durationMin,durationAvg,executorCPUTimeSum,executorDeserializeCpuTimeSum,executorDeserializeTimeSum,executorRunTimeSum,inputBytesReadSum,inputBytesReadMax,inputRecordsReadSum,jvmGCTimeSum,memoryBytesSpilledSum,outputBytesWrittenSum,outputRecordsWrittenSum,peakExecutionMemoryMax,resultSerializationTimeSum,resultSizeMax,srFetchWaitTimeSum,srLocalBlocksFetchedSum,srcLocalBytesReadSum,srRemoteBlocksFetchSum,srRemoteBytesReadSum,srRemoteBytesReadToDiskSum,srTotalBytesReadSum,swBytesWrittenSum,swRecordsWrittenSum,swWriteTimeSum +local-1622814619968,0,count at :28,213,3087,49.26,0,26735,1598,10,125.5,6608,3531,12095,13414,0,0,0,336,0,0,0,0,8,8075,0,2600,80279908,0,0,0,80279908,80279908,2600,1001 diff --git a/core/src/test/resources/ProfilingExpectations/rapids_join_eventlog_stagemetricsagg2_expectation.csv b/core/src/test/resources/ProfilingExpectations/rapids_join_eventlog_stagemetricsagg2_expectation.csv index 1dc0462ac..36e2b92cb 100644 --- a/core/src/test/resources/ProfilingExpectations/rapids_join_eventlog_stagemetricsagg2_expectation.csv +++ b/core/src/test/resources/ProfilingExpectations/rapids_join_eventlog_stagemetricsagg2_expectation.csv @@ -1,5 +1,5 @@ -stageId,numTasks,Duration,diskBytesSpilled_sum,duration_sum,duration_max,duration_min,duration_avg,executorCPUTime_sum,executorDeserializeCPUTime_sum,executorDeserializeTime_sum,executorRunTime_sum,input_bytesRead_sum,input_recordsRead_sum,jvmGCTime_sum,memoryBytesSpilled_sum,output_bytesWritten_sum,output_recordsWritten_sum,peakExecutionMemory_max,resultSerializationTime_sum,resultSize_max,sr_fetchWaitTime_sum,sr_localBlocksFetched_sum,sr_localBytesRead_sum,sr_remoteBlocksFetched_sum,sr_remoteBytesRead_sum,sr_remoteBytesReadToDisk_sum,sr_totalBytesRead_sum,sw_bytesWritten_sum,sw_recordsWritten_sum,sw_writeTime_sum -0,6,1761,0,9455,1624,1540,1575.8,2917,1287,5056,4248,0,0,228,0,0,0,0,3,2951,0,0,0,0,0,0,0,40132263,1200,376 -1,6,1666,0,9274,1621,1528,1545.7,2570,1007,5016,4099,0,0,196,0,0,0,0,4,2951,0,0,0,0,0,0,0,40132257,1200,475 -2,200,592,0,6937,221,9,34.7,1619,802,1065,5125,0,0,0,0,0,0,0,3,7402,0,2400,80264520,0,0,0,80264520,15400,200,50 -3,1,101,0,95,95,95,95.0,45,38,41,50,0,0,0,0,0,0,0,0,8075,0,200,15400,0,0,0,15400,0,0,0 +id,numTasks,duration,diskBytesSpilledSum,durationSum,durationMax,durationMin,durationAvg,executorCPUTimeSum,executorDeserializeCpuTimeSum,executorDeserializeTimeSum,executorRunTimeSum,inputBytesReadSum,inputBytesReadMax,inputRecordsReadSum,jvmGCTimeSum,memoryBytesSpilledSum,outputBytesWrittenSum,outputRecordsWrittenSum,peakExecutionMemoryMax,resultSerializationTimeSum,resultSizeMax,srFetchWaitTimeSum,srLocalBlocksFetchedSum,srcLocalBytesReadSum,srRemoteBlocksFetchSum,srRemoteBytesReadSum,srRemoteBytesReadToDiskSum,srTotalBytesReadSum,swBytesWrittenSum,swRecordsWrittenSum,swWriteTimeSum +0,6,1761,0,9455,1624,1540,1575.8,2917,1287,5056,4248,0,0,0,228,0,0,0,0,3,2951,0,0,0,0,0,0,0,40132263,1200,376 +1,6,1666,0,9274,1621,1528,1545.7,2570,1007,5016,4099,0,0,0,196,0,0,0,0,4,2951,0,0,0,0,0,0,0,40132257,1200,475 +2,200,592,0,6937,221,9,34.7,1619,802,1065,5125,0,0,0,0,0,0,0,0,3,7402,0,2400,80264520,0,0,0,80264520,15400,200,50 +3,1,101,0,95,95,95,95.0,45,38,41,50,0,0,0,0,0,0,0,0,0,8075,0,200,15400,0,0,0,15400,0,0,0 diff --git a/core/src/test/resources/ProfilingExpectations/rapids_join_eventlog_stagemetricsagg_expectation.csv b/core/src/test/resources/ProfilingExpectations/rapids_join_eventlog_stagemetricsagg_expectation.csv index 1a937cf9e..93a4ed36f 100644 --- a/core/src/test/resources/ProfilingExpectations/rapids_join_eventlog_stagemetricsagg_expectation.csv +++ b/core/src/test/resources/ProfilingExpectations/rapids_join_eventlog_stagemetricsagg_expectation.csv @@ -1,5 +1,5 @@ -stageId,numTasks,Duration,diskBytesSpilled_sum,duration_sum,duration_max,duration_min,duration_avg,executorCPUTime_sum,executorDeserializeCPUTime_sum,executorDeserializeTime_sum,executorRunTime_sum,input_bytesRead_sum,input_recordsRead_sum,jvmGCTime_sum,memoryBytesSpilled_sum,output_bytesWritten_sum,output_recordsWritten_sum,peakExecutionMemory_max,resultSerializationTime_sum,resultSize_max,sr_fetchWaitTime_sum,sr_localBlocksFetched_sum,sr_localBytesRead_sum,sr_remoteBlocksFetched_sum,sr_remoteBytesRead_sum,sr_remoteBytesReadToDisk_sum,sr_totalBytesRead_sum,sw_bytesWritten_sum,sw_recordsWritten_sum,sw_writeTime_sum -0,6,1743,0,9518,1598,1580,1586.3,2512,1393,5309,4043,0,0,168,0,0,0,0,3,2951,0,0,0,0,0,0,0,40132250,1200,400 -1,6,1631,0,9434,1582,1568,1572.3,2406,1067,5273,3998,0,0,168,0,0,0,0,5,2951,0,0,0,0,0,0,0,40132258,1200,508 -2,200,688,0,7705,237,10,38.5,1660,1034,1474,5337,0,0,0,0,0,0,0,0,7359,0,2400,80264508,0,0,0,80264508,15400,200,93 -3,1,83,0,78,78,78,78.0,30,37,39,36,0,0,0,0,0,0,0,0,8075,0,200,15400,0,0,0,15400,0,0,0 +id,numTasks,duration,diskBytesSpilledSum,durationSum,durationMax,durationMin,durationAvg,executorCPUTimeSum,executorDeserializeCpuTimeSum,executorDeserializeTimeSum,executorRunTimeSum,inputBytesReadSum,inputBytesReadMax,inputRecordsReadSum,jvmGCTimeSum,memoryBytesSpilledSum,outputBytesWrittenSum,outputRecordsWrittenSum,peakExecutionMemoryMax,resultSerializationTimeSum,resultSizeMax,srFetchWaitTimeSum,srLocalBlocksFetchedSum,srcLocalBytesReadSum,srRemoteBlocksFetchSum,srRemoteBytesReadSum,srRemoteBytesReadToDiskSum,srTotalBytesReadSum,swBytesWrittenSum,swRecordsWrittenSum,swWriteTimeSum +0,6,1743,0,9518,1598,1580,1586.3,2512,1393,5309,4043,0,0,0,168,0,0,0,0,3,2951,0,0,0,0,0,0,0,40132250,1200,400 +1,6,1631,0,9434,1582,1568,1572.3,2406,1067,5273,3998,0,0,0,168,0,0,0,0,5,2951,0,0,0,0,0,0,0,40132258,1200,508 +2,200,688,0,7705,237,10,38.5,1660,1034,1474,5337,0,0,0,0,0,0,0,0,0,7359,0,2400,80264508,0,0,0,80264508,15400,200,93 +3,1,83,0,78,78,78,78.0,30,37,39,36,0,0,0,0,0,0,0,0,0,8075,0,200,15400,0,0,0,15400,0,0,0 From 42c5a19455c1e5722f9debdd5854e0b3bc6d737b Mon Sep 17 00:00:00 2001 From: Partho Sarthi Date: Wed, 22 Apr 2026 16:42:32 -0700 Subject: [PATCH 14/16] Bump copyright year to 2026 on modified Scala files Files touched by the tuning-signals relocation change had outdated copyright headers that the CI header check flagged as expired. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Partho Sarthi --- .../nvidia/spark/rapids/tool/analysis/AggRawMetricsResult.scala | 2 +- .../spark/rapids/tool/analysis/AppSparkMetricsAggTrait.scala | 2 +- .../spark/rapids/tool/analysis/AppSparkMetricsAnalyzer.scala | 2 +- .../spark/rapids/tool/analysis/util/TaskMetricsAccumRec.scala | 2 +- .../spark/rapids/tool/tuning/QualAppSummaryInfoProvider.scala | 2 +- .../com/nvidia/spark/rapids/tool/views/RawMetricProfView.scala | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AggRawMetricsResult.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AggRawMetricsResult.scala index 8a5f2d7ac..c460f3d5c 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AggRawMetricsResult.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AggRawMetricsResult.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AppSparkMetricsAggTrait.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AppSparkMetricsAggTrait.scala index a8f39a168..67b899962 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AppSparkMetricsAggTrait.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AppSparkMetricsAggTrait.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AppSparkMetricsAnalyzer.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AppSparkMetricsAnalyzer.scala index 1a4c3f12e..c1b13463a 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AppSparkMetricsAnalyzer.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/AppSparkMetricsAnalyzer.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/util/TaskMetricsAccumRec.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/util/TaskMetricsAccumRec.scala index 79174e3b0..ad92f98a4 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/util/TaskMetricsAccumRec.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/analysis/util/TaskMetricsAccumRec.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/tuning/QualAppSummaryInfoProvider.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/tuning/QualAppSummaryInfoProvider.scala index d6d478167..f69ffd298 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/tuning/QualAppSummaryInfoProvider.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/tuning/QualAppSummaryInfoProvider.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/RawMetricProfView.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/RawMetricProfView.scala index a55caaf9b..3ba76db7b 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/RawMetricProfView.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/RawMetricProfView.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 6aea65e173451f893c1a8819e8a1b22b8b84a1dd Mon Sep 17 00:00:00 2001 From: Partho Sarthi Date: Fri, 24 Apr 2026 14:35:10 -0700 Subject: [PATCH 15/16] Rename tuning_signals.csv -> app_level_recommendation_signals.csv (wide) Replaces the 2-row vertical tuning_signals.csv with a single-row wide app_level_recommendation_signals.csv. Addresses review feedback from @hirakendu: - Generic naming ("recommendation signals") so future qualx / tuneml features can co-locate their signals here. - Explicit granularity in the filename so per-SQL / per-stage siblings can be added later without ambiguity. Columns: appId numScanStagesWithGpuOom (profiling only, 0 for qual) numGpuShuffleStagesWithContainerOom (profiling only, 0 for qual) Comma-separated stage-ID lists are replaced by simple counts, which are aggregation-friendly and atomic. If downstream consumers need per-stage detail, the stage IDs are still available via failed_tasks.csv / failed_stages.csv cross-reference (the same data this file summarises). Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Partho Sarthi --- .../configs/reports/coreRawMetricsReport.yaml | 31 +++++++++------- .../profiling/ApplicationSummaryInfo.scala | 2 +- .../profiling/ProfileClassWarehouse.scala | 36 ++++++++++--------- .../rapids/tool/profiling/Profiler.scala | 9 ++--- .../rapids/tool/views/OutHeaderRegistry.scala | 4 +-- .../tool/views/QualRawReportGenerator.scala | 10 +++--- .../spark/rapids/tool/views/package.scala | 2 +- 7 files changed, 52 insertions(+), 42 deletions(-) diff --git a/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml b/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml index 4fadf8a3f..404bc7ede 100644 --- a/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml +++ b/core/src/main/resources/configs/reports/coreRawMetricsReport.yaml @@ -16,6 +16,7 @@ # Note that all the output files in this list should be considered internal-use only. # / +# ├── app_level_recommendation_signals.csv # ├── application_information.csv # ├── application_log_path_mapping.csv # ├── cluster_information.json @@ -40,7 +41,6 @@ # ├── stage_level_aggregated_task_metrics.csv # ├── stage_level_all_metrics.csv # ├── system_properties.csv -# ├── tuning_signals.csv # ├── unsupported_sql_plan.csv # ├── wholestagecodegen_mapping.csv # ├── write_operations.csv @@ -1439,20 +1439,27 @@ reportDefinitions: fileName: profile.log fileFormat: TXT scope: per-app - # TuningSignalProfileResult - - label: coreRawTuningSignalsCSV + # AppLevelRecommendationSignalsProfileResult + - label: coreRawAppLevelRecommendationSignalsCSV description: >- - Per-app GPU-OOM diagnostic signals used as AutoTuner inputs. Vertical key-value - layout; each emitted metric is one row. - fileName: tuning_signals.csv + Per-app derived signals that feed recommendation engines (AutoTuner, qualx, + etc.). Single row per app; wide layout, one column per signal. + GPU-only signals are 0 for qualification (CPU event logs). + fileName: app_level_recommendation_signals.csv scope: per-app columns: - - name: name + - name: appId dataType: String description: >- - Name of the tuning signal - - name: value - dataType: String + Application ID. + - name: numScanStagesWithGpuOom + dataType: Int + description: >- + Number of scan stages where failed tasks had GPU OOM errors + (GpuRetryOOM / GpuSplitAndRetryOOM / jni.GpuOOM). Profiling only. + - name: numGpuShuffleStagesWithContainerOom + dataType: Int description: >- - Signal value. Numeric signals are stored as their integer string form; - stage-ID signals are comma-separated lists (empty string when none). + Number of GPU shuffle stages (GpuShuffleExchangeExec) where YARN killed + the container due to container-level OOM (ExecutorLostFailure + exit 137). + Profiling only. diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala index c8db4cced..9e1d2ebc8 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ApplicationSummaryInfo.scala @@ -54,7 +54,7 @@ case class ApplicationSummaryInfo( sparkRapidsBuildInfo: Seq[SparkRapidsBuildInfoEvent], writeOpsInfo: Seq[WriteOpProfileResult], sqlPlanInfo: Seq[SQLPlanInfoProfileResult], - tuningSignals: Seq[TuningSignalProfileResult] = Seq.empty) + appLevelRecommendationSignals: Seq[AppLevelRecommendationSignalsProfileResult] = Seq.empty) trait AppInfoPropertyGetter { // returns all the properties (i.e., spark) diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala index bd4f74677..7aa3cca55 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala @@ -516,32 +516,34 @@ case class AppInfoProfileResults( } } -case class TuningSignalProfileResult( - name: String, - value: String) extends ProfileResult { +case class AppLevelRecommendationSignalsProfileResult( + appId: String, + numScanStagesWithGpuOom: Int, + numGpuShuffleStagesWithContainerOom: Int) extends ProfileResult { override def outputHeaders: Array[String] = { - OutHeaderRegistry.outputHeaders("TuningSignalProfileResult") + OutHeaderRegistry.outputHeaders("AppLevelRecommendationSignalsProfileResult") } - override def convertToSeq(): Array[String] = Array(name, value) + override def convertToSeq(): Array[String] = Array( + appId, + numScanStagesWithGpuOom.toString, + numGpuShuffleStagesWithContainerOom.toString) override def convertToCSVSeq(): Array[String] = Array( - StringUtils.reformatCSVString(name), - StringUtils.reformatCSVString(value)) + StringUtils.reformatCSVString(appId), + numScanStagesWithGpuOom.toString, + numGpuShuffleStagesWithContainerOom.toString) } -object TuningSignalProfileResult { - private def stageIdsToStr(stageIds: Set[Long]): String = { - if (stageIds.isEmpty) "" else stageIds.toSeq.sorted.mkString(",") - } - +object AppLevelRecommendationSignalsProfileResult { def build( + appId: String, scanStagesWithGpuOom: Set[Long], - gpuShuffleStagesWithContainerOom: Set[Long]): Seq[TuningSignalProfileResult] = Seq( - TuningSignalProfileResult("scanStagesWithGpuOom", - stageIdsToStr(scanStagesWithGpuOom)), - TuningSignalProfileResult("gpuShuffleStagesWithContainerOom", - stageIdsToStr(gpuShuffleStagesWithContainerOom))) + gpuShuffleStagesWithContainerOom: Set[Long]): Seq[AppLevelRecommendationSignalsProfileResult] = Seq( + AppLevelRecommendationSignalsProfileResult( + appId, + scanStagesWithGpuOom.size, + gpuShuffleStagesWithContainerOom.size)) } case class AppLogPathProfileResults( diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala index 915232f45..2b689397b 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala @@ -316,8 +316,8 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea val appInfo = collect.getAppInfo val appId = appInfo.headOption.flatMap(_.appId).getOrElse("") - val tuningSignals = TuningSignalProfileResult.build( - scanOomStages, gpuShuffleContainerOomStages) + val appLevelRecommendationSignals = AppLevelRecommendationSignalsProfileResult.build( + appId, scanOomStages, gpuShuffleContainerOomStages) logDebug(s"Time to collect Profiling Info [$appId]: ${endTime - startTime}.") val appInfoSummary = ApplicationSummaryInfo( @@ -350,7 +350,7 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea sparkRapidsBuildInfo = collect.getSparkRapidsInfo, writeOpsInfo = collect.getWriteOperationInfo, sqlPlanInfo = collect.getSQLPlanInfoTruncated, - tuningSignals = tuningSignals) + appLevelRecommendationSignals = appLevelRecommendationSignals) (appInfoSummary, DiagnosticSummaryInfo(analysis.stageDiagnostics, collect.getIODiagnosticMetrics)) } @@ -423,7 +423,8 @@ class Profiler(hadoopConf: Configuration, appArgs: ProfileArgs, enablePB: Boolea // writeOps are generated in only CSV format profileOutputWriter.writeCSVTable(ProfWriteOpsView.getLabel, app.writeOpsInfo) profileOutputWriter.writeCSVTable(TASK_SHUFFLE_SKEW, app.skewInfo) - profileOutputWriter.writeCSVTable(TUNING_SIGNALS, app.tuningSignals) + profileOutputWriter.writeCSVTable(APP_LEVEL_RECOMMENDATION_SIGNALS, + app.appLevelRecommendationSignals) profileOutputWriter.writeText("\n### C. Health Check###\n") profileOutputWriter.writeCSVTable(ProfFailedTaskView.getLabel, app.failedTasks) profileOutputWriter.writeTable(ProfFailedStageView.getLabel, app.failedStages) diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala index f09b4eec6..ac61a5d3b 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/OutHeaderRegistry.scala @@ -70,8 +70,8 @@ object OutHeaderRegistry { "AppInfoProfileResults" -> Array("appName", "appId", "attemptId", "sparkUser", "startTime", "endTime", "duration", "durationStr", "sparkRuntime", "sparkVersion", "pluginEnabled", "totalCoreSeconds"), - "TuningSignalProfileResult" -> - Array("name", "value"), + "AppLevelRecommendationSignalsProfileResult" -> + Array("appId", "numScanStagesWithGpuOom", "numGpuShuffleStagesWithContainerOom"), "AppLogPathProfileResults" -> Array("appName", "appId", "eventLogPath"), "FailedTaskProfileResults" -> diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala index e7002d198..c26a9b789 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/QualRawReportGenerator.scala @@ -17,7 +17,7 @@ package com.nvidia.spark.rapids.tool.views import com.nvidia.spark.rapids.tool.analysis.{AggRawMetricsResult, AppSQLPlanAnalyzer, QualSparkMetricsAggregator} -import com.nvidia.spark.rapids.tool.profiling.{DataSourceProfileResult, ProfileOutputWriter, ProfileResult, SQLAccumProfileResults, TuningSignalProfileResult} +import com.nvidia.spark.rapids.tool.profiling.{AppLevelRecommendationSignalsProfileResult, DataSourceProfileResult, ProfileOutputWriter, ProfileResult, SQLAccumProfileResults} import org.apache.spark.internal.Logging import org.apache.spark.sql.rapids.tool.qualification.QualificationAppInfo @@ -101,10 +101,10 @@ object QualRawReportGenerator extends Logging { constructLabelsMaps(aggRawMetrics).foreach { case (label, metrics) => pWriter.writeCSVTable(label, metrics) } - // GPU-only signals default to empty for qualification (CPU event logs) - val tuningSignals = TuningSignalProfileResult.build( - Set.empty[Long], Set.empty[Long]) - pWriter.writeCSVTable(TUNING_SIGNALS, tuningSignals) + // GPU-only signals default to 0 for qualification (CPU event logs) + val appLevelRecommendationSignals = AppLevelRecommendationSignalsProfileResult.build( + app.appId, Set.empty[Long], Set.empty[Long]) + pWriter.writeCSVTable(APP_LEVEL_RECOMMENDATION_SIGNALS, appLevelRecommendationSignals) pWriter.writeText("\n### C. Health Check###\n") pWriter.writeCSVTable(QualFailedTaskView.getLabel, QualFailedTaskView.getRawView(Seq(app))) pWriter.writeTable( diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/package.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/package.scala index 972bc6fd5..7251a4090 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/views/package.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/views/package.scala @@ -29,7 +29,7 @@ package object views { val SQL_DUR_LABEL = "SQL Duration and Executor CPU Time Percent" val SQL_MAX_INPUT_SIZE = "SQL Max Task Input Size" val STAGE_DIAGNOSTICS_LABEL = "Stage Level Diagnostic Metrics" - val TUNING_SIGNALS = "Tuning Signals" + val APP_LEVEL_RECOMMENDATION_SIGNALS = "App Level Recommendation Signals" val CLUSTER_INFORMATION_LABEL = "Cluster Information" val AGG_DESCRIPTION = Map( From 073601c066664a2afa11a94191e623d178c74eed Mon Sep 17 00:00:00 2001 From: Partho Sarthi Date: Fri, 24 Apr 2026 15:03:58 -0700 Subject: [PATCH 16/16] Fix scalastyle line-length violation in AppLevelRecommendationSignalsProfileResult.build Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Partho Sarthi --- .../spark/rapids/tool/profiling/ProfileClassWarehouse.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala index 7aa3cca55..b467db8c6 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/profiling/ProfileClassWarehouse.scala @@ -539,11 +539,13 @@ object AppLevelRecommendationSignalsProfileResult { def build( appId: String, scanStagesWithGpuOom: Set[Long], - gpuShuffleStagesWithContainerOom: Set[Long]): Seq[AppLevelRecommendationSignalsProfileResult] = Seq( - AppLevelRecommendationSignalsProfileResult( + gpuShuffleStagesWithContainerOom: Set[Long]) + : Seq[AppLevelRecommendationSignalsProfileResult] = { + Seq(AppLevelRecommendationSignalsProfileResult( appId, scanStagesWithGpuOom.size, gpuShuffleStagesWithContainerOom.size)) + } } case class AppLogPathProfileResults(