Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
a3d9d75
Enhance profiling metrics with new OOM detection
parthosa Apr 8, 2026
091ae29
Update copyright years in ProfileArgs.scala and OutHeaderRegistry.sca…
parthosa Apr 8, 2026
e8a6830
Update data types and improve OOM error handling in profiling metrics
parthosa Apr 8, 2026
412b679
Extract tuning metrics into separate app_tuning_metrics.csv
parthosa Apr 16, 2026
7531a6b
Rename output file to application_tuning_metrics.csv, default values …
parthosa Apr 22, 2026
e41bd17
Remove redundant comments in ApplicationSummaryInfo.scala and QualRaw…
parthosa Apr 22, 2026
4c4f0dd
Merge remote-tracking branch 'origin' into rapids-tools-2060
parthosa Apr 22, 2026
9652fc3
Update copyright year in package.scala to 2024-2026
parthosa Apr 22, 2026
30ccfc3
Revert --csv default to opt-in
parthosa Apr 22, 2026
baa8a80
Restore ProfileArgs.scala to origin/dev (no copyright bump)
parthosa Apr 22, 2026
aec9fd2
Restructure tuning metrics as vertical tuning_signals.csv
parthosa Apr 22, 2026
59b1167
Rename tuning_signals column metricName -> name
parthosa Apr 22, 2026
8ddd520
Relocate size signals to aggregated CSVs; keep OOM-only tuning_signal…
parthosa Apr 22, 2026
cf9083c
Regenerate AnalysisSuite golden CSVs for new input_bytesRead_max column
parthosa Apr 22, 2026
42c5a19
Bump copyright year to 2026 on modified Scala files
parthosa Apr 22, 2026
6aea65e
Rename tuning_signals.csv -> app_level_recommendation_signals.csv (wide)
parthosa Apr 24, 2026
073601c
Fix scalastyle line-length violation in AppLevelRecommendationSignals…
parthosa Apr 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2025, NVIDIA CORPORATION.
# Copyright (c) 2025-2026, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -16,6 +16,7 @@
# Note that all the output files in this list should be considered internal-use only.

# <application-id>/
# ├── app_level_recommendation_signals.csv
# ├── application_information.csv
# ├── application_log_path_mapping.csv
# ├── cluster_information.json
Expand Down Expand Up @@ -372,6 +373,10 @@ reportDefinitions:
dataType: Long
description: >-
TBD
- name: input_bytesRead_max
dataType: Long
description: >-
Maximum per-task input bytes read within the aggregation unit
- name: input_recordsRead_sum
dataType: Long
description: >-
Expand Down Expand Up @@ -569,6 +574,10 @@ reportDefinitions:
dataType: Long
description: >-
TBD
- name: input_bytesRead_max
dataType: Long
description: >-
Maximum per-task input bytes read within the aggregation unit
- name: input_recordsRead_sum
dataType: Long
description: >-
Expand Down Expand Up @@ -782,6 +791,10 @@ reportDefinitions:
dataType: Long
description: >-
TBD
- name: input_bytesRead_max
dataType: Long
description: >-
Maximum per-task input bytes read within the aggregation unit
- name: input_recordsRead_sum
dataType: Long
description: >-
Expand Down Expand Up @@ -1426,3 +1439,27 @@ reportDefinitions:
fileName: profile.log
fileFormat: TXT
scope: per-app
# AppLevelRecommendationSignalsProfileResult
- label: coreRawAppLevelRecommendationSignalsCSV
description: >-
Per-app derived signals that feed recommendation engines (AutoTuner, qualx,
etc.). Single row per app; wide layout, one column per signal.
GPU-only signals are 0 for qualification (CPU event logs).
fileName: app_level_recommendation_signals.csv
scope: per-app
columns:
- name: appId
dataType: String
description: >-
Application ID.
- name: numScanStagesWithGpuOom
dataType: Int
description: >-
Number of scan stages where failed tasks had GPU OOM errors
(GpuRetryOOM / GpuSplitAndRetryOOM / jni.GpuOOM). Profiling only.
- name: numGpuShuffleStagesWithContainerOom
dataType: Int
description: >-
Number of GPU shuffle stages (GpuShuffleExchangeExec) where YARN killed
the container due to container-level OOM (ExecutorLostFailure + exit 137).
Profiling only.
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -16,7 +16,7 @@

package com.nvidia.spark.rapids.tool.analysis

import com.nvidia.spark.rapids.tool.profiling.{IOAnalysisProfileResult, JobAggTaskMetricsProfileResult, ShuffleSkewProfileResult, SQLDurationExecutorTimeProfileResult, SQLMaxTaskInputSizes, SQLTaskAggMetricsProfileResult, StageAggTaskMetricsProfileResult, StageDiagnosticResult}
import com.nvidia.spark.rapids.tool.profiling.{IOAnalysisProfileResult, JobAggTaskMetricsProfileResult, ShuffleSkewProfileResult, SQLDurationExecutorTimeProfileResult, SQLTaskAggMetricsProfileResult, StageAggTaskMetricsProfileResult, StageDiagnosticResult}

/**
* The result of the aggregation of the raw metrics. It contains the aggregated metrics for an
Expand All @@ -31,7 +31,6 @@ import com.nvidia.spark.rapids.tool.profiling.{IOAnalysisProfileResult, JobAggTa
* @param sqlAggs the aggregated Spark metrics for SQLs
* @param ioAggs lists the SQLs along their IO metrics
* @param sqlDurAggs the aggregated duration and CPU time for SQLs
* @param maxTaskInputSizes a sequence of SQLMaxTaskInputSizes that contains the maximum input size
* @param stageDiagnostics the stage level Spark metrics for diagnostic purposes
*/
case class AggRawMetricsResult(
Expand All @@ -41,5 +40,4 @@ case class AggRawMetricsResult(
sqlAggs: Seq[SQLTaskAggMetricsProfileResult],
ioAggs: Seq[IOAnalysisProfileResult],
sqlDurAggs: Seq[SQLDurationExecutorTimeProfileResult],
maxTaskInputSizes: Seq[SQLMaxTaskInputSizes],
stageDiagnostics: Seq[StageDiagnosticResult])
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
* Copyright (c) 2024-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -45,7 +45,6 @@ trait AppSparkMetricsAggTrait extends AppIndexMapperTrait {
sqlMetricsAgg,
analysisObj.aggregateIOMetricsBySql(sqlMetricsAgg),
analysisObj.aggregateDurationAndCPUTimeBySql(index),
Seq(analysisObj.maxTaskInputSizeBytesPerSQL(index)),
analysisObj.aggregateDiagnosticMetricsByStage(index, sqlAnalyzer))
}

Expand All @@ -67,7 +66,6 @@ trait AppSparkMetricsAggTrait extends AppIndexMapperTrait {
agg1.sqlAggs ++ agg2.sqlAggs,
agg1.ioAggs ++ agg2.ioAggs,
agg1.sqlDurAggs ++ agg2.sqlDurAggs,
agg1.maxTaskInputSizes ++ agg2.maxTaskInputSizes,
agg1.stageDiagnostics ++ agg2.stageDiagnostics)
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
* Copyright (c) 2024-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -106,6 +106,7 @@ class AppSparkMetricsAnalyzer(app: AppBase) extends AppAnalysisBase(app) {
perJobRec.executorDeserializeTimeSum,
perJobRec.executorRunTimeSum,
perJobRec.inputBytesReadSum,
perJobRec.inputBytesReadMax,
perJobRec.inputRecordsReadSum,
perJobRec.jvmGCTimeSum,
perJobRec.memoryBytesSpilledSum,
Expand Down Expand Up @@ -203,6 +204,7 @@ class AppSparkMetricsAnalyzer(app: AppBase) extends AppAnalysisBase(app) {
preSqlRec.executorDeserializeTimeSum,
preSqlRec.executorRunTimeSum,
preSqlRec.inputBytesReadSum,
preSqlRec.inputBytesReadMax,
preSqlRec.inputBytesReadAvg,
preSqlRec.inputRecordsReadSum,
preSqlRec.jvmGCTimeSum,
Expand Down Expand Up @@ -252,32 +254,6 @@ class AppSparkMetricsAnalyzer(app: AppBase) extends AppAnalysisBase(app) {
}.toSeq
}

/**
* Find the maximum task input size
* @param index App index (used by the profiler tool)
* @return a single SQLMaxTaskInputSizes record that contains the maximum value. If none, it will
* be 0L
*/
def maxTaskInputSizeBytesPerSQL(index: Int): SQLMaxTaskInputSizes = {
// TODO: We should keep maxInputSize as a field in the stageAggregate to avoid doing an
// extra path on the tasks
val maxOfSqls = app.sqlIdToStages.map { case (_, stageIds) =>
// TODO: Should we only consider successful tasks?
val tasksInSQL = app.taskManager.getTasksByStageIds(stageIds)
if (tasksInSQL.isEmpty) {
0L
} else {
tasksInSQL.map(_.input_bytesRead).max
}
}
val maxVal = if (maxOfSqls.nonEmpty) {
maxOfSqls.max
} else {
0L
}
SQLMaxTaskInputSizes(app.appId, maxVal)
}

/**
* Aggregates the duration and CPU time (milliseconds) by SQL
* @param index App index (used by the profiler tool)
Expand Down Expand Up @@ -398,6 +374,7 @@ class AppSparkMetricsAnalyzer(app: AppBase) extends AppAnalysisBase(app) {
perStageRec.executorDeserializeTimeSum,
perStageRec.executorRunTimeSum,
perStageRec.inputBytesReadSum,
perStageRec.inputBytesReadMax,
perStageRec.inputRecordsReadSum,
perStageRec.jvmGCTimeSum,
perStageRec.memoryBytesSpilledSum,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -38,6 +38,7 @@ class TaskMetricsAccumRec {
var executorDeserializeTimeSum: Long = 0
var executorRunTimeSum: Long = 0
var inputBytesReadSum: Long = 0
var inputBytesReadMax: Long = Long.MinValue
var inputRecordsReadSum: Long = 0
var jvmGCTimeSum: Long = 0
var memoryBytesSpilledSum: Long = 0
Expand Down Expand Up @@ -70,6 +71,7 @@ class TaskMetricsAccumRec {
def resetFields(): Unit = {
durationMax = 0
durationMin = 0
inputBytesReadMax = 0
peakExecutionMemoryMax = 0
resultSizeMax = 0
}
Expand Down Expand Up @@ -102,6 +104,7 @@ class TaskMetricsAccumRec {
swWriteTimeSum += rec.sw_writeTime
// Max fields
durationMax = math.max(durationMax, rec.duration)
inputBytesReadMax = math.max(inputBytesReadMax, rec.input_bytesRead)
peakExecutionMemoryMax = math.max(peakExecutionMemoryMax, rec.peakExecutionMemory)
resultSizeMax = math.max(resultSizeMax, rec.resultSize)
// Min Fields
Expand Down Expand Up @@ -136,6 +139,7 @@ class TaskMetricsAccumRec {
swWriteTimeSum += rec.swWriteTimeSum
// Max
durationMax = math.max(durationMax, rec.durationMax)
inputBytesReadMax = math.max(inputBytesReadMax, rec.inputBytesReadMax)
peakExecutionMemoryMax = math.max(peakExecutionMemoryMax, rec.peakExecutionMemoryMax)
resultSizeMax = math.max(resultSizeMax, rec.resultSizeMax)
// Min
Expand Down
Loading