-
Notifications
You must be signed in to change notification settings - Fork 49
Drop AutoTuner recommendation for concurrentGpuTasks for apps using plugin >= 25.06 #2090
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1600,6 +1600,75 @@ class ProfilingAutoTunerSuite extends ProfilingAutoTunerSuiteBase { | |
| compareOutput(expectedResults, autoTunerOutput) | ||
| } | ||
|
|
||
// Helper that runs the AutoTuner without pre-setting
// `spark.rapids.sql.concurrentGpuTasks` so the default recommendation path is
// exercised. Returns the AutoTuner output string.
//
// @param rapidsJars    plugin jars reported by the mock app info provider;
//                      the jar name carries the plugin version under test
// @param enforcedProps Spark properties enforced via the target cluster info
// @param preserveProps Spark properties preserved via the target cluster info
private def runConcurrentGpuTasksScenario(
    rapidsJars: Seq[String],
    enforcedProps: Map[String, String] = Map.empty,
    preserveProps: List[String] = List.empty): String = {
  // Executor/GPU tuning properties as they would appear in the event log.
  val logProps = mutable.LinkedHashMap(
    "spark.executor.cores" -> "16",
    "spark.executor.memory" -> "122880MiB",
    "spark.executor.memoryOverhead" -> "8396m",
    "spark.rapids.memory.pinnedPool.size" -> "4096m",
    "spark.rapids.shuffle.multiThreaded.reader.threads" -> "16",
    "spark.rapids.shuffle.multiThreaded.writer.threads" -> "16",
    "spark.rapids.sql.multiThreadedRead.numThreads" -> "20",
    "spark.shuffle.manager" ->
      s"com.nvidia.spark.rapids.spark$testSmVersion.RapidsShuffleManager",
    "spark.sql.files.maxPartitionBytes" -> "512m",
    "spark.task.resource.gpu.amount" -> "0.001")
  val combinedProps = defaultDataprocProps ++ logProps
  // Only build a target-cluster definition when the caller supplied enforced
  // or preserved properties; otherwise use the plain Dataproc platform.
  val maybeTargetCluster =
    if (enforcedProps.isEmpty && preserveProps.isEmpty) {
      None
    } else {
      Some(ToolTestUtils.buildTargetClusterInfo(
        enforcedSparkProperties = enforcedProps,
        preserveSparkProperties = preserveProps))
    }
  val platform = maybeTargetCluster match {
    case Some(clusterInfo) =>
      PlatformFactory.createInstance(PlatformNames.DATAPROC, Some(clusterInfo))
    case None =>
      PlatformFactory.createInstance(PlatformNames.DATAPROC)
  }
  configureEventLogClusterInfoForTest(
    platform,
    numCores = 32,
    numWorkers = 4,
    gpuCount = 2,
    sparkProperties = combinedProps.toMap
  )
  val tuner = buildAutoTunerForTests(
    getGpuAppMockInfoProvider(propsFromLog = combinedProps, rapidsJars = rapidsJars),
    platform)
  val (recommendations, comments) = tuner.getRecommendedProperties()
  Profiler.getAutoTunerResultsAsString(recommendations, comments)
}
|
|
||
test("AutoTuner drops concurrentGpuTasks recommendation for plugin >= 25.06") {
  // From plugin 25.06 the property is managed by the plugin itself, so the
  // AutoTuner output must not mention it at all.
  val tunerOutput = runConcurrentGpuTasksScenario(Seq("rapids-4-spark_2.12-25.06.0.jar"))
  val mentionsProp = tunerOutput.contains("spark.rapids.sql.concurrentGpuTasks")
  assert(!mentionsProp,
    s"Expected no concurrentGpuTasks recommendation/comment, got:\n$tunerOutput")
}
|
|
||
test("AutoTuner keeps concurrentGpuTasks recommendation for plugin < 25.06") {
  // Pre-25.06 plugins still rely on the AutoTuner to recommend concurrency.
  val tunerOutput = runConcurrentGpuTasksScenario(Seq("rapids-4-spark_2.12-25.04.0.jar"))
  assert(tunerOutput.contains("spark.rapids.sql.concurrentGpuTasks"),
    s"Expected concurrentGpuTasks to be present, got:\n$tunerOutput")
}
|
|
||
test("AutoTuner keeps concurrentGpuTasks recommendation when no plugin jar version found") {
  // With no jar to derive a version from, the conservative behavior is to
  // keep emitting the recommendation.
  val tunerOutput = runConcurrentGpuTasksScenario(Seq.empty)
  assert(tunerOutput.contains("spark.rapids.sql.concurrentGpuTasks"),
    s"Expected concurrentGpuTasks to be present, got:\n$tunerOutput")
}
|
|
||
test("Target cluster enforced concurrentGpuTasks overrides plugin >= 25.06 drop") {
  // An explicitly enforced value from the target cluster wins over the
  // version-based drop, and the enforced value must appear verbatim.
  val tunerOutput = runConcurrentGpuTasksScenario(
    Seq("rapids-4-spark_2.12-25.08.0.jar"),
    enforcedProps = Map("spark.rapids.sql.concurrentGpuTasks" -> "4"))
  assert(tunerOutput.contains("spark.rapids.sql.concurrentGpuTasks=4"),
    s"Expected enforced concurrentGpuTasks=4 to be present, got:\n$tunerOutput")
}
|
|
||
| test("No recommendation when the jar pluginJar is up-to-date") { | ||
|
Comment on lines
+1667
to
1672
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
test("Target cluster preserve concurrentGpuTasks overrides plugin >= 25.06 drop") {
  // A preserved property from the target cluster must survive the
  // version-based drop and show up in the AutoTuner output.
  val tunerOutput = runConcurrentGpuTasksScenario(
    Seq("rapids-4-spark_2.12-25.08.0.jar"),
    preserveProps = List("spark.rapids.sql.concurrentGpuTasks"))
  assert(tunerOutput.contains("spark.rapids.sql.concurrentGpuTasks"),
    s"Expected preserved concurrentGpuTasks to be present, got:\n$tunerOutput")
}
||
| // 1. Pull the latest release from mvn. | ||
| // 2. The Autotuner finds that the jar version is latest. No comments should be added | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`compareVersions` returns `0` on failure, silently skipping the recommendation. `ToolUtils.compareVersions` catches any exception and returns `0` (treating the two versions as equal). Because the check is `>= 0`, a comparison failure is interpreted as "version is at the threshold", and the recommendation is incorrectly dropped. The `pluginVersionAutoConcurrentGpuTasks` constant is well-formed, so this is very unlikely in practice, but a defensive fallback would make the intent explicit and prevent silent misbehavior on unusual version strings.