From c0431b4f57a8290769971ad498e09c189f180796 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 17:03:51 -0700 Subject: [PATCH 01/29] docs(spec): lightweight event log runtime detector (issue #2082) Add design spec for a Python-side detector that classifies a Spark event log as SPARK / SPARK_RAPIDS / PHOTON / AURON without running the full qualification or profiling pipeline. Scoped to single-app inputs (files and rolling directories) for V1, targeting aether's per-job pre-flight decision. Co-Authored-By: Claude Opus 4.7 Signed-off-by: Sayed Bilal Bari --- ...-04-22-eventlog-runtime-detector-design.md | 198 ++++++++++++++++++ 1 file changed, 198 insertions(+) create mode 100644 docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md diff --git a/docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md b/docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md new file mode 100644 index 000000000..ff3bf127a --- /dev/null +++ b/docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md @@ -0,0 +1,198 @@ +# Lightweight Event Log Runtime Detector — Design + +- **Issue:** [NVIDIA/spark-rapids-tools#2082](https://github.com/NVIDIA/spark-rapids-tools/issues/2082) +- **Branch:** `sbari-issue-2082` +- **Date:** 2026-04-22 +- **Status:** Design approved, ready for implementation plan + +## 1. Problem + +Today, determining whether a Spark event log came from a CPU or GPU run requires running the full qualification or profiling tool. That is expensive and wrong way to use the tools for the pre-flight decision of *which* tool to run. + +aether (a consumer of spark-rapids-tools) needs to make this decision per job before invoking either tool. Its current workaround is running one tool, reading `application_information.csv`'s `sparkRuntime` column, and picking the right tool — meaning every detection costs a full qualification run. + +## 2. Goal + +Expose a lightweight Python function that inspects the first handful of events in a Spark event log and returns the runtime classification (`SPARK`, `SPARK_RAPIDS`, `PHOTON`, `AURON`) without invoking the Scala tools pipeline. + +## 3. Non-goals (V1) + +- Replacing the Scala event log reader. The Scala reader handles multi-app directories, wildcards, comma-separated lists, malformed logs, and every CSP quirk. The Python detector is a deliberately narrow shortcut for the common single-app case. +- Public user-facing CLI. This is a dev/library-scope feature, not part of the supported `spark_rapids_user_tools` CLI surface. +- Writing any output files. Result is a Python object returned in-process. + +## 4. Consumers + +- **Primary:** aether (`aether-services`). aether already consumes `spark_rapids_tools.cmdli.tools_cli.ToolsCLI` as a library. It will import `detect_spark_runtime` directly, call it before deciding to run qualification or profiling, and branch on the `spark_runtime` value — which matches the strings it already stores in `JobRun.spark_runtime`. +- **Secondary:** internal scripting / debugging. Devs can `python -c "..."` to inspect a log. + +## 5. Public API + +Module: `user_tools/src/spark_rapids_tools/tools/eventlog_detector.py`. 
+ +```python +from enum import Enum +from dataclasses import dataclass +from typing import Optional, Union +from spark_rapids_tools.storagelib.cspfs import BoundedCspPath + +class SparkRuntime(str, Enum): + SPARK = "SPARK" + SPARK_RAPIDS = "SPARK_RAPIDS" + PHOTON = "PHOTON" + AURON = "AURON" + +@dataclass(frozen=True) +class RuntimeInfo: + spark_runtime: SparkRuntime + app_id: Optional[str] # None if SparkListenerApplicationStart not seen + app_name: Optional[str] + spark_version: Optional[str] # from SparkListenerLogStart + event_log_path: str # concrete file actually read + source_path: str # caller input (preserves rolling-dir input) + +def detect_spark_runtime( + event_log: Union[str, BoundedCspPath], + *, + max_events_scanned: int = 1000, +) -> RuntimeInfo: ... +``` + +Enum string values match the Scala `SparkRuntime` enum exactly so aether's existing string comparisons (`JobRun.spark_runtime != 'SPARK_RAPIDS'`) work unchanged. + +## 6. Scope of input shapes (V1) + +| Input shape | Supported | Notes | +| --- | --- | --- | +| Single file (plain or compressed) | ✅ | Primary case | +| Spark native rolling dir (`eventlog_v2_*`) | ✅ | Reads first `events_1_*` chunk only | +| Databricks rolling dir (`eventlog`, `eventlog-*` files) | ✅ | Reads first file | +| Generic directory of independent logs | ❌ | Raises `UnsupportedEventLogShapeError` | +| Wildcard path | ❌ | Same | +| Comma-separated list | ❌ | Same | + +Rationale: aether's `is_multi_event_log_input()` already distinguishes single-app from multi-app inputs. Pre-flight detection is most useful for the single-app case; multi-app inputs don't bind to a single `aether_job_id` anyway. + +Compression codecs: + +| Extension | Dependency | Availability | +| --- | --- | --- | +| none, `.inprogress` | stdlib | always | +| `.gz` | stdlib `gzip` | always | +| `.zstd`, `.zst` | `zstandard` | **non-optional new dep** — existing test fixtures are zstd, common Spark default | +| `.lz4` | `lz4` | optional extra `spark-rapids-tools[compression]` | +| `.snappy` | `cramjam` | optional extra `spark-rapids-tools[compression]` | + +Missing optional codec lib → `UnsupportedCompressionError` with actionable install message. + +## 7. Module structure + +Four small, independently testable components inside `eventlog_detector.py` (plus a markers file): + +### 7.1 `_resolve_event_log_file(path) -> (source, resolved)` + +Path resolver. Turns user input into the concrete file to read. + +- File input → return as-is. +- Directory input → pattern match against Spark-native and Databricks rolling layouts; pick first chunk. +- Anything else → raise `UnsupportedEventLogShapeError`. +- Pattern matching uses `BoundedCspPath.list_dir()` semantics; no file reads for the shape decision. + +### 7.2 `_open_event_log_stream(resolved_path) -> Iterator[str]` + +Stream opener. Opens the file with the right codec and yields decoded text lines. + +- Codec chosen by extension. +- Codec libs imported lazily inside this function — missing lib raises `UnsupportedCompressionError`, does not fail the module import. +- Cloud paths handled via `BoundedCspPath.open()`; codec streams layered on top. + +### 7.3 `_scan_events(lines, max_events) -> _ScanResult` + +Event scanner. Parses lines as JSON, collects three events, stops. + +- Tracks: `SparkListenerLogStart` (→ Spark version), `SparkListenerApplicationStart` (→ appId, appName), `SparkListenerEnvironmentUpdate` (→ `Spark Properties` dict). 
+- Stops as soon as `SparkListenerEnvironmentUpdate` is seen AND either `SparkListenerApplicationStart` is seen or the env-update came after it (i.e., we have both). In normal Spark logs both land within the first ~20 events. +- Skips malformed JSON lines (Spark tolerates trailing partial lines in live logs). +- Hard cap at `max_events_scanned` (default 1000). Purely defensive. +- On EOF or cap with env-update seen: returns `_ScanResult` populated with whatever was found (app-start may be `None`). On EOF or cap with env-update NOT seen: raises `EventLogIncompleteError` — classification is unknown. + +### 7.4 `_classify_runtime(spark_properties) -> SparkRuntime` + +Runtime classifier. Pure function over the properties dict. See section 8 for the exact rules. + +### 7.5 `eventlog_detector_markers.py` + +Single source of truth for keys/regex/substrings. Every constant has a `# Scala source: :` comment next to it. + +## 8. Classification rules (synced with Scala) + +Priority order evaluated in Python: **PHOTON > AURON > SPARK_RAPIDS > SPARK**. This is a deliberate, deterministic Python choice; Scala's `pluginMap.values.filter(...).find(...)` iterates a `HashMap` and is non-deterministic when multiple plugins match (they don't overlap in practice). + +**SPARK_RAPIDS** — `ToolUtils.isPluginEnabled` (`core/src/main/scala/org/apache/spark/sql/rapids/tool/ToolUtils.scala:114-121`): + +- `spark.plugins` contains substring `com.nvidia.spark.SQLPlugin` +- AND `spark.rapids.sql.enabled` parses as boolean true (default true if missing or unparseable) + +**AURON** — `AuronParseHelper.eval` (`core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/auron/AuronParseHelper.scala:149-172`): + +- `spark.sql.extensions` fullmatches regex `.*AuronSparkSessionExtension.*` +- AND `spark.auron.enabled` trimmed equals `"true"` case-insensitively (default `"true"` if missing) + +**PHOTON** — requires Databricks precondition + Photon marker: + +- Databricks precondition (`DBConditionImpl.eval`, `core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/db/DBPlugin.scala:45-58`): all three of `spark.databricks.clusterUsageTags.clusterAllTags`, `.clusterId`, `.clusterName` are non-empty. +- AND Photon marker (`PhotonParseHelper.extensionRegxMap`, `core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/db/DatabricksParseHelper.scala:146-151`): any one of these fullmatches: + - `spark.databricks.clusterUsageTags.sparkVersion` ~ `.*-photon-.*` + - `spark.databricks.clusterUsageTags.effectiveSparkVersion` ~ `.*-photon-.*` + - `spark.databricks.clusterUsageTags.sparkImageLabel` ~ `.*-photon-.*` + - `spark.databricks.clusterUsageTags.runtimeEngine` ~ `PHOTON` + +**SPARK** — none of the above. + +### Notes on fidelity + +- Scala's `String.matches(regex)` requires whole-string match → Python uses `re.fullmatch`. +- Plugin-based runtime beats gpuMode: if a log has both `com.nvidia.spark.SQLPlugin` and Photon markers, Scala returns `PHOTON`. Python priority preserves this. +- `spark.rapids.sql.enabled` default-to-true semantics match `ToolUtils.isPluginEnabled` exactly (`Try { ... }.getOrElse(true)`). + +## 9. 
Error model + +All errors subclass `EventLogDetectionError`: + +| Exception | Meaning | aether action | +| --- | --- | --- | +| `UnsupportedEventLogShapeError` | Input is multi-app / wildcard / comma list | Fall back to running the full tool | +| `UnsupportedCompressionError` | Codec lib missing | Install the extra; surface message to dev | +| `EventLogReadError` | I/O failure (wraps underlying error) | Retry or fall back | +| `EventLogIncompleteError` | Env-update event not found in scanned range | Fall back — classification not known | + +Typed exceptions (not `Optional[RuntimeInfo]`) so aether can distinguish "log is CPU" from "we couldn't tell." + +## 10. Testing + +### 10.1 Unit tests — `tests/spark_rapids_tools_ut/tools/test_eventlog_detector.py` + +- Path resolver: plain file, Spark rolling dir (multi-chunk), Databricks rolling dir, multi-app dir raises, wildcard raises. +- Stream opener: each codec including missing-lib path raising. +- Event scanner: truncated log raises, malformed JSON lines skipped, `max_events_scanned` cap. +- Classifier: each of the four runtime outcomes, priority when multiple markers coexist, `spark.rapids.sql.enabled=false` override. + +### 10.2 Parity test — `tests/spark_rapids_tools_ut/tools/test_eventlog_detector_parity.py` + +Runs `detect_spark_runtime` against existing Scala fixtures under `core/src/test/resources/spark-events-*`. Expected labels come from existing Scala test expectations (e.g., `eventlog-gpu-dsv2.zstd` → `SPARK_RAPIDS`). Fails CI if Python disagrees. + +### 10.3 Follow-up (not V1) + +A Scala-side test that synthesizes property maps, exercises each plugin, and exports a JSON fixture of `(properties, expected_runtime)` pairs. The Python parity test loads this JSON and replays classification. Catches Scala-side marker changes automatically. Deferred to a separate PR after V1 lands. + +## 11. Rollout + +- Single PR, `[FEA]` tag, references issue #2082. +- Additive only — no breaking changes. +- aether integration lands as a separate PR in `aether-services` after this merges. + +## 12. Open items for implementation plan + +- Exact location / filename of the short user-facing doc (match whatever existing convention `user_tools/docs/` uses). +- Whether the `zstandard` dep addition warrants an entry in `RELEASE.md` or similar. +- Whether we skip the `[compression]` extra in V1 and only support plain + gz + zstd (keeping V1 even smaller). Current plan: ship the extra but make it opt-in. \ No newline at end of file From 2877e6f26e228534aef70609cb717406c46fd4c2 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 17:10:24 -0700 Subject: [PATCH 02/29] docs(spec): address review findings on event log runtime detector spec - Extend scanner scope to include SparkListenerJobStart and SparkListenerSQLExecutionStart events, merging job-level properties and modifiedConfigs so Python classification stays in sync with the Scala pipeline (closes divergence where spark.plugins / Databricks tags are set after env-update). - Specify Databricks rolling-dir file ordering: mirror Scala's date parse with bare `eventlog` treated as latest and earliest chunk read first (the one carrying ApplicationStart). - Correct storage API references: CspFs.list_all_files / CspPath.open_input_stream. - Add `.lzf` to supported codecs (matches Scala and existing on-prem docs); use correct distribution name `spark-rapids-user-tools` and flag the `[compression]` extra as a new addition introduced by this feature. 
Co-Authored-By: Claude Opus 4.7 Signed-off-by: Sayed Bilal Bari --- ...-04-22-eventlog-runtime-detector-design.md | 77 ++++++++++++++----- 1 file changed, 59 insertions(+), 18 deletions(-) diff --git a/docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md b/docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md index ff3bf127a..f44f45558 100644 --- a/docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md +++ b/docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md @@ -65,25 +65,26 @@ Enum string values match the Scala `SparkRuntime` enum exactly so aether's exist | Input shape | Supported | Notes | | --- | --- | --- | | Single file (plain or compressed) | ✅ | Primary case | -| Spark native rolling dir (`eventlog_v2_*`) | ✅ | Reads first `events_1_*` chunk only | -| Databricks rolling dir (`eventlog`, `eventlog-*` files) | ✅ | Reads first file | +| Spark native rolling dir (`eventlog_v2_*`) | ✅ | Reads first (earliest) `events_1_*` chunk — it holds `ApplicationStart` + `EnvironmentUpdate` | +| Databricks rolling dir (`eventlog`, `eventlog-*` files) | ✅ | Sort mirroring Scala's `DatabricksRollingEventLogFilesFileReader` (see 7.1), read the earliest | | Generic directory of independent logs | ❌ | Raises `UnsupportedEventLogShapeError` | | Wildcard path | ❌ | Same | | Comma-separated list | ❌ | Same | Rationale: aether's `is_multi_event_log_input()` already distinguishes single-app from multi-app inputs. Pre-flight detection is most useful for the single-app case; multi-app inputs don't bind to a single `aether_job_id` anyway. -Compression codecs: +Compression codecs (Spark-native set per `EventLogPathProcessor.SPARK_SHORT_COMPRESSION_CODEC_NAMES` — `lz4, lzf, snappy, zstd`, plus `gz`): | Extension | Dependency | Availability | | --- | --- | --- | | none, `.inprogress` | stdlib | always | | `.gz` | stdlib `gzip` | always | -| `.zstd`, `.zst` | `zstandard` | **non-optional new dep** — existing test fixtures are zstd, common Spark default | -| `.lz4` | `lz4` | optional extra `spark-rapids-tools[compression]` | -| `.snappy` | `cramjam` | optional extra `spark-rapids-tools[compression]` | +| `.zstd`, `.zst` | `zstandard` | **non-optional new dep** — existing test fixtures are zstd; default in several Spark deployments | +| `.lz4` | `lz4` | optional extra `spark-rapids-user-tools[compression]` (new extra, introduced in this PR) | +| `.lzf` | `python-lzf` (or equivalent — plan will pin the choice) | optional extra `spark-rapids-user-tools[compression]` | +| `.snappy` | `cramjam` | optional extra `spark-rapids-user-tools[compression]` | -Missing optional codec lib → `UnsupportedCompressionError` with actionable install message. +Missing optional codec lib → `UnsupportedCompressionError` with actionable install message pointing at the correct PyPI name `spark-rapids-user-tools[compression]`. The `[compression]` extra is introduced by this feature; it does not exist in `user_tools/pyproject.toml` today. ## 7. Module structure @@ -94,9 +95,11 @@ Four small, independently testable components inside `eventlog_detector.py` (plu Path resolver. Turns user input into the concrete file to read. - File input → return as-is. -- Directory input → pattern match against Spark-native and Databricks rolling layouts; pick first chunk. -- Anything else → raise `UnsupportedEventLogShapeError`. -- Pattern matching uses `BoundedCspPath.list_dir()` semantics; no file reads for the shape decision. 
+- Directory input → pattern match against rolling layouts: + - **Spark native** (`eventlog_v2_*` directory): list children via `CspFs.list_all_files(dir_path)`; pick the lexicographically earliest file whose name matches `events_1_*` (chunk index 1, which carries `ApplicationStart` + `EnvironmentUpdate`). + - **Databricks** (files named `eventlog` or `eventlog---[.codec]`): list children via `CspFs.list_all_files(dir_path)`; sort using the same rule as Scala's `DatabricksRollingEventLogFilesFileReader` (`core/src/main/scala/com/nvidia/spark/rapids/tool/EventLogPathProcessor.scala:458-478, 496-500`) — parsed `LocalDateTime` from the filename, with bare `eventlog` treated as *latest* (i.e., sorted last); pick the earliest. That is the file carrying app-start. +- Anything else (generic multi-app dir, wildcard, comma list) → raise `UnsupportedEventLogShapeError`. +- Pattern matching only — no file reads for the shape decision. ### 7.2 `_open_event_log_stream(resolved_path) -> Iterator[str]` @@ -104,17 +107,44 @@ Stream opener. Opens the file with the right codec and yields decoded text lines - Codec chosen by extension. - Codec libs imported lazily inside this function — missing lib raises `UnsupportedCompressionError`, does not fail the module import. -- Cloud paths handled via `BoundedCspPath.open()`; codec streams layered on top. +- Cloud paths handled via `CspPath.open_input_stream()`; the returned stream is wrapped by the codec reader, then by a text decoder yielding lines. ### 7.3 `_scan_events(lines, max_events) -> _ScanResult` -Event scanner. Parses lines as JSON, collects three events, stops. +Event scanner. Parses lines as JSON, merges properties from the events Scala uses for runtime classification, and stops when enough state is accumulated (or the cap is reached). -- Tracks: `SparkListenerLogStart` (→ Spark version), `SparkListenerApplicationStart` (→ appId, appName), `SparkListenerEnvironmentUpdate` (→ `Spark Properties` dict). -- Stops as soon as `SparkListenerEnvironmentUpdate` is seen AND either `SparkListenerApplicationStart` is seen or the env-update came after it (i.e., we have both). In normal Spark logs both land within the first ~20 events. -- Skips malformed JSON lines (Spark tolerates trailing partial lines in live logs). -- Hard cap at `max_events_scanned` (default 1000). Purely defensive. -- On EOF or cap with env-update seen: returns `_ScanResult` populated with whatever was found (app-start may be `None`). On EOF or cap with env-update NOT seen: raises `EventLogIncompleteError` — classification is unknown. +**Events the scanner must consume** (all of these affect runtime classification in Scala): + +| Event | Scala handler | Effect on classification state | +| --- | --- | --- | +| `SparkListenerLogStart` | `EventProcessorBase.doSparkListenerLogStart` → `handleLogStartForCachedProps` | Captures Spark version | +| `SparkListenerApplicationStart` | populates `appMetaData` | Captures appId / appName | +| `SparkListenerEnvironmentUpdate` | `handleEnvUpdateForCachedProps` → `updatePredicatesFromSparkProperties` → `reEvaluate(sparkProperties)` + `gpuMode ||= isPluginEnabled(...)` | Seeds the main `Spark Properties` dict; initial plugin evaluation | +| `SparkListenerJobStart` | `handleJobStartForCachedProps` → `reEvaluateOnJobLevel(jobProperties)` | Re-evaluates plugins with `hasJobLevelConfigs=true` (DB, Iceberg, Hive). 
Only matters for Photon/DB here (Auron and SPARK_RAPIDS are driver-level and already settled) | +| `SparkListenerSQLExecutionStart` | `doSparkListenerSQLExecutionStart` → `mergeModifiedConfigs(modifiedConfigs)` → `updatePredicatesFromSparkProperties` | Merges per-SQL config overrides into `sparkProperties`; can turn on gpuMode or a plugin whose key was not present at env-update time | + +Scanning only `SparkListenerEnvironmentUpdate` would diverge from Scala for logs that enable `spark.plugins` / `spark.rapids.sql.enabled` / Databricks tags via `modifiedConfigs` or job-level properties. We consume all five events above. + +**Property merging (mirrors Scala):** + +- `spark_properties` = env-update `Spark Properties` with per-SQL `modifiedConfigs` merged last-write-wins (matches `CacheablePropsHandler.mergeModifiedConfigs`). +- For DB plugin specifically (`hasJobLevelConfigs=true`): also feed `SparkListenerJobStart.properties` to the DB precondition check. All three DB tag keys must be non-empty from the combined property pool (env-update ∪ job-level ∪ modifiedConfigs). +- Classification after each merge follows the exact rules in section 8. + +**Sticky semantics:** plugin matches are sticky (once true, stay true — `AppPropPlugTrait:66-68`). Same for gpuMode. So classification can only become more specific over time (SPARK → PHOTON/AURON/SPARK_RAPIDS), never less. + +**Stop conditions:** + +- Early-stop: as soon as `SparkListenerEnvironmentUpdate` has been seen AND the classification is non-SPARK, we can return. A non-SPARK label is sticky. +- Extended-scan: if classification is still SPARK after env-update, keep scanning `SparkListenerJobStart` and `SparkListenerSQLExecutionStart` events, merging their properties, re-classifying after each merge. Stop at first non-SPARK classification or at `max_events_scanned`. +- Cap: `max_events_scanned` default raised from the original 1000 to **2000** to accommodate SQLExecutionStart events that can land later in the log. `SparkListenerApplicationStart` is also captured if/when seen during the extended scan. + +**Malformed input:** lines that aren't valid JSON are skipped (Spark tolerates trailing partial lines in live logs). + +**Terminal states:** + +- EOF or cap with env-update seen → returns `_ScanResult` with the final classification state. If classification is still SPARK here, that is the final answer (matches what Scala would emit given the same prefix). +- EOF or cap with env-update NOT seen → raises `EventLogIncompleteError`. Classification is unknown and the caller should fall back to the full pipeline. ### 7.4 `_classify_runtime(spark_properties) -> SparkRuntime` @@ -195,4 +225,15 @@ A Scala-side test that synthesizes property maps, exercises each plugin, and exp - Exact location / filename of the short user-facing doc (match whatever existing convention `user_tools/docs/` uses). - Whether the `zstandard` dep addition warrants an entry in `RELEASE.md` or similar. -- Whether we skip the `[compression]` extra in V1 and only support plain + gz + zstd (keeping V1 even smaller). Current plan: ship the extra but make it opt-in. \ No newline at end of file +- Final pick for `.lzf` codec package (`python-lzf` vs alternative); the plan should verify it installs cleanly alongside current `user_tools/pyproject.toml` constraints before committing. +- Whether we keep `.snappy` / `.lzf` behind the `[compression]` extra (current plan) or fold them in as hard deps (simpler but bigger install). 
+- Parity-test fixture inventory: enumerate every file under `core/src/test/resources/spark-events-*` and record the expected `SparkRuntime` label, derived from existing Scala test expectations. The plan step owns this list. + +## 13. Review feedback addressed (2026-04-22) + +Findings applied against the initial draft of this spec: + +1. **Scan scope extended beyond env-update** (section 7.3). The scanner now also consumes `SparkListenerJobStart` and `SparkListenerSQLExecutionStart` and merges their property sets into classification, mirroring `EventProcessorBase.doSparkListenerSQLExecutionStart` / `handleJobStartForCachedProps` and `CacheablePropsHandler.mergeModifiedConfigs`. This closes the gap where a log enables `spark.plugins` via `modifiedConfigs` or Databricks tags via job-level properties. +2. **Databricks rolling-dir file selection** (sections 6 and 7.1). File pick is now an explicit mirror of `DatabricksRollingEventLogFilesFileReader`: parse `--YYYY-MM-DD--HH-MM` from each filename, treat bare `eventlog` as latest (sort last), pick the earliest. That is the file that carries app-start. +3. **Storage API corrected** (section 7.1, 7.2). Uses `CspFs.list_all_files(path)` and `CspPath.open_input_stream()`. `BoundedCspPath.list_dir()` / `.open()` do not exist. +4. **Compression set and packaging fixed** (section 6). `.lzf` is added alongside `.lz4`/`.snappy`/`.zstd` to match Scala's `SPARK_SHORT_COMPRESSION_CODEC_NAMES` and the existing `user_tools/docs/user-tools-onprem.md` claim. Install guidance uses the correct distribution name `spark-rapids-user-tools` and the `[compression]` extra is explicitly called out as a new extra introduced by this feature. \ No newline at end of file From 3eb4564c0b6479066cef2a69d42a4b5fccf1f453 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 17:17:31 -0700 Subject: [PATCH 03/29] docs(spec): reframe event log detector as best-effort early routing Incorporate second review pass: the previous spec drifted toward exact Scala parity and accumulated scope (rolling dirs, five compression codecs, four-way taxonomy as primary output). Pull it back to what the issue actually asked for. - Primary contract is now a Route enum (QUALIFICATION / PROFILING / UNKNOWN); spark_runtime demotes to auxiliary metadata. - Framed explicitly as best-effort early routing, not exact parity. Inconclusive logs return UNKNOWN; caller falls back to the full tool. - V1 input shape reduced to single-file and Databricks rolling dir (kept because aether requires it). Spark-native rolling-dir support is now out of scope. - V1 codecs reduced to plain / gz / zstd. No new `[compression]` extra. - Scanner still consumes SQLExecutionStart.modifiedConfigs (covers logs that enable spark.plugins late) but intentionally skips the narrow JobStart job-level plugin re-evaluation path, documented as a deliberate divergence. - Errors condensed to three types; "no env-update seen" becomes Route.UNKNOWN instead of an exception. - Section 13 records the evolution so reviewers can see what was pulled in and what was pulled back out. 
Co-Authored-By: Claude Opus 4.7 Signed-off-by: Sayed Bilal Bari --- ...-04-22-eventlog-runtime-detector-design.md | 223 +++++++++--------- 1 file changed, 113 insertions(+), 110 deletions(-) diff --git a/docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md b/docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md index f44f45558..48a22d1bd 100644 --- a/docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md +++ b/docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md @@ -7,23 +7,26 @@ ## 1. Problem -Today, determining whether a Spark event log came from a CPU or GPU run requires running the full qualification or profiling tool. That is expensive and wrong way to use the tools for the pre-flight decision of *which* tool to run. +Determining whether a Spark event log came from a CPU or GPU run currently requires running the full qualification or profiling tool. That is expensive, and it is the wrong way to make the pre-flight decision of *which* tool to run. -aether (a consumer of spark-rapids-tools) needs to make this decision per job before invoking either tool. Its current workaround is running one tool, reading `application_information.csv`'s `sparkRuntime` column, and picking the right tool — meaning every detection costs a full qualification run. +aether (a consumer of spark-rapids-tools) needs that decision per job. Its current workaround is running one tool, reading `application_information.csv`'s `sparkRuntime`, and picking the right tool next — meaning every routing decision costs a full qualification run. ## 2. Goal -Expose a lightweight Python function that inspects the first handful of events in a Spark event log and returns the runtime classification (`SPARK`, `SPARK_RAPIDS`, `PHOTON`, `AURON`) without invoking the Scala tools pipeline. +Provide a lightweight Python function that reads a bounded prefix of a Spark event log and returns a **routing decision** (`QUALIFICATION`, `PROFILING`, or `UNKNOWN`) plus best-effort metadata (`spark_runtime`, `app_id`, `spark_version`). + +**This is best-effort early routing, not exact Scala parity.** On inconclusive input, the caller falls back to the full tool. ## 3. Non-goals (V1) -- Replacing the Scala event log reader. The Scala reader handles multi-app directories, wildcards, comma-separated lists, malformed logs, and every CSP quirk. The Python detector is a deliberately narrow shortcut for the common single-app case. -- Public user-facing CLI. This is a dev/library-scope feature, not part of the supported `spark_rapids_user_tools` CLI surface. +- Replacing the Scala event log reader. Multi-app directories, wildcards, comma lists, malformed logs, the full codec set, and most CSP quirks stay in Scala. +- Exact classification parity with Scala. Scala can adjust runtime state from events far into the log (job-level plugin re-evaluation, SQL-level `modifiedConfigs`, per-SQL merges). The Python detector only looks at a bounded prefix; the caller must be comfortable with `UNKNOWN` for genuinely ambiguous inputs. +- Public user-facing CLI. Dev/library scope only. - Writing any output files. Result is a Python object returned in-process. ## 4. Consumers -- **Primary:** aether (`aether-services`). aether already consumes `spark_rapids_tools.cmdli.tools_cli.ToolsCLI` as a library. 
It will import `detect_spark_runtime` directly, call it before deciding to run qualification or profiling, and branch on the `spark_runtime` value — which matches the strings it already stores in `JobRun.spark_runtime`. +- **Primary:** aether (`aether-services`). aether already consumes `spark_rapids_tools` as a library. It will import `detect_spark_runtime` and branch on the returned `Route` before deciding to run qualification or profiling. On `Route.UNKNOWN` or any raised error, aether falls back to running qualification (its current default). - **Secondary:** internal scripting / debugging. Devs can `python -c "..."` to inspect a log. ## 5. Public API @@ -34,7 +37,12 @@ Module: `user_tools/src/spark_rapids_tools/tools/eventlog_detector.py`. from enum import Enum from dataclasses import dataclass from typing import Optional, Union -from spark_rapids_tools.storagelib.cspfs import BoundedCspPath +from spark_rapids_tools.storagelib.csppath import CspPath + +class Route(str, Enum): + QUALIFICATION = "QUALIFICATION" # CPU log → run qualification + PROFILING = "PROFILING" # GPU/Photon/Auron log → run profiling + UNKNOWN = "UNKNOWN" # not enough signal; caller falls back class SparkRuntime(str, Enum): SPARK = "SPARK" @@ -43,147 +51,141 @@ class SparkRuntime(str, Enum): AURON = "AURON" @dataclass(frozen=True) -class RuntimeInfo: - spark_runtime: SparkRuntime - app_id: Optional[str] # None if SparkListenerApplicationStart not seen - app_name: Optional[str] - spark_version: Optional[str] # from SparkListenerLogStart - event_log_path: str # concrete file actually read - source_path: str # caller input (preserves rolling-dir input) +class DetectionResult: + route: Route + spark_runtime: Optional[SparkRuntime] # best-effort; may be None when route is UNKNOWN + app_id: Optional[str] + spark_version: Optional[str] + event_log_path: str # concrete file actually read + source_path: str # caller input (preserves rolling-dir input) + reason: str # short human-readable explanation def detect_spark_runtime( - event_log: Union[str, BoundedCspPath], + event_log: Union[str, CspPath], *, - max_events_scanned: int = 1000, -) -> RuntimeInfo: ... + max_events_scanned: int = 500, +) -> DetectionResult: ... ``` -Enum string values match the Scala `SparkRuntime` enum exactly so aether's existing string comparisons (`JobRun.spark_runtime != 'SPARK_RAPIDS'`) work unchanged. +`SparkRuntime` string values match the Scala enum so aether's existing `JobRun.spark_runtime` column stays compatible. ## 6. Scope of input shapes (V1) | Input shape | Supported | Notes | | --- | --- | --- | -| Single file (plain or compressed) | ✅ | Primary case | -| Spark native rolling dir (`eventlog_v2_*`) | ✅ | Reads first (earliest) `events_1_*` chunk — it holds `ApplicationStart` + `EnvironmentUpdate` | -| Databricks rolling dir (`eventlog`, `eventlog-*` files) | ✅ | Sort mirroring Scala's `DatabricksRollingEventLogFilesFileReader` (see 7.1), read the earliest | -| Generic directory of independent logs | ❌ | Raises `UnsupportedEventLogShapeError` | +| Single event log file (plain or compressed) | ✅ | Primary case | +| Databricks rolling dir (`eventlog` + optional `eventlog-` files) | ✅ | Required by aether — see 7.1 for the selection rule | +| Spark native rolling dir (`eventlog_v2_*`) | ❌ | Out of scope for V1. Raises `UnsupportedInputError`. Caller falls back. 
| +| Generic directory of independent logs | ❌ | Same | | Wildcard path | ❌ | Same | | Comma-separated list | ❌ | Same | -Rationale: aether's `is_multi_event_log_input()` already distinguishes single-app from multi-app inputs. Pre-flight detection is most useful for the single-app case; multi-app inputs don't bind to a single `aether_job_id` anyway. - -Compression codecs (Spark-native set per `EventLogPathProcessor.SPARK_SHORT_COMPRESSION_CODEC_NAMES` — `lz4, lzf, snappy, zstd`, plus `gz`): - -| Extension | Dependency | Availability | -| --- | --- | --- | -| none, `.inprogress` | stdlib | always | -| `.gz` | stdlib `gzip` | always | -| `.zstd`, `.zst` | `zstandard` | **non-optional new dep** — existing test fixtures are zstd; default in several Spark deployments | -| `.lz4` | `lz4` | optional extra `spark-rapids-user-tools[compression]` (new extra, introduced in this PR) | -| `.lzf` | `python-lzf` (or equivalent — plan will pin the choice) | optional extra `spark-rapids-user-tools[compression]` | -| `.snappy` | `cramjam` | optional extra `spark-rapids-user-tools[compression]` | +**Supported compression codecs:** plain, `.gz`, `.zstd`/`.zst`. Anything else (`.lz4`, `.lzf`, `.snappy`, unknown) raises `UnsupportedCompressionError` → caller falls back. -Missing optional codec lib → `UnsupportedCompressionError` with actionable install message pointing at the correct PyPI name `spark-rapids-user-tools[compression]`. The `[compression]` extra is introduced by this feature; it does not exist in `user_tools/pyproject.toml` today. +`zstandard` becomes a required dependency (existing test fixtures use `.zstd`; the common Spark default). No new optional extras. ## 7. Module structure -Four small, independently testable components inside `eventlog_detector.py` (plus a markers file): +Four small, independently testable components inside `eventlog_detector.py` (plus a markers file). ### 7.1 `_resolve_event_log_file(path) -> (source, resolved)` Path resolver. Turns user input into the concrete file to read. - File input → return as-is. -- Directory input → pattern match against rolling layouts: - - **Spark native** (`eventlog_v2_*` directory): list children via `CspFs.list_all_files(dir_path)`; pick the lexicographically earliest file whose name matches `events_1_*` (chunk index 1, which carries `ApplicationStart` + `EnvironmentUpdate`). - - **Databricks** (files named `eventlog` or `eventlog---[.codec]`): list children via `CspFs.list_all_files(dir_path)`; sort using the same rule as Scala's `DatabricksRollingEventLogFilesFileReader` (`core/src/main/scala/com/nvidia/spark/rapids/tool/EventLogPathProcessor.scala:458-478, 496-500`) — parsed `LocalDateTime` from the filename, with bare `eventlog` treated as *latest* (i.e., sorted last); pick the earliest. That is the file carrying app-start. -- Anything else (generic multi-app dir, wildcard, comma list) → raise `UnsupportedEventLogShapeError`. +- Directory input → Databricks-rolling shape only: + - Use `CspFs.list_all_files(dir_path)` to list children. + - Recognize Databricks files by the same prefix Scala uses in `EventLogPathProcessor.isDBEventLogFile` (`DB_EVENT_LOG_FILE_NAME_PREFIX = "eventlog"`). + - Sort them exactly like Scala's `DatabricksRollingEventLogFilesFileReader` (`core/src/main/scala/com/nvidia/spark/rapids/tool/EventLogPathProcessor.scala:458-478, 496-500`): parse `LocalDateTime` from the `eventlog-YYYY-MM-DD--HH-MM[.codec]` pattern; bare `eventlog` (no `--date`) is treated as the latest and sorted last. + - Pick the earliest file. 
`SparkListenerApplicationStart` and `SparkListenerEnvironmentUpdate` live there. + - If the directory contains no Databricks-pattern files → raise `UnsupportedInputError`. +- Anything else (Spark-native rolling, generic multi-app dir, wildcard, comma list) → raise `UnsupportedInputError`. - Pattern matching only — no file reads for the shape decision. ### 7.2 `_open_event_log_stream(resolved_path) -> Iterator[str]` Stream opener. Opens the file with the right codec and yields decoded text lines. -- Codec chosen by extension. -- Codec libs imported lazily inside this function — missing lib raises `UnsupportedCompressionError`, does not fail the module import. -- Cloud paths handled via `CspPath.open_input_stream()`; the returned stream is wrapped by the codec reader, then by a text decoder yielding lines. +- Codec chosen by extension: plain / `.inprogress` / `.gz` / `.zstd` / `.zst`. Anything else → `UnsupportedCompressionError`. +- Cloud paths use `CspPath.open_input_stream()`; the byte stream is wrapped by the codec reader, then by a text decoder yielding lines. ### 7.3 `_scan_events(lines, max_events) -> _ScanResult` -Event scanner. Parses lines as JSON, merges properties from the events Scala uses for runtime classification, and stops when enough state is accumulated (or the cap is reached). - -**Events the scanner must consume** (all of these affect runtime classification in Scala): - -| Event | Scala handler | Effect on classification state | -| --- | --- | --- | -| `SparkListenerLogStart` | `EventProcessorBase.doSparkListenerLogStart` → `handleLogStartForCachedProps` | Captures Spark version | -| `SparkListenerApplicationStart` | populates `appMetaData` | Captures appId / appName | -| `SparkListenerEnvironmentUpdate` | `handleEnvUpdateForCachedProps` → `updatePredicatesFromSparkProperties` → `reEvaluate(sparkProperties)` + `gpuMode ||= isPluginEnabled(...)` | Seeds the main `Spark Properties` dict; initial plugin evaluation | -| `SparkListenerJobStart` | `handleJobStartForCachedProps` → `reEvaluateOnJobLevel(jobProperties)` | Re-evaluates plugins with `hasJobLevelConfigs=true` (DB, Iceberg, Hive). Only matters for Photon/DB here (Auron and SPARK_RAPIDS are driver-level and already settled) | -| `SparkListenerSQLExecutionStart` | `doSparkListenerSQLExecutionStart` → `mergeModifiedConfigs(modifiedConfigs)` → `updatePredicatesFromSparkProperties` | Merges per-SQL config overrides into `sparkProperties`; can turn on gpuMode or a plugin whose key was not present at env-update time | - -Scanning only `SparkListenerEnvironmentUpdate` would diverge from Scala for logs that enable `spark.plugins` / `spark.rapids.sql.enabled` / Databricks tags via `modifiedConfigs` or job-level properties. We consume all five events above. +Event scanner. Parses lines as JSON and accumulates classification-relevant properties. -**Property merging (mirrors Scala):** +**Events consumed:** -- `spark_properties` = env-update `Spark Properties` with per-SQL `modifiedConfigs` merged last-write-wins (matches `CacheablePropsHandler.mergeModifiedConfigs`). -- For DB plugin specifically (`hasJobLevelConfigs=true`): also feed `SparkListenerJobStart.properties` to the DB precondition check. All three DB tag keys must be non-empty from the combined property pool (env-update ∪ job-level ∪ modifiedConfigs). -- Classification after each merge follows the exact rules in section 8. 
+| Event | What it contributes | +| --- | --- | +| `SparkListenerLogStart` | Spark version | +| `SparkListenerApplicationStart` | appId, appName | +| `SparkListenerEnvironmentUpdate` | Seeds `spark_properties` from its `Spark Properties` section | +| `SparkListenerSQLExecutionStart` | Merges `modifiedConfigs` into `spark_properties` (last-write-wins) — covers logs where `spark.plugins` / `spark.rapids.sql.enabled` are set per-SQL rather than at startup | -**Sticky semantics:** plugin matches are sticky (once true, stay true — `AppPropPlugTrait:66-68`). Same for gpuMode. So classification can only become more specific over time (SPARK → PHOTON/AURON/SPARK_RAPIDS), never less. +We intentionally do not track `SparkListenerJobStart` job-level properties. That would catch the DB plugin's job-level re-evaluation path, but that is a narrow Scala case and pushing further into the log moves us away from "lightweight." If a log is truly Databricks-only-detectable at job-start time, the caller's fallback path handles it. **Stop conditions:** -- Early-stop: as soon as `SparkListenerEnvironmentUpdate` has been seen AND the classification is non-SPARK, we can return. A non-SPARK label is sticky. -- Extended-scan: if classification is still SPARK after env-update, keep scanning `SparkListenerJobStart` and `SparkListenerSQLExecutionStart` events, merging their properties, re-classifying after each merge. Stop at first non-SPARK classification or at `max_events_scanned`. -- Cap: `max_events_scanned` default raised from the original 1000 to **2000** to accommodate SQLExecutionStart events that can land later in the log. `SparkListenerApplicationStart` is also captured if/when seen during the extended scan. - -**Malformed input:** lines that aren't valid JSON are skipped (Spark tolerates trailing partial lines in live logs). +- Early-stop: as soon as `_classify_runtime(spark_properties)` returns anything other than `SPARK` (the signal is decisive and Scala's plugins are sticky-true — once set, they stay set). +- EOF or `max_events_scanned` reached: return whatever was accumulated. +- `max_events_scanned` default `500`. Startup events land in the first ~20; the rest is headroom for the first few `SQLExecutionStart` merges. -**Terminal states:** +**Malformed input:** lines that aren't valid JSON are skipped. -- EOF or cap with env-update seen → returns `_ScanResult` with the final classification state. If classification is still SPARK here, that is the final answer (matches what Scala would emit given the same prefix). -- EOF or cap with env-update NOT seen → raises `EventLogIncompleteError`. Classification is unknown and the caller should fall back to the full pipeline. +**Returned state:** `(spark_properties, app_id, app_name, spark_version, env_update_seen)`. ### 7.4 `_classify_runtime(spark_properties) -> SparkRuntime` -Runtime classifier. Pure function over the properties dict. See section 8 for the exact rules. +Pure function over the accumulated properties dict. See section 8 for rules. 
+ +### 7.5 Top-level flow + +``` +detect_spark_runtime(path): + source, resolved = _resolve_event_log_file(path) + with _open_event_log_stream(resolved) as lines: + scan = _scan_events(lines, max_events_scanned) + + if not scan.env_update_seen: + return DetectionResult(route=UNKNOWN, spark_runtime=None, ..., + reason="no SparkListenerEnvironmentUpdate before cap") + + runtime = _classify_runtime(scan.spark_properties) + route = PROFILING if runtime in {SPARK_RAPIDS, PHOTON, AURON} else QUALIFICATION + return DetectionResult(route=route, spark_runtime=runtime, ..., + reason=f"classified as {runtime.value}") +``` -### 7.5 `eventlog_detector_markers.py` +### 7.6 `eventlog_detector_markers.py` -Single source of truth for keys/regex/substrings. Every constant has a `# Scala source: :` comment next to it. +Single source of truth for keys/regex/substrings. Every constant carries a `# Scala source: :` comment. -## 8. Classification rules (synced with Scala) +## 8. Classification rules -Priority order evaluated in Python: **PHOTON > AURON > SPARK_RAPIDS > SPARK**. This is a deliberate, deterministic Python choice; Scala's `pluginMap.values.filter(...).find(...)` iterates a `HashMap` and is non-deterministic when multiple plugins match (they don't overlap in practice). +Mapped from the same Scala sources. Priority order in Python: **PHOTON > AURON > SPARK_RAPIDS > SPARK**. -**SPARK_RAPIDS** — `ToolUtils.isPluginEnabled` (`core/src/main/scala/org/apache/spark/sql/rapids/tool/ToolUtils.scala:114-121`): +**SPARK_RAPIDS** — from `ToolUtils.isPluginEnabled` (`core/src/main/scala/org/apache/spark/sql/rapids/tool/ToolUtils.scala:114-121`): - `spark.plugins` contains substring `com.nvidia.spark.SQLPlugin` -- AND `spark.rapids.sql.enabled` parses as boolean true (default true if missing or unparseable) +- AND `spark.rapids.sql.enabled` parses as boolean true (default true if missing/unparseable) -**AURON** — `AuronParseHelper.eval` (`core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/auron/AuronParseHelper.scala:149-172`): +**AURON** — from `AuronParseHelper.eval` (`core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/auron/AuronParseHelper.scala:149-172`): -- `spark.sql.extensions` fullmatches regex `.*AuronSparkSessionExtension.*` -- AND `spark.auron.enabled` trimmed equals `"true"` case-insensitively (default `"true"` if missing) +- `spark.sql.extensions` fullmatches `.*AuronSparkSessionExtension.*` +- AND `spark.auron.enabled` trimmed equals `"true"` case-insensitively (default `"true"`) -**PHOTON** — requires Databricks precondition + Photon marker: +**PHOTON** — combined precondition + marker: - Databricks precondition (`DBConditionImpl.eval`, `core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/db/DBPlugin.scala:45-58`): all three of `spark.databricks.clusterUsageTags.clusterAllTags`, `.clusterId`, `.clusterName` are non-empty. 
-- AND Photon marker (`PhotonParseHelper.extensionRegxMap`, `core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/db/DatabricksParseHelper.scala:146-151`): any one of these fullmatches: +- AND any Photon marker (`PhotonParseHelper`, `core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/db/DatabricksParseHelper.scala:146-151`) fullmatches: - `spark.databricks.clusterUsageTags.sparkVersion` ~ `.*-photon-.*` - `spark.databricks.clusterUsageTags.effectiveSparkVersion` ~ `.*-photon-.*` - `spark.databricks.clusterUsageTags.sparkImageLabel` ~ `.*-photon-.*` - `spark.databricks.clusterUsageTags.runtimeEngine` ~ `PHOTON` -**SPARK** — none of the above. +**SPARK** — none of the above matched on the accumulated properties. -### Notes on fidelity - -- Scala's `String.matches(regex)` requires whole-string match → Python uses `re.fullmatch`. -- Plugin-based runtime beats gpuMode: if a log has both `com.nvidia.spark.SQLPlugin` and Photon markers, Scala returns `PHOTON`. Python priority preserves this. -- `spark.rapids.sql.enabled` default-to-true semantics match `ToolUtils.isPluginEnabled` exactly (`Try { ... }.getOrElse(true)`). +Python uses `re.fullmatch` (matches Scala's `String.matches` semantics). ## 9. Error model @@ -191,29 +193,31 @@ All errors subclass `EventLogDetectionError`: | Exception | Meaning | aether action | | --- | --- | --- | -| `UnsupportedEventLogShapeError` | Input is multi-app / wildcard / comma list | Fall back to running the full tool | -| `UnsupportedCompressionError` | Codec lib missing | Install the extra; surface message to dev | -| `EventLogReadError` | I/O failure (wraps underlying error) | Retry or fall back | -| `EventLogIncompleteError` | Env-update event not found in scanned range | Fall back — classification not known | +| `UnsupportedInputError` | Input shape not supported (Spark-native rolling, multi-app dir, wildcard, comma list, empty dir) | Fall back to running the full tool | +| `UnsupportedCompressionError` | Codec outside the supported set | Fall back | +| `EventLogReadError` | I/O failure (wraps underlying error) | Fall back | -Typed exceptions (not `Optional[RuntimeInfo]`) so aether can distinguish "log is CPU" from "we couldn't tell." +**Note:** "scanner never saw env-update" is **not** an exception — it's a `DetectionResult` with `route=UNKNOWN`. This keeps the caller's happy path free of exception handling for the common "inconclusive log" case. ## 10. Testing ### 10.1 Unit tests — `tests/spark_rapids_tools_ut/tools/test_eventlog_detector.py` -- Path resolver: plain file, Spark rolling dir (multi-chunk), Databricks rolling dir, multi-app dir raises, wildcard raises. -- Stream opener: each codec including missing-lib path raising. -- Event scanner: truncated log raises, malformed JSON lines skipped, `max_events_scanned` cap. +- Path resolver: plain file; Databricks rolling dir (multi-file with dated + bare `eventlog`, asserting earliest picked); Spark-native rolling dir raises; multi-app dir raises; wildcard raises. +- Stream opener: plain / gz / zstd each works; `.lz4`/`.snappy` raises. +- Event scanner: env-update only → classifies from it; env-update + later SQLExecutionStart that sets `spark.plugins` → classification updates to `SPARK_RAPIDS`; no env-update within cap → `UNKNOWN`; malformed JSON lines skipped. - Classifier: each of the four runtime outcomes, priority when multiple markers coexist, `spark.rapids.sql.enabled=false` override. +- Routing: runtime → route mapping. 
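For illustration, two of the classifier cases above might be exercised like this — a sketch only; the test names, property values, and the assumption that the private classifier is imported directly are placeholders, not the final test layout:

```python
from spark_rapids_tools.tools.eventlog_detector import SparkRuntime, _classify_runtime

def test_photon_wins_when_rapids_and_photon_markers_coexist():
    # Hypothetical property set carrying both the RAPIDS plugin and Databricks/Photon markers.
    props = {
        "spark.plugins": "com.nvidia.spark.SQLPlugin",
        "spark.databricks.clusterUsageTags.clusterAllTags": "[]",
        "spark.databricks.clusterUsageTags.clusterId": "0000-000000-example",   # placeholder value
        "spark.databricks.clusterUsageTags.clusterName": "example-cluster",     # placeholder value
        "spark.databricks.clusterUsageTags.runtimeEngine": "PHOTON",
    }
    assert _classify_runtime(props) is SparkRuntime.PHOTON

def test_rapids_sql_enabled_false_overrides_plugin():
    # Explicit spark.rapids.sql.enabled=false must win over the plugin being registered.
    props = {
        "spark.plugins": "com.nvidia.spark.SQLPlugin",
        "spark.rapids.sql.enabled": "false",
    }
    assert _classify_runtime(props) is SparkRuntime.SPARK
```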
-### 10.2 Parity test — `tests/spark_rapids_tools_ut/tools/test_eventlog_detector_parity.py` +### 10.2 Fixture tests — `tests/spark_rapids_tools_ut/tools/test_eventlog_detector_fixtures.py` -Runs `detect_spark_runtime` against existing Scala fixtures under `core/src/test/resources/spark-events-*`. Expected labels come from existing Scala test expectations (e.g., `eventlog-gpu-dsv2.zstd` → `SPARK_RAPIDS`). Fails CI if Python disagrees. +Runs `detect_spark_runtime` against a small curated set of existing fixtures under `core/src/test/resources/spark-events-*`: -### 10.3 Follow-up (not V1) +- `eventlog-gpu-dsv2.zstd` → `Route.PROFILING`, `SPARK_RAPIDS` +- `eventlog_dsv2.zstd` → `Route.QUALIFICATION`, `SPARK` +- A Databricks rolling fixture (to be identified during plan; if none exists we'll synthesize one) -A Scala-side test that synthesizes property maps, exercises each plugin, and exports a JSON fixture of `(properties, expected_runtime)` pairs. The Python parity test loads this JSON and replays classification. Catches Scala-side marker changes automatically. Deferred to a separate PR after V1 lands. +Not a full parity sweep — just anchor points to catch regressions. ## 11. Rollout @@ -223,17 +227,16 @@ A Scala-side test that synthesizes property maps, exercises each plugin, and exp ## 12. Open items for implementation plan -- Exact location / filename of the short user-facing doc (match whatever existing convention `user_tools/docs/` uses). -- Whether the `zstandard` dep addition warrants an entry in `RELEASE.md` or similar. -- Final pick for `.lzf` codec package (`python-lzf` vs alternative); the plan should verify it installs cleanly alongside current `user_tools/pyproject.toml` constraints before committing. -- Whether we keep `.snappy` / `.lzf` behind the `[compression]` extra (current plan) or fold them in as hard deps (simpler but bigger install). -- Parity-test fixture inventory: enumerate every file under `core/src/test/resources/spark-events-*` and record the expected `SparkRuntime` label, derived from existing Scala test expectations. The plan step owns this list. +- Confirm `core/src/test/resources/` contains a usable Databricks-rolling fixture; synthesize one if not. +- Whether adding `zstandard` as a hard dep needs a `RELEASE.md` note. + +## 13. Evolution -## 13. Review feedback addressed (2026-04-22) +This spec was reshaped once after review feedback. Earlier drafts attempted: -Findings applied against the initial draft of this spec: +- Full scan-scope parity with Scala (including `SparkListenerJobStart.properties` job-level plugin re-evaluation) — dropped. Documented as a known divergence case; caller handles it via fallback on `Route.UNKNOWN`. +- Spark-native rolling-dir support — dropped from V1. Aether's primary input is single files or Databricks rolling dirs. +- `.lz4` / `.lzf` / `.snappy` codec support with a new `[compression]` extra — dropped. Added scope without matching a real need. +- 4-way `SparkRuntime` return as the primary contract — kept as auxiliary metadata; primary contract is now the `Route` enum (the actual decision the caller makes). -1. **Scan scope extended beyond env-update** (section 7.3). The scanner now also consumes `SparkListenerJobStart` and `SparkListenerSQLExecutionStart` and merges their property sets into classification, mirroring `EventProcessorBase.doSparkListenerSQLExecutionStart` / `handleJobStartForCachedProps` and `CacheablePropsHandler.mergeModifiedConfigs`. 
This closes the gap where a log enables `spark.plugins` via `modifiedConfigs` or Databricks tags via job-level properties. -2. **Databricks rolling-dir file selection** (sections 6 and 7.1). File pick is now an explicit mirror of `DatabricksRollingEventLogFilesFileReader`: parse `--YYYY-MM-DD--HH-MM` from each filename, treat bare `eventlog` as latest (sort last), pick the earliest. That is the file that carries app-start. -3. **Storage API corrected** (section 7.1, 7.2). Uses `CspFs.list_all_files(path)` and `CspPath.open_input_stream()`. `BoundedCspPath.list_dir()` / `.open()` do not exist. -4. **Compression set and packaging fixed** (section 6). `.lzf` is added alongside `.lz4`/`.snappy`/`.zstd` to match Scala's `SPARK_SHORT_COMPRESSION_CODEC_NAMES` and the existing `user_tools/docs/user-tools-onprem.md` claim. Install guidance uses the correct distribution name `spark-rapids-user-tools` and the `[compression]` extra is explicitly called out as a new extra introduced by this feature. \ No newline at end of file +The narrower V1 keeps the detector honest about what it is: a best-effort fast path that gets out of the way when the log doesn't give it enough signal. \ No newline at end of file From df8b53b204ba52c65f3009934e7050a92db6b083 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 17:23:10 -0700 Subject: [PATCH 04/29] docs(spec): tighten decision rule and Databricks-rolling scan shape Third review pass. Three targeted changes, no scope expansion. - Asymmetric decision rule: PROFILING on a decisive non-SPARK signal, QUALIFICATION only after walking the full log (EXHAUSTED) with no GPU signal, UNKNOWN when the cap is hit or env-update was never seen. Previously the spec promoted "no GPU signal in prefix" to QUALIFICATION, which is unsafe because Scala can promote runtime later via SQLExecutionStart.modifiedConfigs. - Databricks rolling dir resolves to the full ordered file list and the scanner walks all files under one shared event budget. Picking only the earliest file undercut the scanner's own mitigation, since GPU markers set via modifiedConfigs can land in later rolled files. - Stream opener is explicitly a `@contextmanager` that yields an Iterator[str] and owns closing the codec / text / byte streams. Previous signature vs usage were inconsistent. Co-Authored-By: Claude Opus 4.7 Signed-off-by: Sayed Bilal Bari --- ...-04-22-eventlog-runtime-detector-design.md | 87 ++++++++++++------- 1 file changed, 58 insertions(+), 29 deletions(-) diff --git a/docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md b/docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md index 48a22d1bd..52653fda3 100644 --- a/docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md +++ b/docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md @@ -17,6 +17,14 @@ Provide a lightweight Python function that reads a bounded prefix of a Spark eve **This is best-effort early routing, not exact Scala parity.** On inconclusive input, the caller falls back to the full tool. +**Asymmetric decision rule:** positive GPU evidence is decisive; absence of evidence in a bounded prefix is not. Concretely: + +- Decisive non-SPARK signal seen → `PROFILING`. +- No signal seen, but the scanner walked the whole log (or the whole ordered file list in a rolling dir) to EOF → `QUALIFICATION`. +- No signal seen and the `max_events_scanned` cap is hit first → `UNKNOWN`. 
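The same rule, written out as the small pure function the implementation might use — a sketch; `Route` and `SparkRuntime` are the section 5 enums, `Termination` is the scanner-termination state described in section 7.3, and the helper name is illustrative:

```python
from typing import Optional

def _decide_route(runtime: Optional["SparkRuntime"], termination: "Termination",
                  env_update_seen: bool) -> "Route":
    # Positive GPU evidence is decisive, regardless of how the scan ended.
    if runtime in (SparkRuntime.SPARK_RAPIDS, SparkRuntime.PHOTON, SparkRuntime.AURON):
        return Route.PROFILING
    # Only a fully walked log with startup properties in hand may be called a CPU run.
    if termination is Termination.EXHAUSTED and env_update_seen:
        return Route.QUALIFICATION
    # Cap hit, or env-update never seen: absence of evidence is not promoted to CPU.
    return Route.UNKNOWN
```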
+ +This protects against the one dangerous failure mode (routing a GPU log to qualification output that gets fed into pipelines expecting CPU results) by never saying `QUALIFICATION` on truncated evidence. + ## 3. Non-goals (V1) - Replacing the Scala event log reader. Multi-app directories, wildcards, comma lists, malformed logs, the full codec set, and most CSP quirks stay in Scala. @@ -88,26 +96,34 @@ def detect_spark_runtime( Four small, independently testable components inside `eventlog_detector.py` (plus a markers file). -### 7.1 `_resolve_event_log_file(path) -> (source, resolved)` +### 7.1 `_resolve_event_log_files(path) -> (source, ordered_files)` -Path resolver. Turns user input into the concrete file to read. +Path resolver. Turns user input into an ordered list of one or more concrete files to read. -- File input → return as-is. +- File input → return `[file]`. - Directory input → Databricks-rolling shape only: - Use `CspFs.list_all_files(dir_path)` to list children. - Recognize Databricks files by the same prefix Scala uses in `EventLogPathProcessor.isDBEventLogFile` (`DB_EVENT_LOG_FILE_NAME_PREFIX = "eventlog"`). - Sort them exactly like Scala's `DatabricksRollingEventLogFilesFileReader` (`core/src/main/scala/com/nvidia/spark/rapids/tool/EventLogPathProcessor.scala:458-478, 496-500`): parse `LocalDateTime` from the `eventlog-YYYY-MM-DD--HH-MM[.codec]` pattern; bare `eventlog` (no `--date`) is treated as the latest and sorted last. - - Pick the earliest file. `SparkListenerApplicationStart` and `SparkListenerEnvironmentUpdate` live there. + - Return the full ordered list. The scanner walks them in order under one shared event budget; earliest file carries startup events, but GPU markers that appear via later `SQLExecutionStart.modifiedConfigs` can live in subsequent files. - If the directory contains no Databricks-pattern files → raise `UnsupportedInputError`. - Anything else (Spark-native rolling, generic multi-app dir, wildcard, comma list) → raise `UnsupportedInputError`. - Pattern matching only — no file reads for the shape decision. -### 7.2 `_open_event_log_stream(resolved_path) -> Iterator[str]` +### 7.2 `_open_event_log_stream(resolved_path)` — context manager yielding `Iterator[str]` + +Stream opener. Context manager (implemented with `@contextlib.contextmanager`) that opens the file with the right codec, yields an iterator of decoded text lines, and closes the underlying stream on exit. -Stream opener. Opens the file with the right codec and yields decoded text lines. +Usage shape: + +```python +with _open_event_log_stream(resolved_path) as lines: + for line in lines: + ... +``` - Codec chosen by extension: plain / `.inprogress` / `.gz` / `.zstd` / `.zst`. Anything else → `UnsupportedCompressionError`. -- Cloud paths use `CspPath.open_input_stream()`; the byte stream is wrapped by the codec reader, then by a text decoder yielding lines. +- Cloud paths use `CspPath.open_input_stream()` (returns a closable byte stream); the byte stream is wrapped by the codec reader, then by a text decoder (`io.TextIOWrapper` or equivalent) yielding lines. The context-manager wrapper owns closing all three layers. ### 7.3 `_scan_events(lines, max_events) -> _ScanResult` @@ -124,15 +140,16 @@ Event scanner. Parses lines as JSON and accumulates classification-relevant prop We intentionally do not track `SparkListenerJobStart` job-level properties. 
That would catch the DB plugin's job-level re-evaluation path, but that is a narrow Scala case and pushing further into the log moves us away from "lightweight." If a log is truly Databricks-only-detectable at job-start time, the caller's fallback path handles it. -**Stop conditions:** +**Stop conditions and termination mode:** the scanner walks the ordered file list (one file for a plain input, multiple for a Databricks rolling dir) under a single shared budget `max_events_scanned`. -- Early-stop: as soon as `_classify_runtime(spark_properties)` returns anything other than `SPARK` (the signal is decisive and Scala's plugins are sticky-true — once set, they stay set). -- EOF or `max_events_scanned` reached: return whatever was accumulated. -- `max_events_scanned` default `500`. Startup events land in the first ~20; the rest is headroom for the first few `SQLExecutionStart` merges. +- Early-stop (`Termination.DECISIVE`): as soon as `_classify_runtime(spark_properties)` returns anything other than `SPARK`. The signal is decisive; plugins are sticky-true in Scala (once set, stay set — `AppPropPlugTrait:66-68`). +- Walked-to-end (`Termination.EXHAUSTED`): the final file's EOF is reached before the cap. We have seen the entire log. +- Cap-reached (`Termination.CAP_HIT`): the cap hit before exhausting the files. +- `max_events_scanned` default `500`. Startup events land in the first ~20; the rest is headroom for the first few `SQLExecutionStart` merges and any tail files. **Malformed input:** lines that aren't valid JSON are skipped. -**Returned state:** `(spark_properties, app_id, app_name, spark_version, env_update_seen)`. +**Returned state:** `(spark_properties, app_id, app_name, spark_version, env_update_seen, termination)` where `termination` is one of the three modes above. ### 7.4 `_classify_runtime(spark_properties) -> SparkRuntime` @@ -142,20 +159,25 @@ Pure function over the accumulated properties dict. See section 8 for rules. ``` detect_spark_runtime(path): - source, resolved = _resolve_event_log_file(path) - with _open_event_log_stream(resolved) as lines: - scan = _scan_events(lines, max_events_scanned) - - if not scan.env_update_seen: - return DetectionResult(route=UNKNOWN, spark_runtime=None, ..., - reason="no SparkListenerEnvironmentUpdate before cap") - - runtime = _classify_runtime(scan.spark_properties) - route = PROFILING if runtime in {SPARK_RAPIDS, PHOTON, AURON} else QUALIFICATION - return DetectionResult(route=route, spark_runtime=runtime, ..., - reason=f"classified as {runtime.value}") + source, ordered_files = _resolve_event_log_files(path) + scan = _scan_events_across(ordered_files, max_events_scanned) + + runtime = _classify_runtime(scan.spark_properties) if scan.env_update_seen else None + + # Decision rule (asymmetric — see section 2): + if runtime in {SPARK_RAPIDS, PHOTON, AURON}: + route, reason = PROFILING, f"decisive: classified as {runtime.value}" + elif scan.termination == EXHAUSTED and scan.env_update_seen: + route, reason = QUALIFICATION, "walked full log, no GPU signal" + else: + # CAP_HIT, or env-update never seen. Do not promote absence to CPU. + route, reason = UNKNOWN, "no decisive signal within bounded scan" + + return DetectionResult(route=route, spark_runtime=runtime, ..., reason=reason) ``` +`_scan_events_across(ordered_files, budget)` is the thin wrapper that opens each file (via `_open_event_log_stream`) and feeds its lines into `_scan_events` while tracking the remaining global budget. 
It stops and returns as soon as the scanner reports `DECISIVE`, or when the budget is exhausted, or when the last file's EOF is reached. + ### 7.6 `eventlog_detector_markers.py` Single source of truth for keys/regex/substrings. Every constant carries a `# Scala source: :` comment. @@ -203,11 +225,12 @@ All errors subclass `EventLogDetectionError`: ### 10.1 Unit tests — `tests/spark_rapids_tools_ut/tools/test_eventlog_detector.py` -- Path resolver: plain file; Databricks rolling dir (multi-file with dated + bare `eventlog`, asserting earliest picked); Spark-native rolling dir raises; multi-app dir raises; wildcard raises. -- Stream opener: plain / gz / zstd each works; `.lz4`/`.snappy` raises. -- Event scanner: env-update only → classifies from it; env-update + later SQLExecutionStart that sets `spark.plugins` → classification updates to `SPARK_RAPIDS`; no env-update within cap → `UNKNOWN`; malformed JSON lines skipped. +- Path resolver: plain file → single-element list; Databricks rolling dir (multi-file with dated + bare `eventlog`) → ordered list with earliest first and bare `eventlog` last; Spark-native rolling dir raises; multi-app dir raises; wildcard raises. +- Stream opener: plain / gz / zstd each works and closes on exit; `.lz4`/`.snappy` raises. +- Event scanner: env-update only → classifies from it; env-update + later SQLExecutionStart that sets `spark.plugins` → classification updates to `SPARK_RAPIDS` and terminates DECISIVE; no env-update within cap → termination `CAP_HIT`; full-log scan with no GPU signal → termination `EXHAUSTED`; malformed JSON lines skipped. +- Multi-file scan: GPU marker in a later Databricks-rolling file → picked up under the shared budget; budget exhausted across files → `CAP_HIT`. - Classifier: each of the four runtime outcomes, priority when multiple markers coexist, `spark.rapids.sql.enabled=false` override. -- Routing: runtime → route mapping. +- Routing rule: DECISIVE + non-SPARK → `PROFILING`; EXHAUSTED with env-update + SPARK → `QUALIFICATION`; CAP_HIT → `UNKNOWN`; env-update never seen → `UNKNOWN`. ### 10.2 Fixture tests — `tests/spark_rapids_tools_ut/tools/test_eventlog_detector_fixtures.py` @@ -239,4 +262,10 @@ This spec was reshaped once after review feedback. Earlier drafts attempted: - `.lz4` / `.lzf` / `.snappy` codec support with a new `[compression]` extra — dropped. Added scope without matching a real need. - 4-way `SparkRuntime` return as the primary contract — kept as auxiliary metadata; primary contract is now the `Route` enum (the actual decision the caller makes). -The narrower V1 keeps the detector honest about what it is: a best-effort fast path that gets out of the way when the log doesn't give it enough signal. \ No newline at end of file +The narrower V1 keeps the detector honest about what it is: a best-effort fast path that gets out of the way when the log doesn't give it enough signal. + +**Third review pass (2026-04-22):** + +- **Asymmetric decision rule** (sections 2, 7.5). Previously the spec promoted "no GPU signal in prefix" to `QUALIFICATION`. Under Scala's late-promotion paths (`SQLExecutionStart.modifiedConfigs`, job-level plugin re-eval), that is unsafe. The rule now requires either a decisive GPU signal (→ `PROFILING`) or a fully-walked log with no GPU signal (→ `QUALIFICATION`). Cap-hit returns `UNKNOWN`. +- **Databricks rolling dir scans the full ordered list** (section 7.1, 7.5). 
Picking only the earliest file contradicted the scanner's expansion to handle `modifiedConfigs` (which can land in later rolled files). The resolver now returns the ordered file list and the scanner walks it under one shared event budget. +- **Stream opener is a context manager** (section 7.2). Previous signature said `Iterator[str]` while the top-level flow used it with `with`. Clarified as a `@contextmanager` that yields the iterator and owns closing the underlying streams. \ No newline at end of file From 68889da1ddc3986b486461b74efb134d130e9d35 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 17:26:36 -0700 Subject: [PATCH 05/29] docs(spec): pin memory contract and frame max_events_scanned as cost cap Fourth review pass, two low-risk clarifications. - Memory contract stated explicitly at the top of section 7: streaming only, no full-file reads, no raw-event accumulation, bounded per-invocation state (scalars + one mutable spark_properties dict). Prevents an implementation from drifting into read() / readlines() / full-log buffering and silently breaking the "lightweight" promise. - max_events_scanned reframed as the primary CPU/I-O cost cap, not just an ambiguity tie-breaker. Large CPU logs hitting the cap will terminate as UNKNOWN; that is expected behavior, not a failure mode. Co-Authored-By: Claude Opus 4.7 Signed-off-by: Sayed Bilal Bari --- ...026-04-22-eventlog-runtime-detector-design.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md b/docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md index 52653fda3..12e415af5 100644 --- a/docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md +++ b/docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md @@ -96,6 +96,15 @@ def detect_spark_runtime( Four small, independently testable components inside `eventlog_detector.py` (plus a markers file). +**Memory contract (applies to every component below):** + +- No full-file reads. No `read()`, no `readlines()`, no slurping a log into memory. All I/O is line-at-a-time streaming via `_open_event_log_stream`'s iterator. +- No accumulation of raw events. Each parsed JSON line is inspected, the relevant fields are merged into state, and the line is discarded before moving on. +- Retained state per invocation is bounded and small: a handful of scalars (`app_id`, `app_name`, `spark_version`, `env_update_seen`, a running event counter, a termination enum) plus one mutable `spark_properties: dict[str, str]` that grows only with env-update values and later `modifiedConfigs` merges. Nothing else is held across iterations. +- The scanner must never buffer the full list of seen events; it walks, updates state, and moves on. + +This is what the "lightweight" claim actually rests on. Any implementation change that accumulates per-event data must be reviewed against this contract. + ### 7.1 `_resolve_event_log_files(path) -> (source, ordered_files)` Path resolver. Turns user input into an ordered list of one or more concrete files to read. @@ -147,6 +156,8 @@ We intentionally do not track `SparkListenerJobStart` job-level properties. That - Cap-reached (`Termination.CAP_HIT`): the cap hit before exhausting the files. - `max_events_scanned` default `500`. Startup events land in the first ~20; the rest is headroom for the first few `SQLExecutionStart` merges and any tail files. 
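+
+A self-contained toy version of the scanner's bounded walk, showing how the three termination modes fall out of a single pass (for brevity it checks only the SPARK_RAPIDS marker, returns plain strings, and reads one line iterator rather than the ordered multi-file list; the real scanner defers to `_classify_runtime`):
+
+```python
+import json
+
+def toy_scan(lines, max_events_scanned=500):
+    """Return (termination, merged_props) for an iterable of event-log lines."""
+    props, seen = {}, 0
+    for line in lines:
+        seen += 1
+        try:
+            event = json.loads(line)
+        except ValueError:
+            continue  # malformed lines are skipped, never fatal
+        if event.get("Event") == "SparkListenerEnvironmentUpdate":
+            props.update(event.get("Spark Properties") or {})
+        elif event.get("Event", "").endswith("SparkListenerSQLExecutionStart"):
+            props.update(event.get("modifiedConfigs") or {})
+        if "com.nvidia.spark.SQLPlugin" in props.get("spark.plugins", ""):
+            return "DECISIVE", props   # positive GPU evidence: stop immediately
+        if seen >= max_events_scanned:
+            return "CAP_HIT", props    # budget spent before EOF
+    return "EXHAUSTED", props          # walked the stream to the end
+```
+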
+**What the cap is actually for:** `max_events_scanned` is the primary protection against CPU-time and I/O blowups on large logs, not just a tie-breaker for ambiguity. Big CPU logs will routinely hit the cap before EOF and therefore terminate as `CAP_HIT`, which maps to `Route.UNKNOWN`. That is intentional — the detector refuses to speculate, and the caller falls back to the full tool. Users who want to convert more of their `UNKNOWN`s to `QUALIFICATION` can raise the cap at the call site, accepting the proportional increase in cost. + **Malformed input:** lines that aren't valid JSON are skipped. **Returned state:** `(spark_properties, app_id, app_name, spark_version, env_update_seen, termination)` where `termination` is one of the three modes above. @@ -264,6 +275,11 @@ This spec was reshaped once after review feedback. Earlier drafts attempted: The narrower V1 keeps the detector honest about what it is: a best-effort fast path that gets out of the way when the log doesn't give it enough signal. +**Fourth review pass (2026-04-22):** + +- **Explicit memory contract** (section 7). Stated up front that the detector is strictly streaming with no full-file reads, no raw-event accumulation, and a bounded per-invocation state (a few scalars plus one mutable `spark_properties` dict). Closes the door on a well-meaning implementation drifting into `read()` / `readlines()` / full-log buffering. +- **Cap framing** (section 7.3). `max_events_scanned` is documented as the primary cost cap, not just an ambiguity tie-breaker. Large CPU logs intentionally end as `UNKNOWN` at cap, which is expected behavior — callers that want a higher conversion to `QUALIFICATION` can raise the cap and accept the cost. + **Third review pass (2026-04-22):** - **Asymmetric decision rule** (sections 2, 7.5). Previously the spec promoted "no GPU signal in prefix" to `QUALIFICATION`. Under Scala's late-promotion paths (`SQLExecutionStart.modifiedConfigs`, job-level plugin re-eval), that is unsafe. The rule now requires either a decisive GPU signal (→ `PROFILING`) or a fully-walked log with no GPU signal (→ `QUALIFICATION`). Cap-hit returns `UNKNOWN`. From 327f94b7c7677d0ac18dff9a81b45937b457cb75 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 17:34:00 -0700 Subject: [PATCH 06/29] docs(plan): event log runtime detector implementation plan Twelve bite-sized tasks with TDD flow. Covers: zstandard dep, package skeleton, types/enums/exceptions, Scala-pinned markers, classifier, codec-aware streamer, resolver (single file + Databricks rolling), bounded multi-file scanner, top-level detect_spark_runtime, fixture anchor tests, full-suite + lint run, and a final spec doc update capturing the realized fixture inventory. Co-Authored-By: Claude Opus 4.7 Signed-off-by: Sayed Bilal Bari --- .../2026-04-22-eventlog-runtime-detector.md | 2170 +++++++++++++++++ 1 file changed, 2170 insertions(+) create mode 100644 docs/superpowers/plans/2026-04-22-eventlog-runtime-detector.md diff --git a/docs/superpowers/plans/2026-04-22-eventlog-runtime-detector.md b/docs/superpowers/plans/2026-04-22-eventlog-runtime-detector.md new file mode 100644 index 000000000..2daa0dbe1 --- /dev/null +++ b/docs/superpowers/plans/2026-04-22-eventlog-runtime-detector.md @@ -0,0 +1,2170 @@ +# Event Log Runtime Detector Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. 
Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Ship a lightweight Python function `detect_spark_runtime(path)` that returns a `Route` (`QUALIFICATION` / `PROFILING` / `UNKNOWN`) and best-effort runtime metadata by scanning a bounded prefix of a Spark event log — single file or Databricks rolling directory — without invoking the Scala tools pipeline. + +**Architecture:** Four cohesive units inside a single module under `user_tools/src/spark_rapids_tools/tools/`: path resolver, stream opener (context manager, codec-aware), bounded event scanner (streaming only, no accumulation), pure-function classifier. A single-source-of-truth markers file pins the key strings/regexes to Scala source references. Public surface is the function plus three dataclasses/enums (`Route`, `SparkRuntime`, `DetectionResult`) plus a small exception hierarchy. No CLI. No output files. + +**Tech Stack:** Python 3.10+ (matches `user_tools/pyproject.toml`), stdlib (`gzip`, `json`, `io`, `re`, `contextlib`, `dataclasses`, `enum`, `pathlib`, `datetime`), `zstandard` (new hard dep), `spark_rapids_tools.storagelib.csppath.CspPath` + `spark_rapids_tools.storagelib.cspfs.CspFs` for cloud/local I/O, `pytest` for tests. + +**Spec:** `docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md` + +--- + +## File Structure + +**New files (all under `user_tools/`):** + +- `src/spark_rapids_tools/tools/eventlog_detector/__init__.py` — re-exports public API. +- `src/spark_rapids_tools/tools/eventlog_detector/markers.py` — single source of truth for property keys, regexes, substrings. Each constant carries a `# Scala source: :` comment. +- `src/spark_rapids_tools/tools/eventlog_detector/types.py` — `Route`, `SparkRuntime`, `DetectionResult`, `Termination` enum, exception hierarchy. +- `src/spark_rapids_tools/tools/eventlog_detector/resolver.py` — `_resolve_event_log_files(path) -> tuple[str, list[CspPath]]`. +- `src/spark_rapids_tools/tools/eventlog_detector/stream.py` — `_open_event_log_stream(path)` context manager yielding `Iterator[str]`. +- `src/spark_rapids_tools/tools/eventlog_detector/scanner.py` — `_scan_events(lines, budget) -> _ScanResult` and `_scan_events_across(paths, budget) -> _ScanResult`. +- `src/spark_rapids_tools/tools/eventlog_detector/classifier.py` — `_classify_runtime(spark_properties) -> SparkRuntime`. +- `src/spark_rapids_tools/tools/eventlog_detector/detector.py` — top-level `detect_spark_runtime()` that ties everything together. + +**New tests (under `user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/`):** + +- `__init__.py` +- `conftest.py` — shared fixtures (synthesized event log files in `tmp_path`). +- `test_markers.py` +- `test_types.py` +- `test_classifier.py` +- `test_resolver.py` +- `test_stream.py` +- `test_scanner.py` +- `test_detector.py` — integration tests covering the public API end-to-end. +- `test_detector_fixtures.py` — runs the detector against the existing `core/src/test/resources/spark-events-*` fixtures with expected `Route` labels. + +**Modified files:** + +- `user_tools/pyproject.toml` — add `zstandard` to `dependencies`. +- `user_tools/src/spark_rapids_tools/tools/__init__.py` — add re-export line so `from spark_rapids_tools.tools import detect_spark_runtime` works (if the existing `__init__.py` follows that pattern — verify in Task 2). + +**Rationale for splitting into many small files:** the spec defines four independently testable components plus a shared types/markers layer. 
Putting each in its own file keeps files small, lets each test file focus on one component, and makes reviewer diffs surgical. This matches existing layout in `user_tools/src/spark_rapids_tools/tools/` (several single-purpose files there already). + +--- + +## Task 1: Add `zstandard` dependency + +**Files:** + +- Modify: `user_tools/pyproject.toml` + +- [ ] **Step 1: Read current dependencies block** + +Run: `sed -n '/^dependencies = \[/,/^\]/p' user_tools/pyproject.toml` +Expected: prints the current `dependencies = [ ... ]` block. `zstandard` must not be present. + +- [ ] **Step 2: Add `zstandard` to dependencies** + +Use Edit to insert a new line after the existing `"pyYAML>=6.0.2",` line (or similar stable anchor — whichever comes last alphabetically near the Z range). Add: + +``` + # Decompresses Spark event logs written with --conf spark.eventLog.compress=true + # spark.io.compression.codec=zstd (the common default). Used by + # spark_rapids_tools.tools.eventlog_detector. + "zstandard>=0.22.0", +``` + +The exact Edit call (using a unique anchor from the current file — pick the existing `# used for retrieving available memory on the host` comment block, which precedes `"psutil==7.0.0"`): + +```python +Edit( + file_path="user_tools/pyproject.toml", + old_string=' # used for retrieving available memory on the host\n "psutil==7.0.0",', + new_string=' # used for retrieving available memory on the host\n "psutil==7.0.0",\n # Decompresses Spark event logs with zstd codec. Used by\n # spark_rapids_tools.tools.eventlog_detector.\n "zstandard>=0.22.0",' +) +``` + +- [ ] **Step 3: Install the updated package locally** + +Run: `pip install -e user_tools/` +Expected: `zstandard-` appears in the output; install completes successfully. + +- [ ] **Step 4: Verify `zstandard` is importable** + +Run: `python -c "import zstandard; print(zstandard.__version__)"` +Expected: prints a version string, no `ModuleNotFoundError`. + +- [ ] **Step 5: Commit** + +```bash +git add user_tools/pyproject.toml +git commit -m "build(user_tools): add zstandard dep for event log detector" +``` + +--- + +## Task 2: Create module skeleton and `__init__.py` + +**Files:** + +- Create: `user_tools/src/spark_rapids_tools/tools/eventlog_detector/__init__.py` +- Read (do not modify yet): `user_tools/src/spark_rapids_tools/tools/__init__.py` + +- [ ] **Step 1: Read the existing `tools/__init__.py` to learn the re-export convention** + +Run: `cat user_tools/src/spark_rapids_tools/tools/__init__.py` +Record: whether the file has explicit re-exports, `__all__`, or is empty. This determines whether Task 9 needs to append anything there. + +- [ ] **Step 2: Create the package skeleton** + +Create `user_tools/src/spark_rapids_tools/tools/eventlog_detector/__init__.py`: + +```python +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Lightweight event log runtime detector. + +Public entry point: ``detect_spark_runtime``. 
Returns a ``DetectionResult`` +carrying a routing decision (``QUALIFICATION`` / ``PROFILING`` / ``UNKNOWN``) +and best-effort metadata, by scanning a bounded prefix of a Spark event log. + +See docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md +for the full contract and the Scala sources this mirrors. +""" + +from spark_rapids_tools.tools.eventlog_detector.detector import detect_spark_runtime +from spark_rapids_tools.tools.eventlog_detector.types import ( + DetectionResult, + EventLogDetectionError, + EventLogReadError, + Route, + SparkRuntime, + UnsupportedCompressionError, + UnsupportedInputError, +) + +__all__ = [ + "DetectionResult", + "EventLogDetectionError", + "EventLogReadError", + "Route", + "SparkRuntime", + "UnsupportedCompressionError", + "UnsupportedInputError", + "detect_spark_runtime", +] +``` + +This will `ImportError` until the submodules exist. That's fine; subsequent tasks fill them in. We create it first so each later task's "does it import?" smoke test covers the integration path. + +- [ ] **Step 3: Create empty test package** + +Create `user_tools/tests/spark_rapids_tools_ut/tools/__init__.py` (empty) and `user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/__init__.py` (empty). First check whether `tools/__init__.py` already exists: + +Run: `test -f user_tools/tests/spark_rapids_tools_ut/tools/__init__.py && echo EXISTS || echo MISSING` + +If `MISSING`, create it with the Apache 2 header as a docstring module: + +```python +# Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +"""Tests for ``spark_rapids_tools.tools``.""" +``` + +Then create `user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/__init__.py` with the same header. + +- [ ] **Step 4: Commit** + +```bash +git add user_tools/src/spark_rapids_tools/tools/eventlog_detector/__init__.py \ + user_tools/tests/spark_rapids_tools_ut/tools/__init__.py \ + user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/__init__.py +git commit -m "feat(eventlog_detector): package skeleton and public re-exports" +``` + +Note: the package re-exports will fail to resolve until Task 3/9, but the commit is self-contained as a new empty package. Do not run the imports yet. + +--- + +## Task 3: Types — `Route`, `SparkRuntime`, `DetectionResult`, exceptions + +**Files:** + +- Create: `user_tools/src/spark_rapids_tools/tools/eventlog_detector/types.py` +- Test: `user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py` + +- [ ] **Step 1: Write the failing test** + +Create `user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py`: + +```python +# Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +"""Unit tests for ``eventlog_detector.types``.""" + +import pytest + +from spark_rapids_tools.tools.eventlog_detector.types import ( + DetectionResult, + EventLogDetectionError, + EventLogReadError, + Route, + SparkRuntime, + Termination, + UnsupportedCompressionError, + UnsupportedInputError, +) + + +class TestRoute: + def test_has_three_values(self): + assert {r.value for r in Route} == {"QUALIFICATION", "PROFILING", "UNKNOWN"} + + def test_is_string_enum(self): + # str subclass means aether can compare against plain strings. + assert Route.PROFILING == "PROFILING" + + +class TestSparkRuntime: + def test_values_match_scala_enum_exactly(self): + # These strings match org.apache.spark.sql.rapids.tool.util.SparkRuntime + # which aether already persists in JobRun.spark_runtime. 
+ assert {r.value for r in SparkRuntime} == { + "SPARK", + "SPARK_RAPIDS", + "PHOTON", + "AURON", + } + + def test_is_string_enum(self): + assert SparkRuntime.SPARK_RAPIDS == "SPARK_RAPIDS" + + +class TestTermination: + def test_has_three_modes(self): + assert {t.name for t in Termination} == {"DECISIVE", "EXHAUSTED", "CAP_HIT"} + + +class TestDetectionResult: + def test_frozen_dataclass(self): + result = DetectionResult( + route=Route.PROFILING, + spark_runtime=SparkRuntime.SPARK_RAPIDS, + app_id="app-1", + spark_version="3.5.1", + event_log_path="/tmp/x", + source_path="/tmp/x", + reason="decisive: classified as SPARK_RAPIDS", + ) + with pytest.raises((AttributeError, Exception)): + result.route = Route.UNKNOWN # type: ignore[misc] + + def test_accepts_optional_fields_as_none(self): + result = DetectionResult( + route=Route.UNKNOWN, + spark_runtime=None, + app_id=None, + spark_version=None, + event_log_path="/tmp/x", + source_path="/tmp/x", + reason="no decisive signal within bounded scan", + ) + assert result.route is Route.UNKNOWN + assert result.spark_runtime is None + + +class TestExceptionHierarchy: + def test_all_errors_subclass_base(self): + for cls in ( + UnsupportedInputError, + UnsupportedCompressionError, + EventLogReadError, + ): + assert issubclass(cls, EventLogDetectionError) + + def test_base_is_value_error(self): + # Using ValueError as the common ancestor so callers who catch + # ValueError (a reasonable default for bad input) still see these. + assert issubclass(EventLogDetectionError, ValueError) +``` + +- [ ] **Step 2: Run the test and confirm it fails** + +Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py -v` +Expected: `ModuleNotFoundError: No module named 'spark_rapids_tools.tools.eventlog_detector.types'`. + +- [ ] **Step 3: Implement `types.py`** + +Create `user_tools/src/spark_rapids_tools/tools/eventlog_detector/types.py`: + +```python +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Types, enums, and exceptions for the event log runtime detector.""" + +from dataclasses import dataclass +from enum import Enum +from typing import Optional + + +class Route(str, Enum): + """Routing decision returned to the caller.""" + + QUALIFICATION = "QUALIFICATION" + PROFILING = "PROFILING" + UNKNOWN = "UNKNOWN" + + +class SparkRuntime(str, Enum): + """Runtime taxonomy. + + Values match ``org.apache.spark.sql.rapids.tool.util.SparkRuntime`` in the + Scala core so aether's ``JobRun.spark_runtime`` column stays compatible. + """ + + SPARK = "SPARK" + SPARK_RAPIDS = "SPARK_RAPIDS" + PHOTON = "PHOTON" + AURON = "AURON" + + +class Termination(Enum): + """How the scanner stopped. 
Used internally by the detector flow.""" + + DECISIVE = "DECISIVE" # classification returned non-SPARK + EXHAUSTED = "EXHAUSTED" # walked every file to EOF under the budget + CAP_HIT = "CAP_HIT" # hit max_events_scanned before exhausting files + + +@dataclass(frozen=True) +class DetectionResult: + """Result returned by ``detect_spark_runtime``. + + ``spark_runtime`` is best-effort metadata. ``None`` is valid (e.g., when + ``route`` is ``UNKNOWN`` because env-update was never seen). + """ + + route: Route + spark_runtime: Optional[SparkRuntime] + app_id: Optional[str] + spark_version: Optional[str] + event_log_path: str + source_path: str + reason: str + + +class EventLogDetectionError(ValueError): + """Base class for detector errors.""" + + +class UnsupportedInputError(EventLogDetectionError): + """Input shape is outside V1 scope (multi-app dir, wildcard, comma list, ...).""" + + +class UnsupportedCompressionError(EventLogDetectionError): + """File uses a compression codec the V1 detector does not handle.""" + + +class EventLogReadError(EventLogDetectionError): + """Wraps an underlying I/O failure when reading the event log.""" +``` + +- [ ] **Step 4: Run the test and confirm it passes** + +Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py -v` +Expected: all 7 test cases pass. + +- [ ] **Step 5: Commit** + +```bash +git add user_tools/src/spark_rapids_tools/tools/eventlog_detector/types.py \ + user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py +git commit -m "feat(eventlog_detector): routing types and exception hierarchy" +``` + +--- + +## Task 4: Markers module (single source of truth pinned to Scala) + +**Files:** + +- Create: `user_tools/src/spark_rapids_tools/tools/eventlog_detector/markers.py` +- Test: `user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_markers.py` + +- [ ] **Step 1: Write the failing test** + +Create `test_markers.py`: + +```python +# Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +"""Unit tests for ``eventlog_detector.markers``.""" + +import re + +from spark_rapids_tools.tools.eventlog_detector import markers as m + + +class TestGpuMarkers: + def test_plugin_substring_exact(self): + assert m.GPU_PLUGIN_CLASS_SUBSTRING == "com.nvidia.spark.SQLPlugin" + + def test_gpu_toggle_key_exact(self): + assert m.GPU_ENABLED_KEY == "spark.rapids.sql.enabled" + + +class TestAuronMarkers: + def test_extension_regex_fullmatches_expected_value(self): + # Mirrors AuronParseHelper.extensionRegxMap. 
+ pat = re.compile(m.AURON_EXTENSION_REGEX) + assert pat.fullmatch("org.apache.spark.sql.AuronSparkSessionExtension") + assert pat.fullmatch("whatever.AuronSparkSessionExtension.more") + assert not pat.fullmatch("org.apache.spark.sql.SomeOtherExtension") + + def test_auron_enabled_defaults_to_true(self): + assert m.AURON_ENABLED_DEFAULT == "true" + assert m.AURON_ENABLED_KEY == "spark.auron.enabled" + assert m.AURON_SPARK_EXTENSIONS_KEY == "spark.sql.extensions" + + +class TestDatabricksPrecondition: + def test_all_three_tag_keys_present(self): + assert m.DB_PRECONDITION_KEYS == ( + "spark.databricks.clusterUsageTags.clusterAllTags", + "spark.databricks.clusterUsageTags.clusterId", + "spark.databricks.clusterUsageTags.clusterName", + ) + + +class TestPhotonMarkers: + def test_marker_map_fullmatches_expected(self): + pats = {k: re.compile(v) for k, v in m.PHOTON_MARKER_REGEX.items()} + assert pats[ + "spark.databricks.clusterUsageTags.sparkVersion" + ].fullmatch("11.3.x-photon-scala2.12") + assert pats[ + "spark.databricks.clusterUsageTags.runtimeEngine" + ].fullmatch("PHOTON") + assert not pats[ + "spark.databricks.clusterUsageTags.runtimeEngine" + ].fullmatch("STANDARD") + + def test_all_four_photon_keys(self): + assert set(m.PHOTON_MARKER_REGEX) == { + "spark.databricks.clusterUsageTags.sparkVersion", + "spark.databricks.clusterUsageTags.effectiveSparkVersion", + "spark.databricks.clusterUsageTags.sparkImageLabel", + "spark.databricks.clusterUsageTags.runtimeEngine", + } + + +class TestDatabricksRollingFileName: + def test_prefix_is_eventlog(self): + assert m.DB_EVENT_LOG_FILE_PREFIX == "eventlog" + + def test_date_pattern_parses_scala_format(self): + pat = re.compile(m.DB_EVENT_LOG_DATE_REGEX) + # Scala's getDBEventLogFileDate splits on '--' and parses + # 'eventlog-YYYY-MM-DD--HH-MM[.codec]'. + assert pat.search("eventlog-2021-06-14--20-00.gz") + assert pat.search("eventlog-2021-06-14--20-00") + assert not pat.search("eventlog") # bare eventlog has no date +``` + +- [ ] **Step 2: Run the test and confirm it fails** + +Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_markers.py -v` +Expected: `ModuleNotFoundError`. + +- [ ] **Step 3: Implement `markers.py`** + +Create `user_tools/src/spark_rapids_tools/tools/eventlog_detector/markers.py`: + +```python +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Single source of truth for runtime-detection markers. + +Each constant below is pinned to a specific Scala source location. If +the Scala rule changes, update the constant here and the parity tests +under ``tests/spark_rapids_tools_ut/tools/eventlog_detector/`` will +catch any drift on a re-run. 
+""" + +from typing import Mapping, Tuple + +# --------------------------------------------------------------------------- +# SPARK_RAPIDS (GPU) markers +# Scala source: core/src/main/scala/org/apache/spark/sql/rapids/tool/ToolUtils.scala:114-121 +# --------------------------------------------------------------------------- +GPU_PLUGIN_KEY: str = "spark.plugins" +GPU_PLUGIN_CLASS_SUBSTRING: str = "com.nvidia.spark.SQLPlugin" +GPU_ENABLED_KEY: str = "spark.rapids.sql.enabled" +# Default when GPU_ENABLED_KEY is missing or unparseable as bool. Matches +# Scala ``Try { ... }.getOrElse(true)`` in isPluginEnabled. +GPU_ENABLED_DEFAULT: bool = True + +# --------------------------------------------------------------------------- +# AURON markers +# Scala source: core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/auron/AuronParseHelper.scala:149-172 +# --------------------------------------------------------------------------- +AURON_SPARK_EXTENSIONS_KEY: str = "spark.sql.extensions" +AURON_EXTENSION_REGEX: str = r".*AuronSparkSessionExtension.*" +AURON_ENABLED_KEY: str = "spark.auron.enabled" +AURON_ENABLED_DEFAULT: str = "true" + +# --------------------------------------------------------------------------- +# Databricks precondition (all three keys must be non-empty) +# Scala source: core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/db/DBPlugin.scala:45-58 +# and DatabricksParseHelper.scala:188-190 +# --------------------------------------------------------------------------- +DB_PRECONDITION_KEYS: Tuple[str, str, str] = ( + "spark.databricks.clusterUsageTags.clusterAllTags", + "spark.databricks.clusterUsageTags.clusterId", + "spark.databricks.clusterUsageTags.clusterName", +) + +# --------------------------------------------------------------------------- +# PHOTON markers (any one fullmatches once Databricks precondition holds) +# Scala source: core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/db/DatabricksParseHelper.scala:146-151 +# --------------------------------------------------------------------------- +PHOTON_MARKER_REGEX: Mapping[str, str] = { + "spark.databricks.clusterUsageTags.sparkVersion": r".*-photon-.*", + "spark.databricks.clusterUsageTags.effectiveSparkVersion": r".*-photon-.*", + "spark.databricks.clusterUsageTags.sparkImageLabel": r".*-photon-.*", + "spark.databricks.clusterUsageTags.runtimeEngine": r"PHOTON", +} + +# --------------------------------------------------------------------------- +# Databricks rolling event-log file layout +# Scala source: core/src/main/scala/com/nvidia/spark/rapids/tool/EventLogPathProcessor.scala:57 +# and :458-478 (date parse in getDBEventLogFileDate) +# --------------------------------------------------------------------------- +DB_EVENT_LOG_FILE_PREFIX: str = "eventlog" +# Matches the dated form ``eventlog-YYYY-MM-DD--HH-MM[.codec]`` used by +# ``DatabricksRollingEventLogFilesFileReader``. Bare ``eventlog`` has no +# match and is treated as "latest" (sorted last) by the resolver. 
+DB_EVENT_LOG_DATE_REGEX: str = ( + r"^eventlog-(\d{4})-(\d{2})-(\d{2})--(\d{2})-(\d{2})(?:\.[A-Za-z0-9]+)?$" +) + +# --------------------------------------------------------------------------- +# Supported Spark listener event names +# --------------------------------------------------------------------------- +EVENT_LOG_START: str = "SparkListenerLogStart" +EVENT_APPLICATION_START: str = "SparkListenerApplicationStart" +EVENT_ENVIRONMENT_UPDATE: str = "SparkListenerEnvironmentUpdate" +EVENT_SQL_EXECUTION_START: str = "org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart" +# Spark's actual SQLExecutionStart class name carries a package prefix in +# event logs. The unqualified shortname is sometimes used in test fixtures. +EVENT_SQL_EXECUTION_START_SHORTNAME: str = "SparkListenerSQLExecutionStart" +``` + +- [ ] **Step 4: Run the test and confirm it passes** + +Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_markers.py -v` +Expected: all assertions pass. + +- [ ] **Step 5: Commit** + +```bash +git add user_tools/src/spark_rapids_tools/tools/eventlog_detector/markers.py \ + user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_markers.py +git commit -m "feat(eventlog_detector): Scala-pinned markers module" +``` + +--- + +## Task 5: Classifier (pure function over a property dict) + +**Files:** + +- Create: `user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py` +- Test: `user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py` + +- [ ] **Step 1: Write the failing test** + +```python +# Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +"""Unit tests for ``eventlog_detector.classifier``.""" + +import pytest + +from spark_rapids_tools.tools.eventlog_detector.classifier import _classify_runtime +from spark_rapids_tools.tools.eventlog_detector.types import SparkRuntime + + +class TestEmptyProperties: + def test_empty_props_is_spark(self): + assert _classify_runtime({}) is SparkRuntime.SPARK + + +class TestSparkRapids: + def test_plugin_and_default_enabled(self): + props = {"spark.plugins": "foo,com.nvidia.spark.SQLPlugin,bar"} + assert _classify_runtime(props) is SparkRuntime.SPARK_RAPIDS + + def test_plugin_with_enabled_true(self): + props = { + "spark.plugins": "com.nvidia.spark.SQLPlugin", + "spark.rapids.sql.enabled": "true", + } + assert _classify_runtime(props) is SparkRuntime.SPARK_RAPIDS + + def test_plugin_with_enabled_false_demotes_to_spark(self): + props = { + "spark.plugins": "com.nvidia.spark.SQLPlugin", + "spark.rapids.sql.enabled": "false", + } + assert _classify_runtime(props) is SparkRuntime.SPARK + + def test_enabled_true_without_plugin_is_still_spark(self): + props = {"spark.rapids.sql.enabled": "true"} + assert _classify_runtime(props) is SparkRuntime.SPARK + + def test_unparseable_enabled_defaults_to_true(self): + props = { + "spark.plugins": "com.nvidia.spark.SQLPlugin", + "spark.rapids.sql.enabled": "not-a-bool", + } + assert _classify_runtime(props) is SparkRuntime.SPARK_RAPIDS + + +class TestAuron: + def test_extension_and_default_enabled(self): + props = {"spark.sql.extensions": "com.bytedance.auron.AuronSparkSessionExtension"} + assert _classify_runtime(props) is SparkRuntime.AURON + + def test_extension_and_enabled_false_demotes_to_spark(self): + props = { + "spark.sql.extensions": "com.bytedance.auron.AuronSparkSessionExtension", + "spark.auron.enabled": "FALSE", + } + assert _classify_runtime(props) is SparkRuntime.SPARK + + def 
test_auron_enabled_case_insensitive(self): + props = { + "spark.sql.extensions": "AuronSparkSessionExtension", + "spark.auron.enabled": " TrUe ", + } + assert _classify_runtime(props) is SparkRuntime.AURON + + +class TestDatabricksPhoton: + @pytest.fixture + def db_precond_props(self): + return { + "spark.databricks.clusterUsageTags.clusterAllTags": "[{...}]", + "spark.databricks.clusterUsageTags.clusterId": "1234", + "spark.databricks.clusterUsageTags.clusterName": "dev-cluster", + } + + def test_precondition_only_is_spark(self, db_precond_props): + assert _classify_runtime(db_precond_props) is SparkRuntime.SPARK + + def test_precondition_plus_photon_version(self, db_precond_props): + props = { + **db_precond_props, + "spark.databricks.clusterUsageTags.sparkVersion": "11.3.x-photon-scala2.12", + } + assert _classify_runtime(props) is SparkRuntime.PHOTON + + def test_precondition_plus_photon_engine(self, db_precond_props): + props = {**db_precond_props, "spark.databricks.clusterUsageTags.runtimeEngine": "PHOTON"} + assert _classify_runtime(props) is SparkRuntime.PHOTON + + def test_photon_marker_without_precondition_is_spark(self): + props = {"spark.databricks.clusterUsageTags.runtimeEngine": "PHOTON"} + assert _classify_runtime(props) is SparkRuntime.SPARK + + def test_photon_engine_other_value_is_spark(self, db_precond_props): + props = {**db_precond_props, "spark.databricks.clusterUsageTags.runtimeEngine": "STANDARD"} + assert _classify_runtime(props) is SparkRuntime.SPARK + + +class TestPriority: + """PHOTON > AURON > SPARK_RAPIDS > SPARK when markers coexist.""" + + def test_photon_beats_spark_rapids(self): + props = { + "spark.plugins": "com.nvidia.spark.SQLPlugin", + "spark.databricks.clusterUsageTags.clusterAllTags": "[{...}]", + "spark.databricks.clusterUsageTags.clusterId": "1", + "spark.databricks.clusterUsageTags.clusterName": "c", + "spark.databricks.clusterUsageTags.runtimeEngine": "PHOTON", + } + assert _classify_runtime(props) is SparkRuntime.PHOTON + + def test_auron_beats_spark_rapids(self): + props = { + "spark.plugins": "com.nvidia.spark.SQLPlugin", + "spark.sql.extensions": "AuronSparkSessionExtension", + } + assert _classify_runtime(props) is SparkRuntime.AURON +``` + +- [ ] **Step 2: Run the test and confirm it fails** + +Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py -v` +Expected: `ModuleNotFoundError`. + +- [ ] **Step 3: Implement the classifier** + +Create `user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py`: + +```python +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Pure-function runtime classifier. + +``_classify_runtime`` maps a merged Spark properties dict to a +``SparkRuntime``. Priority order when multiple markers are present: +``PHOTON > AURON > SPARK_RAPIDS > SPARK``. This is a deterministic Python +choice; Scala's plugin iteration order is undefined when multiple +plugins claim a runtime, but in practice markers do not overlap. 
+""" + +import re +from typing import Mapping + +from spark_rapids_tools.tools.eventlog_detector import markers as m +from spark_rapids_tools.tools.eventlog_detector.types import SparkRuntime + + +_TRUE_STRINGS = {"true", "1", "yes"} +_FALSE_STRINGS = {"false", "0", "no"} + + +def _parse_bool(raw: str, default: bool) -> bool: + """Mirror Scala's ``Try { s.toBoolean }.getOrElse(default)``.""" + stripped = raw.strip().lower() + if stripped in _TRUE_STRINGS: + return True + if stripped in _FALSE_STRINGS: + return False + return default + + +def _is_spark_rapids(props: Mapping[str, str]) -> bool: + plugins = props.get(m.GPU_PLUGIN_KEY, "") + if m.GPU_PLUGIN_CLASS_SUBSTRING not in plugins: + return False + raw = props.get(m.GPU_ENABLED_KEY) + if raw is None: + return m.GPU_ENABLED_DEFAULT + return _parse_bool(raw, default=m.GPU_ENABLED_DEFAULT) + + +def _is_auron(props: Mapping[str, str]) -> bool: + extensions = props.get(m.AURON_SPARK_EXTENSIONS_KEY) + if extensions is None or not re.fullmatch(m.AURON_EXTENSION_REGEX, extensions): + return False + enabled_raw = props.get(m.AURON_ENABLED_KEY, m.AURON_ENABLED_DEFAULT) + return enabled_raw.strip().lower() == m.AURON_ENABLED_DEFAULT + + +def _is_databricks(props: Mapping[str, str]) -> bool: + return all(props.get(k, "").strip() for k in m.DB_PRECONDITION_KEYS) + + +def _is_photon(props: Mapping[str, str]) -> bool: + if not _is_databricks(props): + return False + for key, pattern in m.PHOTON_MARKER_REGEX.items(): + value = props.get(key) + if value is not None and re.fullmatch(pattern, value): + return True + return False + + +def _classify_runtime(props: Mapping[str, str]) -> SparkRuntime: + # Priority: PHOTON > AURON > SPARK_RAPIDS > SPARK. + if _is_photon(props): + return SparkRuntime.PHOTON + if _is_auron(props): + return SparkRuntime.AURON + if _is_spark_rapids(props): + return SparkRuntime.SPARK_RAPIDS + return SparkRuntime.SPARK +``` + +- [ ] **Step 4: Run the test and confirm it passes** + +Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py -v` +Expected: all tests pass. + +- [ ] **Step 5: Commit** + +```bash +git add user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py \ + user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py +git commit -m "feat(eventlog_detector): classifier mirroring Scala priority" +``` + +--- + +## Task 6: Stream opener (context-managed codec-aware line iterator) + +**Files:** + +- Create: `user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py` +- Test: `user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py` + +- [ ] **Step 1: Write the failing test** + +Create `test_stream.py`: + +```python +# Copyright (c) 2026, NVIDIA CORPORATION. 
+# SPDX-License-Identifier: Apache-2.0 +"""Unit tests for ``eventlog_detector.stream``.""" + +import gzip +from pathlib import Path + +import pytest +import zstandard as zstd + +from spark_rapids_tools.storagelib import CspPath +from spark_rapids_tools.tools.eventlog_detector.stream import _open_event_log_stream +from spark_rapids_tools.tools.eventlog_detector.types import ( + EventLogReadError, + UnsupportedCompressionError, +) + + +SAMPLE_LINES = [ + '{"Event":"SparkListenerLogStart","Spark Version":"3.5.1"}', + '{"Event":"SparkListenerApplicationStart","App ID":"app-1"}', + '{"Event":"SparkListenerEnvironmentUpdate","Spark Properties":{}}', +] + + +def _write_plain(path: Path) -> None: + path.write_text("\n".join(SAMPLE_LINES) + "\n", encoding="utf-8") + + +def _write_gz(path: Path) -> None: + with gzip.open(path, "wt", encoding="utf-8") as fh: + fh.write("\n".join(SAMPLE_LINES) + "\n") + + +def _write_zstd(path: Path) -> None: + cctx = zstd.ZstdCompressor() + raw = ("\n".join(SAMPLE_LINES) + "\n").encode("utf-8") + path.write_bytes(cctx.compress(raw)) + + +@pytest.fixture +def plain_file(tmp_path: Path) -> CspPath: + p = tmp_path / "eventlog.inprogress" + _write_plain(p) + return CspPath(str(p)) + + +@pytest.fixture +def gz_file(tmp_path: Path) -> CspPath: + p = tmp_path / "eventlog.gz" + _write_gz(p) + return CspPath(str(p)) + + +@pytest.fixture +def zstd_file(tmp_path: Path) -> CspPath: + p = tmp_path / "eventlog.zstd" + _write_zstd(p) + return CspPath(str(p)) + + +class TestPlainStream: + def test_yields_all_lines(self, plain_file): + with _open_event_log_stream(plain_file) as lines: + collected = [ln for ln in lines] + assert collected == SAMPLE_LINES + + +class TestGzipStream: + def test_yields_all_lines(self, gz_file): + with _open_event_log_stream(gz_file) as lines: + collected = [ln for ln in lines] + assert collected == SAMPLE_LINES + + +class TestZstdStream: + def test_yields_all_lines(self, zstd_file): + with _open_event_log_stream(zstd_file) as lines: + collected = [ln for ln in lines] + assert collected == SAMPLE_LINES + + def test_zst_short_suffix_also_works(self, tmp_path): + p = tmp_path / "eventlog.zst" + _write_zstd(p) + with _open_event_log_stream(CspPath(str(p))) as lines: + collected = [ln for ln in lines] + assert collected == SAMPLE_LINES + + +class TestUnsupportedCompression: + def test_lz4_raises(self, tmp_path): + p = tmp_path / "eventlog.lz4" + p.write_bytes(b"not-real-lz4") + with pytest.raises(UnsupportedCompressionError): + with _open_event_log_stream(CspPath(str(p))) as _: + pass + + def test_snappy_raises(self, tmp_path): + p = tmp_path / "eventlog.snappy" + p.write_bytes(b"not-real-snappy") + with pytest.raises(UnsupportedCompressionError): + with _open_event_log_stream(CspPath(str(p))) as _: + pass + + def test_lzf_raises(self, tmp_path): + p = tmp_path / "eventlog.lzf" + p.write_bytes(b"not-real-lzf") + with pytest.raises(UnsupportedCompressionError): + with _open_event_log_stream(CspPath(str(p))) as _: + pass + + +class TestIoFailure: + def test_missing_file_raises_read_error(self, tmp_path): + p = tmp_path / "does-not-exist" + with pytest.raises(EventLogReadError): + with _open_event_log_stream(CspPath(str(p))) as lines: + next(iter(lines)) +``` + +- [ ] **Step 2: Run the test and confirm it fails** + +Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py -v` +Expected: `ModuleNotFoundError`. 
+ +- [ ] **Step 3: Implement the stream opener** + +Create `user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py`: + +```python +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Codec-aware context-managed line streamer for Spark event logs. + +Opens the file via ``CspPath.open_input_stream()``, applies the right +decompression layer based on extension, wraps in a text decoder, and +yields an ``Iterator[str]``. On exit the context manager closes every +layer in reverse order. Streaming only — no buffering of the full file. +""" + +import contextlib +import gzip +import io +from typing import Iterator + +import zstandard as zstd + +from spark_rapids_tools.storagelib import CspPath +from spark_rapids_tools.tools.eventlog_detector.types import ( + EventLogReadError, + UnsupportedCompressionError, +) + + +_GZIP_SUFFIXES = {".gz"} +_ZSTD_SUFFIXES = {".zstd", ".zst"} +# Suffixes that unambiguously indicate a codec we do not support in V1. +_UNSUPPORTED_CODEC_SUFFIXES = {".lz4", ".lzf", ".snappy"} +# Treated as plain text. +_PLAIN_SUFFIXES = {".inprogress", ""} + + +def _classify_suffix(path: CspPath) -> str: + name = path.base_name().lower() + dot = name.rfind(".") + if dot < 0: + return "" + return name[dot:] + + +@contextlib.contextmanager +def _open_event_log_stream(path: CspPath) -> Iterator[Iterator[str]]: + suffix = _classify_suffix(path) + if suffix in _UNSUPPORTED_CODEC_SUFFIXES: + raise UnsupportedCompressionError( + f"Compression codec '{suffix}' is not supported by the lightweight " + "event log detector. Fall back to the full qualification/profiling " + "pipeline for this log." + ) + + try: + byte_stream = path.open_input_stream() + except Exception as exc: + raise EventLogReadError(f"Failed to open event log {path}: {exc}") from exc + + close_stack = contextlib.ExitStack() + close_stack.callback(byte_stream.close) + try: + if suffix in _GZIP_SUFFIXES: + decompressed: io.IOBase = gzip.GzipFile(fileobj=byte_stream, mode="rb") + close_stack.callback(decompressed.close) + elif suffix in _ZSTD_SUFFIXES: + dctx = zstd.ZstdDecompressor() + # stream_reader supports .read(); we need a readable binary layer + # below TextIOWrapper. read1 emulation is good enough for line iter. + decompressed = dctx.stream_reader(byte_stream) + close_stack.callback(decompressed.close) + elif suffix in _PLAIN_SUFFIXES or suffix not in _UNSUPPORTED_CODEC_SUFFIXES: + # Unknown/empty suffix → best-effort treat as plain text. If the + # file is actually compressed with an unknown codec the scanner + # will simply see garbled lines that don't parse as JSON and be + # skipped; env-update will never be reached and the caller will + # see Route.UNKNOWN. That is the right failure mode here. + decompressed = byte_stream + else: # pragma: no cover — every branch covered above + raise UnsupportedCompressionError(f"Unsupported suffix: {suffix}") + + # Line-at-a-time text iterator over the decompressed stream. 
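+        # errors="replace" keeps a stray undecodable byte from aborting the
+        # scan; newline="" leaves line endings intact for the rstrip below.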
+ text = io.TextIOWrapper(decompressed, encoding="utf-8", errors="replace", newline="") + close_stack.callback(text.close) + + def line_iter() -> Iterator[str]: + for raw in text: + # Strip the trailing newline to match the "one event per line" + # contract. Empty lines are legal and skipped by the caller. + yield raw.rstrip("\r\n") + + try: + yield line_iter() + except Exception as exc: + # Convert any read-time I/O error into a typed domain error. + raise EventLogReadError(f"Error reading event log {path}: {exc}") from exc + finally: + close_stack.close() +``` + +Note on the suffix decision tree: `_PLAIN_SUFFIXES` intentionally includes `""` (no extension) so paths like `eventlog` or `foo/events_1_app-xyz` are read as plain text. + +- [ ] **Step 4: Run the test and confirm it passes** + +Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py -v` +Expected: all tests pass. + +- [ ] **Step 5: Commit** + +```bash +git add user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py \ + user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py +git commit -m "feat(eventlog_detector): context-managed codec-aware line streamer" +``` + +--- + +## Task 7: Path resolver (single file + Databricks rolling dir) + +**Files:** + +- Create: `user_tools/src/spark_rapids_tools/tools/eventlog_detector/resolver.py` +- Test: `user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py` + +- [ ] **Step 1: Write the failing test** + +```python +# Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +"""Unit tests for ``eventlog_detector.resolver``.""" + +from datetime import datetime +from pathlib import Path + +import pytest + +from spark_rapids_tools.storagelib import CspPath +from spark_rapids_tools.tools.eventlog_detector.resolver import ( + _parse_databricks_file_datetime, + _resolve_event_log_files, +) +from spark_rapids_tools.tools.eventlog_detector.types import UnsupportedInputError + + +class TestDatabricksDateParse: + def test_bare_eventlog_is_latest_sentinel(self): + # Returns None; caller treats None as "sort last". + assert _parse_databricks_file_datetime("eventlog") is None + + def test_dated_file_parses(self): + dt = _parse_databricks_file_datetime("eventlog-2021-06-14--20-00.gz") + assert dt == datetime(2021, 6, 14, 20, 0) + + def test_dated_file_without_codec_parses(self): + dt = _parse_databricks_file_datetime("eventlog-2022-01-02--03-04") + assert dt == datetime(2022, 1, 2, 3, 4) + + def test_non_eventlog_prefix_returns_none(self): + assert _parse_databricks_file_datetime("application_1234.log") is None + + +class TestResolveSingleFile: + def test_single_file_returns_single_element_list(self, tmp_path: Path): + f = tmp_path / "eventlog.zstd" + f.write_bytes(b"x") + source, files = _resolve_event_log_files(CspPath(str(f))) + assert source == str(f) + assert [p.base_name() for p in files] == ["eventlog.zstd"] + + +class TestResolveDatabricksRollingDir: + def test_orders_earliest_first_and_bare_eventlog_last(self, tmp_path: Path): + d = tmp_path / "dbrolling" + d.mkdir() + (d / "eventlog").write_bytes(b"") + (d / "eventlog-2021-06-14--20-00.gz").write_bytes(b"") + (d / "eventlog-2021-06-14--18-00.gz").write_bytes(b"") + source, files = _resolve_event_log_files(CspPath(str(d))) + assert source == str(d) + names = [p.base_name() for p in files] + # Earliest dated file first; bare `eventlog` sorts last (treated as + # "current/latest" per Scala). 
+ assert names == [ + "eventlog-2021-06-14--18-00.gz", + "eventlog-2021-06-14--20-00.gz", + "eventlog", + ] + + def test_dir_with_no_eventlog_prefix_raises(self, tmp_path: Path): + d = tmp_path / "empty" + d.mkdir() + (d / "application_1.log").write_bytes(b"") + with pytest.raises(UnsupportedInputError): + _resolve_event_log_files(CspPath(str(d))) + + def test_empty_dir_raises(self, tmp_path: Path): + d = tmp_path / "blank" + d.mkdir() + with pytest.raises(UnsupportedInputError): + _resolve_event_log_files(CspPath(str(d))) + + +class TestResolveUnsupportedShapes: + def test_spark_native_rolling_dir_raises(self, tmp_path: Path): + d = tmp_path / "eventlog_v2_local-1623876083964" + d.mkdir() + (d / "events_1_local-1623876083964").write_bytes(b"") + with pytest.raises(UnsupportedInputError): + _resolve_event_log_files(CspPath(str(d))) + + def test_generic_multi_app_dir_raises(self, tmp_path: Path): + d = tmp_path / "multi" + d.mkdir() + (d / "app-1.zstd").write_bytes(b"") + (d / "app-2.zstd").write_bytes(b"") + with pytest.raises(UnsupportedInputError): + _resolve_event_log_files(CspPath(str(d))) +``` + +- [ ] **Step 2: Run the test and confirm it fails** + +Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py -v` +Expected: `ModuleNotFoundError`. + +- [ ] **Step 3: Implement the resolver** + +Create `user_tools/src/spark_rapids_tools/tools/eventlog_detector/resolver.py`: + +```python +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Input-path resolution for the event log detector. + +Resolves one user-supplied path into a list of one or more concrete files +to read, in scan order. Supports a single file or a Databricks rolling +directory. Spark-native rolling, generic multi-app directories, wildcards, +and comma lists raise ``UnsupportedInputError`` and are expected to fall +back to the full Scala pipeline. +""" + +import re +from datetime import datetime +from typing import List, Optional, Tuple + +from spark_rapids_tools.storagelib import CspFs, CspPath +from spark_rapids_tools.tools.eventlog_detector import markers as m +from spark_rapids_tools.tools.eventlog_detector.types import UnsupportedInputError + + +_DB_DATE_PATTERN = re.compile(m.DB_EVENT_LOG_DATE_REGEX) + + +def _parse_databricks_file_datetime(name: str) -> Optional[datetime]: + """Parse ``eventlog-YYYY-MM-DD--HH-MM[.codec]`` to a datetime. + + Returns ``None`` for bare ``eventlog`` and any name that does not match + the dated pattern. The caller sorts ``None`` as "latest/current" to + mirror Scala's ``getDBEventLogFileDate`` which defaults to ``now()``. 
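+
+    Example: ``eventlog-2021-06-14--20-00.gz`` -> ``datetime(2021, 6, 14, 20, 0)``.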
+ """ + if not name.startswith(m.DB_EVENT_LOG_FILE_PREFIX): + return None + match = _DB_DATE_PATTERN.match(name) + if match is None: + return None + year, month, day, hour, minute = (int(g) for g in match.groups()) + return datetime(year, month, day, hour, minute) + + +def _is_databricks_event_log_filename(name: str) -> bool: + return name.startswith(m.DB_EVENT_LOG_FILE_PREFIX) + + +def _resolve_event_log_files(path: CspPath) -> Tuple[str, List[CspPath]]: + """Resolve ``path`` to an ordered list of files to scan. + + Returns ``(source, files)`` where ``source`` is the original input + rendered as a string (preserved for the ``DetectionResult``) and + ``files`` is the scan order. + """ + source = str(path) + + if path.is_file(): + return source, [path] + + if not path.is_dir(): + raise UnsupportedInputError( + f"Path is neither a file nor a directory: {source}" + ) + + # Directory: must be a Databricks rolling dir. Spark-native rolling + # (eventlog_v2_*) and generic multi-app directories are out of scope. + children = CspFs.list_all_files(path) + db_files = [c for c in children if _is_databricks_event_log_filename(c.base_name())] + if not db_files: + raise UnsupportedInputError( + f"Directory {source} is not a supported input shape. The detector " + "handles single files or Databricks rolling directories only; fall " + "back to the full pipeline for Spark-native rolling, multi-app " + "directories, wildcards, or comma-separated inputs." + ) + + # Sort mirroring DatabricksRollingEventLogFilesFileReader: dated files + # ascending by parsed datetime, bare `eventlog` last (treated as + # "latest/current"). Stable sort on filename first to keep ordering + # deterministic among equal-date files (extremely unlikely in practice + # but cheap insurance for tests). + db_files.sort(key=lambda f: f.base_name()) + db_files.sort( + key=lambda f: ( + _parse_databricks_file_datetime(f.base_name()) or datetime.max, + ) + ) + return source, db_files +``` + +- [ ] **Step 4: Run the test and confirm it passes** + +Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py -v` +Expected: all tests pass. + +- [ ] **Step 5: Commit** + +```bash +git add user_tools/src/spark_rapids_tools/tools/eventlog_detector/resolver.py \ + user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py +git commit -m "feat(eventlog_detector): single-file and Databricks rolling-dir resolver" +``` + +--- + +## Task 8: Event scanner (streaming, bounded, multi-file) + +**Files:** + +- Create: `user_tools/src/spark_rapids_tools/tools/eventlog_detector/scanner.py` +- Test: `user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_scanner.py` + +- [ ] **Step 1: Write the failing test** + +```python +# Copyright (c) 2026, NVIDIA CORPORATION. 
+# SPDX-License-Identifier: Apache-2.0 +"""Unit tests for ``eventlog_detector.scanner``.""" + +from pathlib import Path +from typing import List + +import pytest + +from spark_rapids_tools.storagelib import CspPath +from spark_rapids_tools.tools.eventlog_detector.scanner import ( + _scan_events, + _scan_events_across, +) +from spark_rapids_tools.tools.eventlog_detector.types import ( + SparkRuntime, + Termination, +) + + +# ---------- Line builders ---------- + +def env_update(props: dict) -> str: + import json + + return json.dumps( + { + "Event": "SparkListenerEnvironmentUpdate", + "Spark Properties": props, + "System Properties": {}, + "Classpath Entries": {}, + "JVM Information": {}, + } + ) + + +def log_start(version: str = "3.5.1") -> str: + import json + + return json.dumps({"Event": "SparkListenerLogStart", "Spark Version": version}) + + +def app_start(app_id: str = "app-1", app_name: str = "App") -> str: + import json + + return json.dumps( + { + "Event": "SparkListenerApplicationStart", + "App ID": app_id, + "App Name": app_name, + } + ) + + +def sql_exec_start(modified_configs: dict) -> str: + import json + + return json.dumps( + { + "Event": "org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart", + "executionId": 0, + "description": "x", + "details": "", + "physicalPlanDescription": "", + "sparkPlanInfo": {}, + "time": 0, + "modifiedConfigs": modified_configs, + } + ) + + +# ---------- Tests for _scan_events (single stream) ---------- + +class TestScanEvents: + def test_env_update_with_gpu_is_decisive(self): + lines = iter( + [ + log_start(), + app_start(), + env_update({"spark.plugins": "com.nvidia.spark.SQLPlugin"}), + ] + ) + result = _scan_events(lines, budget=100) + assert result.env_update_seen is True + assert result.app_id == "app-1" + assert result.spark_version == "3.5.1" + assert result.termination is Termination.DECISIVE + + def test_env_update_cpu_then_sql_start_flips_to_gpu(self): + lines = iter( + [ + log_start(), + app_start(), + env_update({"spark.master": "local"}), + sql_exec_start({"spark.plugins": "com.nvidia.spark.SQLPlugin"}), + ] + ) + result = _scan_events(lines, budget=100) + assert result.termination is Termination.DECISIVE + + def test_cpu_only_to_eof_is_exhausted(self): + lines = iter([log_start(), app_start(), env_update({"spark.master": "local"})]) + result = _scan_events(lines, budget=100) + assert result.env_update_seen is True + assert result.termination is Termination.EXHAUSTED + + def test_no_env_update_within_budget_is_cap_hit(self): + # Budget less than the number of events, none of them env-update. + lines = iter([log_start()] * 5) + result = _scan_events(lines, budget=2) + assert result.env_update_seen is False + assert result.termination is Termination.CAP_HIT + + def test_no_env_update_to_eof_is_exhausted_without_env(self): + lines = iter([log_start(), app_start()]) + result = _scan_events(lines, budget=100) + assert result.env_update_seen is False + assert result.termination is Termination.EXHAUSTED + + def test_malformed_json_lines_are_skipped(self): + lines = iter( + [ + "not-json-at-all", + log_start(), + "", + app_start(), + env_update({"spark.master": "local"}), + ] + ) + result = _scan_events(lines, budget=100) + assert result.env_update_seen is True + assert result.app_id == "app-1" + + def test_later_sql_start_overwrites_earlier_property(self): + # last-write-wins merge, matching CacheablePropsHandler.mergeModifiedConfigs. 
+ lines = iter( + [ + env_update({"spark.rapids.sql.enabled": "false", "spark.plugins": "com.nvidia.spark.SQLPlugin"}), + sql_exec_start({"spark.rapids.sql.enabled": "true"}), + ] + ) + result = _scan_events(lines, budget=100) + assert result.termination is Termination.DECISIVE + # Final accumulated props reflect the merge. + assert result.spark_properties["spark.rapids.sql.enabled"] == "true" + + +# ---------- Tests for _scan_events_across (multi-file) ---------- + +def _write(path: Path, lines: List[str]) -> CspPath: + path.write_text("\n".join(lines) + "\n", encoding="utf-8") + return CspPath(str(path)) + + +class TestScanEventsAcross: + def test_gpu_signal_in_second_file_is_decisive(self, tmp_path): + f1 = _write( + tmp_path / "eventlog-2021-06-14--18-00", + [log_start(), app_start(), env_update({"spark.master": "local"})], + ) + f2 = _write( + tmp_path / "eventlog-2021-06-14--20-00", + [sql_exec_start({"spark.plugins": "com.nvidia.spark.SQLPlugin"})], + ) + result = _scan_events_across([f1, f2], budget=100) + assert result.termination is Termination.DECISIVE + + def test_shared_budget_applied_across_files(self, tmp_path): + # 3 events in first file, 3 in second. Budget = 4. Second file stops + # after one event, before any GPU signal. + f1 = _write(tmp_path / "a", [log_start(), app_start(), env_update({"spark.master": "local"})]) + f2 = _write( + tmp_path / "b", + [ + sql_exec_start({"spark.master": "still-cpu"}), + sql_exec_start({"spark.plugins": "com.nvidia.spark.SQLPlugin"}), + sql_exec_start({"x": "y"}), + ], + ) + result = _scan_events_across([f1, f2], budget=4) + assert result.termination is Termination.CAP_HIT + + def test_all_files_exhausted_returns_exhausted(self, tmp_path): + f1 = _write(tmp_path / "a", [env_update({"spark.master": "local"})]) + result = _scan_events_across([f1], budget=100) + assert result.termination is Termination.EXHAUSTED +``` + +- [ ] **Step 2: Run the test and confirm it fails** + +Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_scanner.py -v` +Expected: `ModuleNotFoundError`. + +- [ ] **Step 3: Implement the scanner** + +Create `user_tools/src/spark_rapids_tools/tools/eventlog_detector/scanner.py`: + +```python +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Bounded streaming event scanner. + +Walks JSON-per-line event logs under a shared event budget, merges +properties from startup and per-SQL events into a single mutable dict, +and reports early-stop/exhausted/cap-hit termination. Strictly streaming: +no slurping, no accumulating raw events. 
+""" + +import json +from dataclasses import dataclass, field +from typing import Dict, Iterable, Iterator, List, Optional + +from spark_rapids_tools.storagelib import CspPath +from spark_rapids_tools.tools.eventlog_detector import markers as m +from spark_rapids_tools.tools.eventlog_detector.classifier import _classify_runtime +from spark_rapids_tools.tools.eventlog_detector.stream import _open_event_log_stream +from spark_rapids_tools.tools.eventlog_detector.types import SparkRuntime, Termination + + +@dataclass +class _ScanResult: + spark_properties: Dict[str, str] = field(default_factory=dict) + app_id: Optional[str] = None + app_name: Optional[str] = None + spark_version: Optional[str] = None + env_update_seen: bool = False + events_scanned: int = 0 + termination: Termination = Termination.EXHAUSTED + + +def _scan_events( + lines: Iterable[str], + *, + budget: int, + state: Optional[_ScanResult] = None, +) -> _ScanResult: + """Scan one stream of lines, optionally continuing from a prior state. + + Returns the updated ``_ScanResult``. Terminates as soon as classification + turns non-SPARK (``DECISIVE``), or when the budget is exhausted + (``CAP_HIT``), or when ``lines`` is fully consumed (``EXHAUSTED``). + """ + result = state if state is not None else _ScanResult() + + for raw in lines: + if result.events_scanned >= budget: + result.termination = Termination.CAP_HIT + return result + + if not raw: + continue + + try: + event = json.loads(raw) + except (json.JSONDecodeError, ValueError): + # Malformed JSON line (Spark tolerates trailing partials). Count + # it against the budget so a pathological log can't stall us. + result.events_scanned += 1 + continue + + result.events_scanned += 1 + name = event.get("Event") + if name == m.EVENT_LOG_START: + version = event.get("Spark Version") + if isinstance(version, str): + result.spark_version = version + elif name == m.EVENT_APPLICATION_START: + app_id = event.get("App ID") + app_name = event.get("App Name") + if isinstance(app_id, str): + result.app_id = app_id + if isinstance(app_name, str): + result.app_name = app_name + elif name == m.EVENT_ENVIRONMENT_UPDATE: + props = event.get("Spark Properties") or {} + if isinstance(props, dict): + for k, v in props.items(): + if isinstance(k, str) and isinstance(v, str): + result.spark_properties[k] = v + result.env_update_seen = True + if _classify_runtime(result.spark_properties) is not SparkRuntime.SPARK: + result.termination = Termination.DECISIVE + return result + elif name in (m.EVENT_SQL_EXECUTION_START, m.EVENT_SQL_EXECUTION_START_SHORTNAME): + modified = event.get("modifiedConfigs") or {} + if isinstance(modified, dict) and modified: + for k, v in modified.items(): + if isinstance(k, str) and isinstance(v, str): + result.spark_properties[k] = v + if result.env_update_seen and ( + _classify_runtime(result.spark_properties) is not SparkRuntime.SPARK + ): + result.termination = Termination.DECISIVE + return result + + # Fully consumed without early-stop or budget exhaustion. 
+ result.termination = Termination.EXHAUSTED + return result + + +def _scan_events_across(files: List[CspPath], *, budget: int) -> _ScanResult: + """Walk ``files`` in order under a single shared ``budget``.""" + state = _ScanResult() + for path in files: + if state.events_scanned >= budget: + state.termination = Termination.CAP_HIT + return state + with _open_event_log_stream(path) as lines: + state = _scan_events(lines, budget=budget, state=state) + if state.termination in (Termination.DECISIVE, Termination.CAP_HIT): + return state + # All files consumed. + state.termination = Termination.EXHAUSTED + return state +``` + +- [ ] **Step 4: Run the test and confirm it passes** + +Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_scanner.py -v` +Expected: all tests pass. + +- [ ] **Step 5: Commit** + +```bash +git add user_tools/src/spark_rapids_tools/tools/eventlog_detector/scanner.py \ + user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_scanner.py +git commit -m "feat(eventlog_detector): bounded streaming scanner across files" +``` + +--- + +## Task 9: Top-level `detect_spark_runtime` + +**Files:** + +- Create: `user_tools/src/spark_rapids_tools/tools/eventlog_detector/detector.py` +- Test: `user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py` + +- [ ] **Step 1: Write the failing integration test** + +```python +# Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +"""Integration tests for ``eventlog_detector.detect_spark_runtime``.""" + +import gzip +import json +from pathlib import Path + +import pytest +import zstandard as zstd + +from spark_rapids_tools.storagelib import CspPath +from spark_rapids_tools.tools.eventlog_detector import detect_spark_runtime +from spark_rapids_tools.tools.eventlog_detector.types import ( + Route, + SparkRuntime, + UnsupportedInputError, +) + + +def env_update(props: dict) -> dict: + return { + "Event": "SparkListenerEnvironmentUpdate", + "Spark Properties": props, + "System Properties": {}, + "Classpath Entries": {}, + "JVM Information": {}, + } + + +def _write_plain_log(path: Path, events: list) -> None: + path.write_text( + "\n".join(json.dumps(e) for e in events) + "\n", encoding="utf-8" + ) + + +class TestAcceptsStringPath: + def test_str_input_resolves(self, tmp_path): + log = tmp_path / "eventlog" + _write_plain_log( + log, + [ + {"Event": "SparkListenerLogStart", "Spark Version": "3.5.1"}, + {"Event": "SparkListenerApplicationStart", "App ID": "a", "App Name": "A"}, + env_update({"spark.master": "local"}), + ], + ) + result = detect_spark_runtime(str(log)) + assert result.route is Route.QUALIFICATION + assert result.spark_runtime is SparkRuntime.SPARK + + +class TestGpuLog: + def test_env_update_with_plugin_classifies_as_profiling(self, tmp_path): + log = tmp_path / "eventlog" + _write_plain_log( + log, + [ + {"Event": "SparkListenerLogStart", "Spark Version": "3.5.1"}, + {"Event": "SparkListenerApplicationStart", "App ID": "g", "App Name": "G"}, + env_update({"spark.plugins": "com.nvidia.spark.SQLPlugin"}), + ], + ) + result = detect_spark_runtime(CspPath(str(log))) + assert result.route is Route.PROFILING + assert result.spark_runtime is SparkRuntime.SPARK_RAPIDS + assert result.app_id == "g" + assert result.spark_version == "3.5.1" + + +class TestCapHit: + def test_no_env_update_before_cap_is_unknown(self, tmp_path): + log = tmp_path / "eventlog" + # Many LogStart events, no env-update. Cap hits first. 
+ _write_plain_log( + log, + [{"Event": "SparkListenerLogStart", "Spark Version": "3.5.1"}] * 10, + ) + result = detect_spark_runtime(str(log), max_events_scanned=5) + assert result.route is Route.UNKNOWN + assert result.spark_runtime is None + assert "no decisive signal" in result.reason + + +class TestDatabricksRolling: + def test_gpu_in_later_rolled_file(self, tmp_path): + d = tmp_path / "dbrolling" + d.mkdir() + _write_plain_log( + d / "eventlog-2021-06-14--18-00", + [ + {"Event": "SparkListenerLogStart", "Spark Version": "3.5.1"}, + {"Event": "SparkListenerApplicationStart", "App ID": "d", "App Name": "D"}, + env_update({"spark.master": "local"}), + ], + ) + _write_plain_log( + d / "eventlog", + [ + { + "Event": "org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart", + "executionId": 0, + "description": "", + "details": "", + "physicalPlanDescription": "", + "sparkPlanInfo": {}, + "time": 0, + "modifiedConfigs": {"spark.plugins": "com.nvidia.spark.SQLPlugin"}, + } + ], + ) + result = detect_spark_runtime(CspPath(str(d))) + assert result.route is Route.PROFILING + assert result.spark_runtime is SparkRuntime.SPARK_RAPIDS + + +class TestUnsupportedInput: + def test_spark_native_rolling_dir_raises(self, tmp_path): + d = tmp_path / "eventlog_v2_local-1623876083964" + d.mkdir() + (d / "events_1_local-1623876083964").write_bytes(b"") + with pytest.raises(UnsupportedInputError): + detect_spark_runtime(CspPath(str(d))) + + +class TestReasonStrings: + def test_reason_mentions_runtime_on_profiling(self, tmp_path): + log = tmp_path / "eventlog" + _write_plain_log( + log, + [ + env_update({"spark.plugins": "com.nvidia.spark.SQLPlugin"}), + ], + ) + result = detect_spark_runtime(str(log)) + assert "SPARK_RAPIDS" in result.reason + + def test_reason_mentions_full_log_on_qualification(self, tmp_path): + log = tmp_path / "eventlog" + _write_plain_log(log, [env_update({"spark.master": "local"})]) + result = detect_spark_runtime(str(log)) + assert result.route is Route.QUALIFICATION + assert "walked full log" in result.reason +``` + +- [ ] **Step 2: Run the test and confirm it fails** + +Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py -v` +Expected: `ModuleNotFoundError` (detector module missing). + +- [ ] **Step 3: Implement the detector** + +Create `user_tools/src/spark_rapids_tools/tools/eventlog_detector/detector.py`: + +```python +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Top-level event log runtime detector. + +``detect_spark_runtime(path)`` returns a ``DetectionResult`` carrying the +routing decision and best-effort metadata. On inconclusive input, returns +``Route.UNKNOWN`` rather than raising; callers fall back to the full +Scala pipeline in that case. 
+""" + +from typing import Optional, Union + +from spark_rapids_tools.storagelib import CspPath +from spark_rapids_tools.tools.eventlog_detector.classifier import _classify_runtime +from spark_rapids_tools.tools.eventlog_detector.resolver import _resolve_event_log_files +from spark_rapids_tools.tools.eventlog_detector.scanner import _scan_events_across +from spark_rapids_tools.tools.eventlog_detector.types import ( + DetectionResult, + Route, + SparkRuntime, + Termination, +) + + +_GPU_FAMILY = frozenset({SparkRuntime.SPARK_RAPIDS, SparkRuntime.PHOTON, SparkRuntime.AURON}) + + +def detect_spark_runtime( + event_log: Union[str, CspPath], + *, + max_events_scanned: int = 500, +) -> DetectionResult: + """Classify a single-app event log into a routing decision. + + Returns ``DetectionResult`` with: + + * ``route`` = ``PROFILING`` for any decisive non-SPARK classification, + * ``QUALIFICATION`` only after the scanner walked the full log with + no GPU-family signal, + * ``UNKNOWN`` when the event budget was hit first or + ``SparkListenerEnvironmentUpdate`` was never seen. + + ``max_events_scanned`` caps CPU/IO cost; large CPU logs routinely end + as ``UNKNOWN`` at the cap. Raise the cap at the call site to trade + cost for decisiveness. + """ + path = event_log if isinstance(event_log, CspPath) else CspPath(str(event_log)) + source, files = _resolve_event_log_files(path) + + scan = _scan_events_across(files, budget=max_events_scanned) + + # Classify from whatever we accumulated. + runtime: Optional[SparkRuntime] + if scan.env_update_seen: + runtime = _classify_runtime(scan.spark_properties) + else: + runtime = None + + # Apply the asymmetric decision rule. + if runtime in _GPU_FAMILY: + route = Route.PROFILING + reason = f"decisive: classified as {runtime.value}" + elif scan.termination is Termination.EXHAUSTED and scan.env_update_seen: + route = Route.QUALIFICATION + reason = "walked full log, no GPU-family signal" + else: + route = Route.UNKNOWN + reason = ( + "no decisive signal within bounded scan" + if scan.env_update_seen + else "no SparkListenerEnvironmentUpdate reached" + ) + + resolved_path = str(files[0]) if files else source + return DetectionResult( + route=route, + spark_runtime=runtime, + app_id=scan.app_id, + spark_version=scan.spark_version, + event_log_path=resolved_path, + source_path=source, + reason=reason, + ) +``` + +- [ ] **Step 4: Run the tests** + +Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py -v` +Expected: all tests pass. + +- [ ] **Step 5: Sanity-check the package-level re-exports** + +Run: `python -c "from spark_rapids_tools.tools.eventlog_detector import detect_spark_runtime, Route, DetectionResult; print('ok')"` +Expected: prints `ok`. 
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add user_tools/src/spark_rapids_tools/tools/eventlog_detector/detector.py \
+        user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py
+git commit -m "feat(eventlog_detector): top-level detect_spark_runtime entry point"
+```
+
+---
+
+## Task 10: Fixture tests against existing Scala event logs
+
+**Files:**
+
+- Create: `user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector_fixtures.py`
+
+- [ ] **Step 1: Confirm the fixtures exist and are readable**
+
+Run these sanity checks:
+
+```bash
+ls core/src/test/resources/spark-events-profiling/eventlog-gpu-dsv2.zstd
+ls core/src/test/resources/spark-events-profiling/eventlog_dsv2.zstd
+ls core/src/test/resources/spark-events-qualification/eventlog_same_app_id_1.zstd
+```
+
+Expected: all three paths print. If any is missing, the runtime `pytest.skip` guard inside the test below will skip that case with a clear message; note the missing fixture in Section 12 of the spec (open items).
+
+- [ ] **Step 2: Write the fixture tests**
+
+Create `test_detector_fixtures.py`:
+
+```python
+# Copyright (c) 2026, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+"""Anchor tests against fixtures already shipped in the Scala core.
+
+These are not a full parity sweep. They catch regressions on a small
+curated set covering each decisive route.
+"""
+
+from pathlib import Path
+
+import pytest
+
+from spark_rapids_tools.tools.eventlog_detector import detect_spark_runtime
+from spark_rapids_tools.tools.eventlog_detector.types import Route, SparkRuntime
+
+
+REPO_ROOT = Path(__file__).resolve().parents[5]
+CORE_FIXTURES = REPO_ROOT / "core" / "src" / "test" / "resources"
+
+
+@pytest.mark.parametrize(
+    "relative_path,expected_route,expected_runtime",
+    [
+        (
+            "spark-events-profiling/eventlog-gpu-dsv2.zstd",
+            Route.PROFILING,
+            SparkRuntime.SPARK_RAPIDS,
+        ),
+        (
+            "spark-events-profiling/eventlog_dsv2.zstd",
+            Route.QUALIFICATION,
+            SparkRuntime.SPARK,
+        ),
+        (
+            "spark-events-qualification/eventlog_same_app_id_1.zstd",
+            Route.QUALIFICATION,
+            SparkRuntime.SPARK,
+        ),
+    ],
+)
+def test_detector_matches_expected_route_on_scala_fixture(
+    relative_path: str, expected_route: Route, expected_runtime: SparkRuntime
+) -> None:
+    fixture = CORE_FIXTURES / relative_path
+    if not fixture.exists():
+        pytest.skip(f"fixture not available: {fixture}")
+    # Fixtures are small; a generous budget keeps this test decisive.
+    result = detect_spark_runtime(str(fixture), max_events_scanned=5000)
+    assert result.route is expected_route, result.reason
+    assert result.spark_runtime is expected_runtime, result.reason
+```
+
+- [ ] **Step 3: Run the fixture tests**
+
+Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector_fixtures.py -v`
+Expected: all cases pass (or skip with a clear reason if a fixture is missing).
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector_fixtures.py
+git commit -m "test(eventlog_detector): anchor parity tests on Scala fixtures"
+```
+
+---
+
+## Task 11: Run the full test suite and linter
+
+**Files:** none modified.
+
+- [ ] **Step 1: Run every detector test**
+
+Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/ -v`
+Expected: all tests pass (ignore tests that skipped with "fixture not available").
+
+- [ ] **Step 2: Run the full user_tools test suite to catch unintended regressions**
+
+Run: `pytest user_tools/tests/spark_rapids_tools_ut/ -x`
+Expected: same pass/fail baseline as before this branch (nothing that was green turns red).
+
+- [ ] **Step 3: Run the project's linter against the new files**
+
+Run (adjust if the project uses a different runner):
+
+```
+pylint user_tools/src/spark_rapids_tools/tools/eventlog_detector/ \
+    user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/
+```
+
+Expected: no new errors or warnings above the project's existing baseline. If the project pins a specific pylint config in `pyproject.toml` or `.pylintrc`, read that file first and rerun with the matching options. If `pylint` isn't configured here, substitute the tool the repo actually uses (check `pyproject.toml` or `user_tools/dev/` for hooks).
+
+- [ ] **Step 4: Commit any docstring/type-hint fixups surfaced by the linter**
+
+If fixups were needed, stage only the files the linter touched and commit:
+
+```bash
+git add -u
+git commit -m "chore(eventlog_detector): resolve lint findings from full-suite run"
+```
+
+If no changes were required, skip the commit.
+
+---
+
+## Task 12: Update plan doc with the realized inventory
+
+**Files:**
+
+- Modify: `docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md`
+
+- [ ] **Step 1: Replace the placeholder in Section 12 (Open items)**
+
+The spec currently says:
+
+```
+- Parity-test fixture inventory: enumerate every file under `core/src/test/resources/spark-events-*` and record the expected `SparkRuntime` label, derived from existing Scala test expectations. The plan step owns this list.
+```
+
+Replace with the actual list used in Task 10:
+
+```
+- Parity-test fixture inventory (realized):
+  - `spark-events-profiling/eventlog-gpu-dsv2.zstd` → `PROFILING` / `SPARK_RAPIDS`
+  - `spark-events-profiling/eventlog_dsv2.zstd` → `QUALIFICATION` / `SPARK`
+  - `spark-events-qualification/eventlog_same_app_id_1.zstd` → `QUALIFICATION` / `SPARK`
+```
+
+Use Edit with the old text as `old_string`.
+
+- [ ] **Step 2: Commit**
+
+```bash
+git add docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md
+git commit -m "docs(spec): record realized fixture inventory for event log detector"
+```
+
+---
+
+## Self-Review
+
+**Spec coverage:**
+
+- Sections 1–4 (problem, goal, non-goals, consumers) → no code; covered by the plan's Goal/Architecture header.
+- Section 5 (public API) → Tasks 2, 3, 9 (re-exports, types, entry point).
+- Section 6 (input shapes, codecs) → Tasks 6, 7.
+- Section 7.1 (resolver) → Task 7.
+- Section 7.2 (stream opener) → Task 6.
+- Section 7.3 (scanner + termination) → Task 8.
+- Section 7.4 (classifier) → Task 5.
+- Section 7.5 (top-level flow + asymmetric rule) → Task 9.
+- Section 7.6 (markers) → Task 4.
+- Section 8 (classification rules) → Task 5 (classifier) + Task 4 (markers).
+- Section 9 (error model) → Task 3 (types) + all tasks that raise.
+- Section 10 (testing) → every task's test plus Task 10 (fixture anchor) and Task 11 (full-suite run).
+- Section 11 (rollout) → this plan produces one PR.
+- Section 12 open item (`zstandard` RELEASE.md note) → not addressed; treat as a follow-up for the release engineer.
+- Section 12 open item (fixture inventory) → Task 12.
+- Section 13 (evolution) → doc-only, no code.
+- Memory contract (section 7 header) → Task 6 (no `read()`/`readlines()`; streaming only) + Task 8 (no raw-event accumulation).
+- Decision rule (section 2) → Task 9.
+ +**Placeholder scan:** every step shows exact paths, full file contents for new files, and specific commands. No "TODO", "fill in later", "similar to Task N", or bare "add appropriate handling" phrases. + +**Type/signature consistency:** + +- `Route`, `SparkRuntime`, `Termination`, `DetectionResult`, and the exception classes are defined in Task 3 and used verbatim in Tasks 5, 6, 7, 8, 9, 10 — same names and field orders throughout. +- `_resolve_event_log_files` (plural) is defined in Task 7 and called in Task 9. +- `_scan_events` / `_scan_events_across` signatures match between Task 8's implementation and Task 9's caller. +- `_open_event_log_stream` is a context manager in Task 6; Tasks 8 and 9 use it with `with`. +- `_ScanResult` fields (`spark_properties`, `app_id`, `app_name`, `spark_version`, `env_update_seen`, `events_scanned`, `termination`) are consistent between Task 8 and the read-sites in Task 9. + +No gaps found. \ No newline at end of file From c5f445ac6e8d781735ab395ddbc4b0493ec4ac15 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 17:36:21 -0700 Subject: [PATCH 07/29] build(user_tools): add zstandard dep for event log detector Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Sayed Bilal Bari --- user_tools/pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/user_tools/pyproject.toml b/user_tools/pyproject.toml index 8772df3d2..efd7c56aa 100644 --- a/user_tools/pyproject.toml +++ b/user_tools/pyproject.toml @@ -85,6 +85,8 @@ dependencies = [ "scikit-learn==1.7.0", # used for retrieving available memory on the host "psutil==7.0.0", + # used to read zstd-compressed spark event logs + "zstandard==0.25.0", # pyspark for distributed computing "pyspark>=3.5.7,<4.0.0", # Jproperties used to handle Java properties file (added for the Tools API) From be1dedc3f96af5d277136e97da2539eb78dd5c94 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 17:52:52 -0700 Subject: [PATCH 08/29] feat(eventlog_detector): package skeleton and public re-exports Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Sayed Bilal Bari --- .../tools/eventlog_detector/__init__.py | 60 +++++++++++++++++++ .../spark_rapids_tools_ut/tools/__init__.py | 15 +++++ .../tools/eventlog_detector/__init__.py | 15 +++++ 3 files changed, 90 insertions(+) create mode 100644 user_tools/src/spark_rapids_tools/tools/eventlog_detector/__init__.py create mode 100644 user_tools/tests/spark_rapids_tools_ut/tools/__init__.py create mode 100644 user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/__init__.py diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/__init__.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/__init__.py new file mode 100644 index 000000000..023eb042d --- /dev/null +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/__init__.py @@ -0,0 +1,60 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Lightweight event log runtime detector. 
+ +Public entry point: ``detect_spark_runtime``. Returns a ``DetectionResult`` +carrying a routing decision (``QUALIFICATION`` / ``PROFILING`` / ``UNKNOWN``) +and best-effort metadata, by scanning a bounded prefix of a Spark event log. + +See docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md +for the full contract and the Scala sources this mirrors. +""" + +from typing import Any + +__all__ = [ + "DetectionResult", + "EventLogDetectionError", + "EventLogReadError", + "Route", + "SparkRuntime", + "UnsupportedCompressionError", + "UnsupportedInputError", + "detect_spark_runtime", +] + +# Lazy re-exports: the submodules that back these names are added across +# subsequent tasks. Using ``__getattr__`` defers the import until the name +# is actually accessed, which keeps intermediate test suites importable +# while the package is being built out. +_TYPES_NAMES = { + "DetectionResult", + "EventLogDetectionError", + "EventLogReadError", + "Route", + "SparkRuntime", + "UnsupportedCompressionError", + "UnsupportedInputError", +} + + +def __getattr__(name: str) -> Any: + if name == "detect_spark_runtime": + from .detector import detect_spark_runtime as _fn + return _fn + if name in _TYPES_NAMES: + from . import types as _types + return getattr(_types, name) + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/__init__.py b/user_tools/tests/spark_rapids_tools_ut/tools/__init__.py new file mode 100644 index 000000000..51b351e5a --- /dev/null +++ b/user_tools/tests/spark_rapids_tools_ut/tools/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""init file of the tools unit-tests package""" diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/__init__.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/__init__.py new file mode 100644 index 000000000..7088b6bc7 --- /dev/null +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""init file of the eventlog_detector unit-tests package""" From 997ef43aed8b64eef598214b745e1d37da34c1f9 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 17:52:59 -0700 Subject: [PATCH 09/29] feat(eventlog_detector): routing types and exception hierarchy Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Sayed Bilal Bari --- .../tools/eventlog_detector/types.py | 81 ++++++++++++ .../tools/eventlog_detector/test_types.py | 120 ++++++++++++++++++ 2 files changed, 201 insertions(+) create mode 100644 user_tools/src/spark_rapids_tools/tools/eventlog_detector/types.py create mode 100644 user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/types.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/types.py new file mode 100644 index 000000000..e6befad17 --- /dev/null +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/types.py @@ -0,0 +1,81 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Types, enums, and exceptions for the event log runtime detector.""" + +from dataclasses import dataclass +from enum import Enum +from typing import Optional + + +class Route(str, Enum): + """Routing decision returned to the caller.""" + + QUALIFICATION = "QUALIFICATION" + PROFILING = "PROFILING" + UNKNOWN = "UNKNOWN" + + +class SparkRuntime(str, Enum): + """Runtime taxonomy. + + Values match ``org.apache.spark.sql.rapids.tool.util.SparkRuntime`` in the + Scala core so aether's ``JobRun.spark_runtime`` column stays compatible. + """ + + SPARK = "SPARK" + SPARK_RAPIDS = "SPARK_RAPIDS" + PHOTON = "PHOTON" + AURON = "AURON" + + +class Termination(Enum): + """How the scanner stopped. Used internally by the detector flow.""" + + DECISIVE = "DECISIVE" # classification returned non-SPARK + EXHAUSTED = "EXHAUSTED" # walked every file to EOF under the budget + CAP_HIT = "CAP_HIT" # hit max_events_scanned before exhausting files + + +@dataclass(frozen=True) +class DetectionResult: + """Result returned by ``detect_spark_runtime``. + + ``spark_runtime`` is best-effort metadata. ``None`` is valid (e.g., when + ``route`` is ``UNKNOWN`` because env-update was never seen). 
+ """ + + route: Route + spark_runtime: Optional[SparkRuntime] + app_id: Optional[str] + spark_version: Optional[str] + event_log_path: str + source_path: str + reason: str + + +class EventLogDetectionError(Exception): + """Base class for detector errors.""" + + +class UnsupportedInputError(EventLogDetectionError): + """Input shape is outside V1 scope (multi-app dir, wildcard, comma list, ...).""" + + +class UnsupportedCompressionError(EventLogDetectionError): + """File uses a compression codec the V1 detector does not handle.""" + + +class EventLogReadError(EventLogDetectionError): + """Wraps an underlying I/O failure when reading the event log.""" diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py new file mode 100644 index 000000000..e2c52f028 --- /dev/null +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py @@ -0,0 +1,120 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for ``eventlog_detector.types``.""" + +import pytest + +from spark_rapids_tools.tools.eventlog_detector.types import ( + DetectionResult, + EventLogDetectionError, + EventLogReadError, + Route, + SparkRuntime, + Termination, + UnsupportedCompressionError, + UnsupportedInputError, +) + + +class TestRoute: + def test_has_three_values(self): + assert {r.value for r in Route} == {"QUALIFICATION", "PROFILING", "UNKNOWN"} + + def test_is_string_enum(self): + # str subclass means aether can compare against plain strings. + assert Route.PROFILING == "PROFILING" + + +class TestSparkRuntime: + def test_values_match_scala_enum_exactly(self): + # These strings match org.apache.spark.sql.rapids.tool.util.SparkRuntime + # which aether already persists in JobRun.spark_runtime. + assert {r.value for r in SparkRuntime} == { + "SPARK", + "SPARK_RAPIDS", + "PHOTON", + "AURON", + } + + def test_is_string_enum(self): + assert SparkRuntime.SPARK_RAPIDS == "SPARK_RAPIDS" + + +class TestTermination: + def test_has_three_modes(self): + assert {t.name for t in Termination} == {"DECISIVE", "EXHAUSTED", "CAP_HIT"} + + +class TestDetectionResult: + def test_frozen_dataclass(self): + result = DetectionResult( + route=Route.PROFILING, + spark_runtime=SparkRuntime.SPARK_RAPIDS, + app_id="app-1", + spark_version="3.5.1", + event_log_path="/tmp/x", + source_path="/tmp/x", + reason="decisive: classified as SPARK_RAPIDS", + ) + # Python raises FrozenInstanceError (a subclass of AttributeError) + # when you try to assign to a field on a frozen dataclass. 
+ with pytest.raises(AttributeError): + result.route = Route.UNKNOWN # type: ignore[misc] + + def test_structural_equality(self): + kwargs = dict( + route=Route.QUALIFICATION, + spark_runtime=SparkRuntime.SPARK, + app_id="a", + spark_version="3.5.1", + event_log_path="/tmp/a", + source_path="/tmp/a", + reason="walked full log, no GPU-family signal", + ) + assert DetectionResult(**kwargs) == DetectionResult(**kwargs) + assert hash(DetectionResult(**kwargs)) == hash(DetectionResult(**kwargs)) + # Distinct payloads compare unequal. + other = DetectionResult(**{**kwargs, "app_id": "b"}) + assert DetectionResult(**kwargs) != other + + def test_accepts_optional_fields_as_none(self): + result = DetectionResult( + route=Route.UNKNOWN, + spark_runtime=None, + app_id=None, + spark_version=None, + event_log_path="/tmp/x", + source_path="/tmp/x", + reason="no decisive signal within bounded scan", + ) + assert result.route is Route.UNKNOWN + assert result.spark_runtime is None + + +class TestExceptionHierarchy: + def test_all_errors_subclass_base(self): + for cls in ( + UnsupportedInputError, + UnsupportedCompressionError, + EventLogReadError, + ): + assert issubclass(cls, EventLogDetectionError) + + def test_base_is_exception_not_value_error(self): + # We deliberately do NOT inherit ValueError: EventLogReadError wraps + # I/O failures (missing file, permissions), which are not bad-input + # errors. Keeping the base at Exception avoids that semantic mismatch. + assert issubclass(EventLogDetectionError, Exception) + assert not issubclass(EventLogDetectionError, ValueError) From e18661e54643aae124fac366a897483cd1f061c7 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 18:00:22 -0700 Subject: [PATCH 10/29] feat(eventlog_detector): Scala-pinned markers module Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Sayed Bilal Bari --- .../tools/eventlog_detector/markers.py | 89 +++++++++++++++++++ .../tools/eventlog_detector/test_markers.py | 85 ++++++++++++++++++ 2 files changed, 174 insertions(+) create mode 100644 user_tools/src/spark_rapids_tools/tools/eventlog_detector/markers.py create mode 100644 user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_markers.py diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/markers.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/markers.py new file mode 100644 index 000000000..bae35423f --- /dev/null +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/markers.py @@ -0,0 +1,89 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Single source of truth for runtime-detection markers. + +Each constant below is pinned to a specific Scala source location. If +the Scala rule changes, update the constant here and the parity tests +under ``tests/spark_rapids_tools_ut/tools/eventlog_detector/`` will +catch any drift on a re-run. 
+""" + +from typing import Mapping, Tuple + +# --------------------------------------------------------------------------- +# SPARK_RAPIDS (GPU) markers +# Scala source: core/src/main/scala/org/apache/spark/sql/rapids/tool/ToolUtils.scala:114-121 +# --------------------------------------------------------------------------- +GPU_PLUGIN_KEY: str = "spark.plugins" +GPU_PLUGIN_CLASS_SUBSTRING: str = "com.nvidia.spark.SQLPlugin" +GPU_ENABLED_KEY: str = "spark.rapids.sql.enabled" +# Default when GPU_ENABLED_KEY is missing or unparseable as bool. Matches +# Scala ``Try { ... }.getOrElse(true)`` in isPluginEnabled. +GPU_ENABLED_DEFAULT: bool = True + +# --------------------------------------------------------------------------- +# AURON markers +# Scala source: core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/auron/AuronParseHelper.scala:149-172 +# --------------------------------------------------------------------------- +AURON_SPARK_EXTENSIONS_KEY: str = "spark.sql.extensions" +AURON_EXTENSION_REGEX: str = r".*AuronSparkSessionExtension.*" +AURON_ENABLED_KEY: str = "spark.auron.enabled" +AURON_ENABLED_DEFAULT: str = "true" + +# --------------------------------------------------------------------------- +# Databricks precondition (all three keys must be non-empty) +# Scala source: core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/db/DBPlugin.scala:45-58 +# and DatabricksParseHelper.scala:188-190 +# --------------------------------------------------------------------------- +DB_PRECONDITION_KEYS: Tuple[str, str, str] = ( + "spark.databricks.clusterUsageTags.clusterAllTags", + "spark.databricks.clusterUsageTags.clusterId", + "spark.databricks.clusterUsageTags.clusterName", +) + +# --------------------------------------------------------------------------- +# PHOTON markers (any one fullmatches once Databricks precondition holds) +# Scala source: core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/db/DatabricksParseHelper.scala:146-151 +# --------------------------------------------------------------------------- +PHOTON_MARKER_REGEX: Mapping[str, str] = { + "spark.databricks.clusterUsageTags.sparkVersion": r".*-photon-.*", + "spark.databricks.clusterUsageTags.effectiveSparkVersion": r".*-photon-.*", + "spark.databricks.clusterUsageTags.sparkImageLabel": r".*-photon-.*", + "spark.databricks.clusterUsageTags.runtimeEngine": r"PHOTON", +} + +# --------------------------------------------------------------------------- +# Databricks rolling event-log file layout +# Scala source: core/src/main/scala/com/nvidia/spark/rapids/tool/EventLogPathProcessor.scala:57 +# and :458-478 (date parse in getDBEventLogFileDate) +# --------------------------------------------------------------------------- +DB_EVENT_LOG_FILE_PREFIX: str = "eventlog" +# Matches the dated form ``eventlog-YYYY-MM-DD--HH-MM[.codec]`` used by +# ``DatabricksRollingEventLogFilesFileReader``. Bare ``eventlog`` has no +# match and is treated as "latest" (sorted last) by the resolver. 
+DB_EVENT_LOG_DATE_REGEX: str = ( + r"^eventlog-(\d{4})-(\d{2})-(\d{2})--(\d{2})-(\d{2})(?:\.[A-Za-z0-9]+)?$" +) + +# --------------------------------------------------------------------------- +# Supported Spark listener event names +# --------------------------------------------------------------------------- +EVENT_LOG_START: str = "SparkListenerLogStart" +EVENT_APPLICATION_START: str = "SparkListenerApplicationStart" +EVENT_ENVIRONMENT_UPDATE: str = "SparkListenerEnvironmentUpdate" +EVENT_SQL_EXECUTION_START: str = "org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart" +# Spark's actual SQLExecutionStart class name carries a package prefix in +# event logs. The unqualified shortname is sometimes used in test fixtures. +EVENT_SQL_EXECUTION_START_SHORTNAME: str = "SparkListenerSQLExecutionStart" diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_markers.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_markers.py new file mode 100644 index 000000000..3f225df94 --- /dev/null +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_markers.py @@ -0,0 +1,85 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for ``eventlog_detector.markers``.""" + +import re + +from spark_rapids_tools.tools.eventlog_detector import markers as m + + +class TestGpuMarkers: + def test_plugin_substring_exact(self): + assert m.GPU_PLUGIN_CLASS_SUBSTRING == "com.nvidia.spark.SQLPlugin" + + def test_gpu_toggle_key_exact(self): + assert m.GPU_ENABLED_KEY == "spark.rapids.sql.enabled" + + +class TestAuronMarkers: + def test_extension_regex_fullmatches_expected_value(self): + # Mirrors AuronParseHelper.extensionRegxMap. 
+ pat = re.compile(m.AURON_EXTENSION_REGEX) + assert pat.fullmatch("org.apache.spark.sql.AuronSparkSessionExtension") + assert pat.fullmatch("whatever.AuronSparkSessionExtension.more") + assert not pat.fullmatch("org.apache.spark.sql.SomeOtherExtension") + + def test_auron_enabled_defaults_to_true(self): + assert m.AURON_ENABLED_DEFAULT == "true" + assert m.AURON_ENABLED_KEY == "spark.auron.enabled" + assert m.AURON_SPARK_EXTENSIONS_KEY == "spark.sql.extensions" + + +class TestDatabricksPrecondition: + def test_all_three_tag_keys_present(self): + assert m.DB_PRECONDITION_KEYS == ( + "spark.databricks.clusterUsageTags.clusterAllTags", + "spark.databricks.clusterUsageTags.clusterId", + "spark.databricks.clusterUsageTags.clusterName", + ) + + +class TestPhotonMarkers: + def test_marker_map_fullmatches_expected(self): + pats = {k: re.compile(v) for k, v in m.PHOTON_MARKER_REGEX.items()} + assert pats[ + "spark.databricks.clusterUsageTags.sparkVersion" + ].fullmatch("11.3.x-photon-scala2.12") + assert pats[ + "spark.databricks.clusterUsageTags.runtimeEngine" + ].fullmatch("PHOTON") + assert not pats[ + "spark.databricks.clusterUsageTags.runtimeEngine" + ].fullmatch("STANDARD") + + def test_all_four_photon_keys(self): + assert set(m.PHOTON_MARKER_REGEX) == { + "spark.databricks.clusterUsageTags.sparkVersion", + "spark.databricks.clusterUsageTags.effectiveSparkVersion", + "spark.databricks.clusterUsageTags.sparkImageLabel", + "spark.databricks.clusterUsageTags.runtimeEngine", + } + + +class TestDatabricksRollingFileName: + def test_prefix_is_eventlog(self): + assert m.DB_EVENT_LOG_FILE_PREFIX == "eventlog" + + def test_date_pattern_parses_scala_format(self): + pat = re.compile(m.DB_EVENT_LOG_DATE_REGEX) + # Scala's getDBEventLogFileDate splits on '--' and parses + # 'eventlog-YYYY-MM-DD--HH-MM[.codec]'. + assert pat.search("eventlog-2021-06-14--20-00.gz") + assert pat.search("eventlog-2021-06-14--20-00") + assert not pat.search("eventlog") # bare eventlog has no date From f40ad36cae75ce1982e5505e28d8152d717f94fb Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 18:03:41 -0700 Subject: [PATCH 11/29] feat(eventlog_detector): classifier mirroring Scala priority Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Sayed Bilal Bari --- .../tools/eventlog_detector/classifier.py | 85 ++++++++++++ .../eventlog_detector/test_classifier.py | 129 ++++++++++++++++++ 2 files changed, 214 insertions(+) create mode 100644 user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py create mode 100644 user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py new file mode 100644 index 000000000..a67b2f97b --- /dev/null +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py @@ -0,0 +1,85 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Pure-function runtime classifier. + +``_classify_runtime`` maps a merged Spark properties dict to a +``SparkRuntime``. Priority order when multiple markers are present: +``PHOTON > AURON > SPARK_RAPIDS > SPARK``. This is a deterministic Python +choice; Scala's plugin iteration order is undefined when multiple +plugins claim a runtime, but in practice markers do not overlap. +""" + +import re +from typing import Mapping + +from spark_rapids_tools.tools.eventlog_detector import markers as m +from spark_rapids_tools.tools.eventlog_detector.types import SparkRuntime + + +_TRUE_STRINGS = {"true", "1", "yes"} +_FALSE_STRINGS = {"false", "0", "no"} + + +def _parse_bool(raw: str, default: bool) -> bool: + """Mirror Scala's ``Try { s.toBoolean }.getOrElse(default)``.""" + stripped = raw.strip().lower() + if stripped in _TRUE_STRINGS: + return True + if stripped in _FALSE_STRINGS: + return False + return default + + +def _is_spark_rapids(props: Mapping[str, str]) -> bool: + plugins = props.get(m.GPU_PLUGIN_KEY, "") + if m.GPU_PLUGIN_CLASS_SUBSTRING not in plugins: + return False + raw = props.get(m.GPU_ENABLED_KEY) + if raw is None: + return m.GPU_ENABLED_DEFAULT + return _parse_bool(raw, default=m.GPU_ENABLED_DEFAULT) + + +def _is_auron(props: Mapping[str, str]) -> bool: + extensions = props.get(m.AURON_SPARK_EXTENSIONS_KEY) + if extensions is None or not re.fullmatch(m.AURON_EXTENSION_REGEX, extensions): + return False + enabled_raw = props.get(m.AURON_ENABLED_KEY, m.AURON_ENABLED_DEFAULT) + return enabled_raw.strip().lower() == m.AURON_ENABLED_DEFAULT + + +def _is_databricks(props: Mapping[str, str]) -> bool: + return all(props.get(k, "").strip() for k in m.DB_PRECONDITION_KEYS) + + +def _is_photon(props: Mapping[str, str]) -> bool: + if not _is_databricks(props): + return False + for key, pattern in m.PHOTON_MARKER_REGEX.items(): + value = props.get(key) + if value is not None and re.fullmatch(pattern, value): + return True + return False + + +def _classify_runtime(props: Mapping[str, str]) -> SparkRuntime: + # Priority: PHOTON > AURON > SPARK_RAPIDS > SPARK. + if _is_photon(props): + return SparkRuntime.PHOTON + if _is_auron(props): + return SparkRuntime.AURON + if _is_spark_rapids(props): + return SparkRuntime.SPARK_RAPIDS + return SparkRuntime.SPARK diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py new file mode 100644 index 000000000..f3868fe28 --- /dev/null +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py @@ -0,0 +1,129 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Unit tests for ``eventlog_detector.classifier``.""" + +import pytest + +from spark_rapids_tools.tools.eventlog_detector.classifier import _classify_runtime +from spark_rapids_tools.tools.eventlog_detector.types import SparkRuntime + + +class TestEmptyProperties: + def test_empty_props_is_spark(self): + assert _classify_runtime({}) is SparkRuntime.SPARK + + +class TestSparkRapids: + def test_plugin_and_default_enabled(self): + props = {"spark.plugins": "foo,com.nvidia.spark.SQLPlugin,bar"} + assert _classify_runtime(props) is SparkRuntime.SPARK_RAPIDS + + def test_plugin_with_enabled_true(self): + props = { + "spark.plugins": "com.nvidia.spark.SQLPlugin", + "spark.rapids.sql.enabled": "true", + } + assert _classify_runtime(props) is SparkRuntime.SPARK_RAPIDS + + def test_plugin_with_enabled_false_demotes_to_spark(self): + props = { + "spark.plugins": "com.nvidia.spark.SQLPlugin", + "spark.rapids.sql.enabled": "false", + } + assert _classify_runtime(props) is SparkRuntime.SPARK + + def test_enabled_true_without_plugin_is_still_spark(self): + props = {"spark.rapids.sql.enabled": "true"} + assert _classify_runtime(props) is SparkRuntime.SPARK + + def test_unparseable_enabled_defaults_to_true(self): + props = { + "spark.plugins": "com.nvidia.spark.SQLPlugin", + "spark.rapids.sql.enabled": "not-a-bool", + } + assert _classify_runtime(props) is SparkRuntime.SPARK_RAPIDS + + +class TestAuron: + def test_extension_and_default_enabled(self): + props = {"spark.sql.extensions": "com.bytedance.auron.AuronSparkSessionExtension"} + assert _classify_runtime(props) is SparkRuntime.AURON + + def test_extension_and_enabled_false_demotes_to_spark(self): + props = { + "spark.sql.extensions": "com.bytedance.auron.AuronSparkSessionExtension", + "spark.auron.enabled": "FALSE", + } + assert _classify_runtime(props) is SparkRuntime.SPARK + + def test_auron_enabled_case_insensitive(self): + props = { + "spark.sql.extensions": "AuronSparkSessionExtension", + "spark.auron.enabled": " TrUe ", + } + assert _classify_runtime(props) is SparkRuntime.AURON + + +class TestDatabricksPhoton: + @pytest.fixture + def db_precond_props(self): + return { + "spark.databricks.clusterUsageTags.clusterAllTags": "[{...}]", + "spark.databricks.clusterUsageTags.clusterId": "1234", + "spark.databricks.clusterUsageTags.clusterName": "dev-cluster", + } + + def test_precondition_only_is_spark(self, db_precond_props): + assert _classify_runtime(db_precond_props) is SparkRuntime.SPARK + + def test_precondition_plus_photon_version(self, db_precond_props): + props = { + **db_precond_props, + "spark.databricks.clusterUsageTags.sparkVersion": "11.3.x-photon-scala2.12", + } + assert _classify_runtime(props) is SparkRuntime.PHOTON + + def test_precondition_plus_photon_engine(self, db_precond_props): + props = {**db_precond_props, "spark.databricks.clusterUsageTags.runtimeEngine": "PHOTON"} + assert _classify_runtime(props) is SparkRuntime.PHOTON + + def test_photon_marker_without_precondition_is_spark(self): + props = {"spark.databricks.clusterUsageTags.runtimeEngine": "PHOTON"} + assert _classify_runtime(props) is SparkRuntime.SPARK + + def test_photon_engine_other_value_is_spark(self, db_precond_props): + props = {**db_precond_props, "spark.databricks.clusterUsageTags.runtimeEngine": "STANDARD"} + assert _classify_runtime(props) is SparkRuntime.SPARK + + +class TestPriority: + """PHOTON > AURON > SPARK_RAPIDS > SPARK when markers coexist.""" + + def test_photon_beats_spark_rapids(self): + props = { + "spark.plugins": 
"com.nvidia.spark.SQLPlugin", + "spark.databricks.clusterUsageTags.clusterAllTags": "[{...}]", + "spark.databricks.clusterUsageTags.clusterId": "1", + "spark.databricks.clusterUsageTags.clusterName": "c", + "spark.databricks.clusterUsageTags.runtimeEngine": "PHOTON", + } + assert _classify_runtime(props) is SparkRuntime.PHOTON + + def test_auron_beats_spark_rapids(self): + props = { + "spark.plugins": "com.nvidia.spark.SQLPlugin", + "spark.sql.extensions": "AuronSparkSessionExtension", + } + assert _classify_runtime(props) is SparkRuntime.AURON From 265f576f87aa13594c83e301d926b6bdd00ecaaa Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 18:08:03 -0700 Subject: [PATCH 12/29] feat(eventlog_detector): context-managed codec-aware line streamer Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Sayed Bilal Bari --- .../tools/eventlog_detector/stream.py | 110 +++++++++++++++ .../tools/eventlog_detector/test_stream.py | 130 ++++++++++++++++++ 2 files changed, 240 insertions(+) create mode 100644 user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py create mode 100644 user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py new file mode 100644 index 000000000..c78a8b0e7 --- /dev/null +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py @@ -0,0 +1,110 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Codec-aware context-managed line streamer for Spark event logs. + +Opens the file via ``CspPath.open_input_stream()``, applies the right +decompression layer based on extension, wraps in a text decoder, and +yields an ``Iterator[str]``. On exit the context manager closes every +layer in reverse order. Streaming only — no buffering of the full file. + +PyArrow coupling: ``CspPath.open_input_stream()`` delegates to PyArrow's +filesystem API, which auto-detects and decompresses ``.gz`` and ``.zst`` +files transparently. ``.zstd`` is not recognised by PyArrow, so this +module decompresses it manually via ``zstandard``. If a future PyArrow +release changes its codec detection, this suffix mapping must be +re-verified. +""" + +import contextlib +import io +from typing import Iterator + +import zstandard as zstd + +from spark_rapids_tools.storagelib import CspPath +from spark_rapids_tools.tools.eventlog_detector.types import ( + EventLogReadError, + UnsupportedCompressionError, +) + + +# PyArrow's ``open_input_stream()`` transparently decompresses files whose +# extension matches a codec it recognises (including ``.gz`` and ``.zst``). +# For these we can read the already-decompressed byte stream directly. +_PYARROW_AUTO_DECOMP_SUFFIXES = {".gz", ".zst"} +# PyArrow does NOT recognise ``.zstd`` as a codec suffix, so the byte stream +# is raw compressed data that we must decompress ourselves. 
+_ZSTD_MANUAL_SUFFIXES = {".zstd"}
+# Suffixes that unambiguously indicate a codec we do not support in V1.
+_UNSUPPORTED_CODEC_SUFFIXES = {".lz4", ".lzf", ".snappy"}
+
+
+def _classify_suffix(path: CspPath) -> str:
+    name = path.base_name().lower()
+    dot = name.rfind(".")
+    if dot < 0:
+        return ""
+    return name[dot:]
+
+
+@contextlib.contextmanager
+def _open_event_log_stream(path: CspPath) -> Iterator[Iterator[str]]:
+    suffix = _classify_suffix(path)
+    if suffix in _UNSUPPORTED_CODEC_SUFFIXES:
+        raise UnsupportedCompressionError(
+            f"Compression codec '{suffix}' is not supported by the lightweight "
+            "event log detector. Fall back to the full qualification/profiling "
+            "pipeline for this log."
+        )
+
+    try:
+        byte_stream = path.open_input_stream()
+    except Exception as exc:
+        raise EventLogReadError(f"Failed to open event log {path}: {exc}") from exc
+
+    close_stack = contextlib.ExitStack()
+    close_stack.callback(byte_stream.close)
+    try:
+        if suffix in _ZSTD_MANUAL_SUFFIXES:
+            # PyArrow does not recognise ``.zstd``, so the byte stream holds
+            # raw compressed frames — decompress them with the zstandard library.
+            dctx = zstd.ZstdDecompressor()
+            decompressed: io.RawIOBase = dctx.stream_reader(byte_stream)
+            close_stack.callback(decompressed.close)
+        else:
+            # For ``.gz``, ``.zst``, ``.inprogress``, and unknown/plain
+            # suffixes, PyArrow already handles decompression (or there is
+            # nothing to decompress). Pass the byte stream straight through.
+            # If the file is actually compressed with an unknown codec the
+            # scanner will see garbled lines that don't parse as JSON; those
+            # will be skipped and the caller will see Route.UNKNOWN — the
+            # correct failure mode for this lightweight path.
+            decompressed = byte_stream
+
+        text = io.TextIOWrapper(decompressed, encoding="utf-8", errors="replace", newline="")
+        close_stack.callback(text.close)
+
+        def line_iter() -> Iterator[str]:
+            for raw in text:
+                # Strip the trailing newline to match the "one event per line"
+                # contract. Empty lines are legal and skipped by the caller.
+                yield raw.rstrip("\r\n")
+
+        yield line_iter()
+    except Exception as exc:
+        # Convert any setup or read-time I/O error into a typed domain error.
+        raise EventLogReadError(f"Error reading event log {path}: {exc}") from exc
+    finally:
+        close_stack.close()
diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py
new file mode 100644
index 000000000..07cfdfe9c
--- /dev/null
+++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py
@@ -0,0 +1,130 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +"""Unit tests for ``eventlog_detector.stream``.""" + +import gzip +from pathlib import Path + +import pytest +import zstandard as zstd + +from spark_rapids_tools.storagelib import CspPath +from spark_rapids_tools.tools.eventlog_detector.stream import _open_event_log_stream +from spark_rapids_tools.tools.eventlog_detector.types import ( + EventLogReadError, + UnsupportedCompressionError, +) + + +SAMPLE_LINES = [ + '{"Event":"SparkListenerLogStart","Spark Version":"3.5.1"}', + '{"Event":"SparkListenerApplicationStart","App ID":"app-1"}', + '{"Event":"SparkListenerEnvironmentUpdate","Spark Properties":{}}', +] + + +def _write_plain(path: Path) -> None: + path.write_text("\n".join(SAMPLE_LINES) + "\n", encoding="utf-8") + + +def _write_gz(path: Path) -> None: + with gzip.open(path, "wt", encoding="utf-8") as fh: + fh.write("\n".join(SAMPLE_LINES) + "\n") + + +def _write_zstd(path: Path) -> None: + cctx = zstd.ZstdCompressor() + raw = ("\n".join(SAMPLE_LINES) + "\n").encode("utf-8") + path.write_bytes(cctx.compress(raw)) + + +@pytest.fixture +def plain_file(tmp_path: Path) -> CspPath: + p = tmp_path / "eventlog.inprogress" + _write_plain(p) + return CspPath(str(p)) + + +@pytest.fixture +def gz_file(tmp_path: Path) -> CspPath: + p = tmp_path / "eventlog.gz" + _write_gz(p) + return CspPath(str(p)) + + +@pytest.fixture +def zstd_file(tmp_path: Path) -> CspPath: + p = tmp_path / "eventlog.zstd" + _write_zstd(p) + return CspPath(str(p)) + + +class TestPlainStream: + def test_yields_all_lines(self, plain_file): + with _open_event_log_stream(plain_file) as lines: + collected = [ln for ln in lines] + assert collected == SAMPLE_LINES + + +class TestGzipStream: + def test_yields_all_lines(self, gz_file): + with _open_event_log_stream(gz_file) as lines: + collected = [ln for ln in lines] + assert collected == SAMPLE_LINES + + +class TestZstdStream: + def test_yields_all_lines(self, zstd_file): + with _open_event_log_stream(zstd_file) as lines: + collected = [ln for ln in lines] + assert collected == SAMPLE_LINES + + def test_zst_short_suffix_also_works(self, tmp_path): + p = tmp_path / "eventlog.zst" + _write_zstd(p) + with _open_event_log_stream(CspPath(str(p))) as lines: + collected = [ln for ln in lines] + assert collected == SAMPLE_LINES + + +class TestUnsupportedCompression: + def test_lz4_raises(self, tmp_path): + p = tmp_path / "eventlog.lz4" + p.write_bytes(b"not-real-lz4") + with pytest.raises(UnsupportedCompressionError): + with _open_event_log_stream(CspPath(str(p))) as _: + pass + + def test_snappy_raises(self, tmp_path): + p = tmp_path / "eventlog.snappy" + p.write_bytes(b"not-real-snappy") + with pytest.raises(UnsupportedCompressionError): + with _open_event_log_stream(CspPath(str(p))) as _: + pass + + def test_lzf_raises(self, tmp_path): + p = tmp_path / "eventlog.lzf" + p.write_bytes(b"not-real-lzf") + with pytest.raises(UnsupportedCompressionError): + with _open_event_log_stream(CspPath(str(p))) as _: + pass + + +class TestIoFailure: + def test_missing_file_raises_read_error(self, tmp_path): + p = tmp_path / "does-not-exist" + with pytest.raises(EventLogReadError): + with _open_event_log_stream(CspPath(str(p))) as lines: + next(iter(lines)) From 8c4449da1158aab9961102071572cd92b66e0186 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 18:15:15 -0700 Subject: [PATCH 13/29] feat(eventlog_detector): single-file and Databricks rolling-dir resolver Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Sayed Bilal Bari --- .../tools/eventlog_detector/resolver.py | 96 
+++++++++++++++++ .../tools/eventlog_detector/test_resolver.py | 102 ++++++++++++++++++ 2 files changed, 198 insertions(+) create mode 100644 user_tools/src/spark_rapids_tools/tools/eventlog_detector/resolver.py create mode 100644 user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/resolver.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/resolver.py new file mode 100644 index 000000000..f5f448f46 --- /dev/null +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/resolver.py @@ -0,0 +1,96 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Input-path resolution for the event log detector. + +Resolves one user-supplied path into a list of one or more concrete files +to read, in scan order. Supports a single file or a Databricks rolling +directory. Spark-native rolling, generic multi-app directories, wildcards, +and comma lists raise ``UnsupportedInputError`` and are expected to fall +back to the full Scala pipeline. +""" + +import re +from datetime import datetime +from typing import List, Optional, Tuple + +from spark_rapids_tools.storagelib import CspFs, CspPath +from spark_rapids_tools.tools.eventlog_detector import markers as m +from spark_rapids_tools.tools.eventlog_detector.types import UnsupportedInputError + + +_DB_DATE_PATTERN = re.compile(m.DB_EVENT_LOG_DATE_REGEX) + + +def _parse_databricks_file_datetime(name: str) -> Optional[datetime]: + """Parse ``eventlog-YYYY-MM-DD--HH-MM[.codec]`` to a datetime. + + Returns ``None`` for bare ``eventlog`` and any name that does not match + the dated pattern. The caller sorts ``None`` as "latest/current" to + mirror Scala's ``getDBEventLogFileDate`` which defaults to ``now()``. + """ + if not name.startswith(m.DB_EVENT_LOG_FILE_PREFIX): + return None + match = _DB_DATE_PATTERN.match(name) + if match is None: + return None + year, month, day, hour, minute = (int(g) for g in match.groups()) + return datetime(year, month, day, hour, minute) + + +def _is_databricks_event_log_filename(name: str) -> bool: + return name.startswith(m.DB_EVENT_LOG_FILE_PREFIX) + + +def _resolve_event_log_files(path: CspPath) -> Tuple[str, List[CspPath]]: + """Resolve ``path`` to an ordered list of files to scan. + + Returns ``(source, files)`` where ``source`` is the original input + rendered as a string (preserved for the ``DetectionResult``) and + ``files`` is the scan order. + """ + source = path.no_scheme + + if path.is_file(): + return source, [path] + + if not path.is_dir(): + raise UnsupportedInputError( + f"Path is neither a file nor a directory: {source}" + ) + + # Directory: must be a Databricks rolling dir. Spark-native rolling + # (eventlog_v2_*) and generic multi-app directories are out of scope. 
+ children = CspFs.list_all_files(path) + db_files = [c for c in children if _is_databricks_event_log_filename(c.base_name())] + if not db_files: + raise UnsupportedInputError( + f"Directory {source} is not a supported input shape. The detector " + "handles single files or Databricks rolling directories only; fall " + "back to the full pipeline for Spark-native rolling, multi-app " + "directories, wildcards, or comma-separated inputs." + ) + + # Sort mirroring DatabricksRollingEventLogFilesFileReader: dated files + # ascending by parsed datetime, bare `eventlog` last (treated as + # "latest/current"). Stable sort on filename first to keep ordering + # deterministic among equal-date files (extremely unlikely in practice + # but cheap insurance for tests). + db_files.sort(key=lambda f: f.base_name()) + db_files.sort( + key=lambda f: ( + _parse_databricks_file_datetime(f.base_name()) or datetime.max, + ) + ) + return source, db_files diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py new file mode 100644 index 000000000..5cd0b770c --- /dev/null +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py @@ -0,0 +1,102 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for ``eventlog_detector.resolver``.""" + +from datetime import datetime +from pathlib import Path + +import pytest + +from spark_rapids_tools.storagelib import CspPath +from spark_rapids_tools.tools.eventlog_detector.resolver import ( + _parse_databricks_file_datetime, + _resolve_event_log_files, +) +from spark_rapids_tools.tools.eventlog_detector.types import UnsupportedInputError + + +class TestDatabricksDateParse: + def test_bare_eventlog_is_latest_sentinel(self): + # Returns None; caller treats None as "sort last". 
+ assert _parse_databricks_file_datetime("eventlog") is None + + def test_dated_file_parses(self): + dt = _parse_databricks_file_datetime("eventlog-2021-06-14--20-00.gz") + assert dt == datetime(2021, 6, 14, 20, 0) + + def test_dated_file_without_codec_parses(self): + dt = _parse_databricks_file_datetime("eventlog-2022-01-02--03-04") + assert dt == datetime(2022, 1, 2, 3, 4) + + def test_non_eventlog_prefix_returns_none(self): + assert _parse_databricks_file_datetime("application_1234.log") is None + + +class TestResolveSingleFile: + def test_single_file_returns_single_element_list(self, tmp_path: Path): + f = tmp_path / "eventlog.zstd" + f.write_bytes(b"x") + source, files = _resolve_event_log_files(CspPath(str(f))) + assert source == str(f) + assert [p.base_name() for p in files] == ["eventlog.zstd"] + + +class TestResolveDatabricksRollingDir: + def test_orders_earliest_first_and_bare_eventlog_last(self, tmp_path: Path): + d = tmp_path / "dbrolling" + d.mkdir() + (d / "eventlog").write_bytes(b"") + (d / "eventlog-2021-06-14--20-00.gz").write_bytes(b"") + (d / "eventlog-2021-06-14--18-00.gz").write_bytes(b"") + source, files = _resolve_event_log_files(CspPath(str(d))) + assert source == str(d) + names = [p.base_name() for p in files] + # Earliest dated file first; bare `eventlog` sorts last (treated as + # "current/latest" per Scala). + assert names == [ + "eventlog-2021-06-14--18-00.gz", + "eventlog-2021-06-14--20-00.gz", + "eventlog", + ] + + def test_dir_with_no_eventlog_prefix_raises(self, tmp_path: Path): + d = tmp_path / "empty" + d.mkdir() + (d / "application_1.log").write_bytes(b"") + with pytest.raises(UnsupportedInputError): + _resolve_event_log_files(CspPath(str(d))) + + def test_empty_dir_raises(self, tmp_path: Path): + d = tmp_path / "blank" + d.mkdir() + with pytest.raises(UnsupportedInputError): + _resolve_event_log_files(CspPath(str(d))) + + +class TestResolveUnsupportedShapes: + def test_spark_native_rolling_dir_raises(self, tmp_path: Path): + d = tmp_path / "eventlog_v2_local-1623876083964" + d.mkdir() + (d / "events_1_local-1623876083964").write_bytes(b"") + with pytest.raises(UnsupportedInputError): + _resolve_event_log_files(CspPath(str(d))) + + def test_generic_multi_app_dir_raises(self, tmp_path: Path): + d = tmp_path / "multi" + d.mkdir() + (d / "app-1.zstd").write_bytes(b"") + (d / "app-2.zstd").write_bytes(b"") + with pytest.raises(UnsupportedInputError): + _resolve_event_log_files(CspPath(str(d))) From 5d49e103c213486816e516e0201cc43b24354896 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 18:19:50 -0700 Subject: [PATCH 14/29] feat(eventlog_detector): bounded streaming scanner across files Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Sayed Bilal Bari --- .../tools/eventlog_detector/scanner.py | 128 +++++++++++ .../tools/eventlog_detector/test_scanner.py | 198 ++++++++++++++++++ 2 files changed, 326 insertions(+) create mode 100644 user_tools/src/spark_rapids_tools/tools/eventlog_detector/scanner.py create mode 100644 user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_scanner.py diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/scanner.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/scanner.py new file mode 100644 index 000000000..c695cea02 --- /dev/null +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/scanner.py @@ -0,0 +1,128 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Bounded streaming event scanner. + +Walks JSON-per-line event logs under a shared event budget, merges +properties from startup and per-SQL events into a single mutable dict, +and reports early-stop/exhausted/cap-hit termination. Strictly streaming: +no slurping, no accumulating raw events. +""" + +import json +from dataclasses import dataclass, field +from typing import Dict, Iterable, List, Optional + +from spark_rapids_tools.storagelib import CspPath +from spark_rapids_tools.tools.eventlog_detector import markers as m +from spark_rapids_tools.tools.eventlog_detector.classifier import _classify_runtime +from spark_rapids_tools.tools.eventlog_detector.stream import _open_event_log_stream +from spark_rapids_tools.tools.eventlog_detector.types import SparkRuntime, Termination + + +@dataclass +class _ScanResult: + spark_properties: Dict[str, str] = field(default_factory=dict) + app_id: Optional[str] = None + app_name: Optional[str] = None + spark_version: Optional[str] = None + env_update_seen: bool = False + events_scanned: int = 0 + termination: Termination = Termination.EXHAUSTED + + +def _scan_events( + lines: Iterable[str], + *, + budget: int, + state: Optional[_ScanResult] = None, +) -> _ScanResult: + """Scan one stream of lines, optionally continuing from a prior state. + + Returns the updated ``_ScanResult``. Terminates as soon as classification + turns non-SPARK (``DECISIVE``), or when the budget is exhausted + (``CAP_HIT``), or when ``lines`` is fully consumed (``EXHAUSTED``). + """ + result = state if state is not None else _ScanResult() + + for raw in lines: + if result.events_scanned >= budget: + result.termination = Termination.CAP_HIT + return result + + if not raw: + continue + + try: + event = json.loads(raw) + except (json.JSONDecodeError, ValueError): + # Malformed JSON line (Spark tolerates trailing partials). Count + # it against the budget so a pathological log can't stall us. 
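+            # A truncated final event in an ``.inprogress`` log lands here too.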
+ result.events_scanned += 1 + continue + + result.events_scanned += 1 + name = event.get("Event") + if name == m.EVENT_LOG_START: + version = event.get("Spark Version") + if isinstance(version, str): + result.spark_version = version + elif name == m.EVENT_APPLICATION_START: + app_id = event.get("App ID") + app_name = event.get("App Name") + if isinstance(app_id, str): + result.app_id = app_id + if isinstance(app_name, str): + result.app_name = app_name + elif name == m.EVENT_ENVIRONMENT_UPDATE: + props = event.get("Spark Properties") or {} + if isinstance(props, dict): + for k, v in props.items(): + if isinstance(k, str) and isinstance(v, str): + result.spark_properties[k] = v + result.env_update_seen = True + if _classify_runtime(result.spark_properties) is not SparkRuntime.SPARK: + result.termination = Termination.DECISIVE + return result + elif name in (m.EVENT_SQL_EXECUTION_START, m.EVENT_SQL_EXECUTION_START_SHORTNAME): + modified = event.get("modifiedConfigs") or {} + if isinstance(modified, dict) and modified: + for k, v in modified.items(): + if isinstance(k, str) and isinstance(v, str): + result.spark_properties[k] = v + if result.env_update_seen and ( + _classify_runtime(result.spark_properties) is not SparkRuntime.SPARK + ): + result.termination = Termination.DECISIVE + return result + + # Fully consumed without early-stop or budget exhaustion. + result.termination = Termination.EXHAUSTED + return result + + +def _scan_events_across(files: List[CspPath], *, budget: int) -> _ScanResult: + """Walk ``files`` in order under a single shared ``budget``.""" + state = _ScanResult() + for path in files: + if state.events_scanned >= budget: + state.termination = Termination.CAP_HIT + return state + with _open_event_log_stream(path) as lines: + state = _scan_events(lines, budget=budget, state=state) + if state.termination in (Termination.DECISIVE, Termination.CAP_HIT): + return state + # All files consumed. + state.termination = Termination.EXHAUSTED + return state diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_scanner.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_scanner.py new file mode 100644 index 000000000..5658f17b6 --- /dev/null +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_scanner.py @@ -0,0 +1,198 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Unit tests for ``eventlog_detector.scanner``.""" + +from pathlib import Path +from typing import List + +import pytest + +from spark_rapids_tools.storagelib import CspPath +from spark_rapids_tools.tools.eventlog_detector.scanner import ( + _scan_events, + _scan_events_across, +) +from spark_rapids_tools.tools.eventlog_detector.types import ( + SparkRuntime, + Termination, +) + + +# ---------- Line builders ---------- + +def env_update(props: dict) -> str: + import json + + return json.dumps( + { + "Event": "SparkListenerEnvironmentUpdate", + "Spark Properties": props, + "System Properties": {}, + "Classpath Entries": {}, + "JVM Information": {}, + } + ) + + +def log_start(version: str = "3.5.1") -> str: + import json + + return json.dumps({"Event": "SparkListenerLogStart", "Spark Version": version}) + + +def app_start(app_id: str = "app-1", app_name: str = "App") -> str: + import json + + return json.dumps( + { + "Event": "SparkListenerApplicationStart", + "App ID": app_id, + "App Name": app_name, + } + ) + + +def sql_exec_start(modified_configs: dict) -> str: + import json + + return json.dumps( + { + "Event": "org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart", + "executionId": 0, + "description": "x", + "details": "", + "physicalPlanDescription": "", + "sparkPlanInfo": {}, + "time": 0, + "modifiedConfigs": modified_configs, + } + ) + + +# ---------- Tests for _scan_events (single stream) ---------- + +class TestScanEvents: + def test_env_update_with_gpu_is_decisive(self): + lines = iter( + [ + log_start(), + app_start(), + env_update({"spark.plugins": "com.nvidia.spark.SQLPlugin"}), + ] + ) + result = _scan_events(lines, budget=100) + assert result.env_update_seen is True + assert result.app_id == "app-1" + assert result.spark_version == "3.5.1" + assert result.termination is Termination.DECISIVE + + def test_env_update_cpu_then_sql_start_flips_to_gpu(self): + lines = iter( + [ + log_start(), + app_start(), + env_update({"spark.master": "local"}), + sql_exec_start({"spark.plugins": "com.nvidia.spark.SQLPlugin"}), + ] + ) + result = _scan_events(lines, budget=100) + assert result.termination is Termination.DECISIVE + + def test_cpu_only_to_eof_is_exhausted(self): + lines = iter([log_start(), app_start(), env_update({"spark.master": "local"})]) + result = _scan_events(lines, budget=100) + assert result.env_update_seen is True + assert result.termination is Termination.EXHAUSTED + + def test_no_env_update_within_budget_is_cap_hit(self): + # Budget less than the number of events, none of them env-update. + lines = iter([log_start()] * 5) + result = _scan_events(lines, budget=2) + assert result.env_update_seen is False + assert result.termination is Termination.CAP_HIT + + def test_no_env_update_to_eof_is_exhausted_without_env(self): + lines = iter([log_start(), app_start()]) + result = _scan_events(lines, budget=100) + assert result.env_update_seen is False + assert result.termination is Termination.EXHAUSTED + + def test_malformed_json_lines_are_skipped(self): + lines = iter( + [ + "not-json-at-all", + log_start(), + "", + app_start(), + env_update({"spark.master": "local"}), + ] + ) + result = _scan_events(lines, budget=100) + assert result.env_update_seen is True + assert result.app_id == "app-1" + + def test_later_sql_start_overwrites_earlier_property(self): + # last-write-wins merge, matching CacheablePropsHandler.mergeModifiedConfigs. 
+ lines = iter( + [ + env_update({"spark.rapids.sql.enabled": "false", "spark.plugins": "com.nvidia.spark.SQLPlugin"}), + sql_exec_start({"spark.rapids.sql.enabled": "true"}), + ] + ) + result = _scan_events(lines, budget=100) + assert result.termination is Termination.DECISIVE + # Final accumulated props reflect the merge. + assert result.spark_properties["spark.rapids.sql.enabled"] == "true" + + +# ---------- Tests for _scan_events_across (multi-file) ---------- + +def _write(path: Path, lines: List[str]) -> CspPath: + path.write_text("\n".join(lines) + "\n", encoding="utf-8") + return CspPath(str(path)) + + +class TestScanEventsAcross: + def test_gpu_signal_in_second_file_is_decisive(self, tmp_path): + f1 = _write( + tmp_path / "eventlog-2021-06-14--18-00", + [log_start(), app_start(), env_update({"spark.master": "local"})], + ) + f2 = _write( + tmp_path / "eventlog-2021-06-14--20-00", + [sql_exec_start({"spark.plugins": "com.nvidia.spark.SQLPlugin"})], + ) + result = _scan_events_across([f1, f2], budget=100) + assert result.termination is Termination.DECISIVE + + def test_shared_budget_applied_across_files(self, tmp_path): + # 3 events in first file, 3 in second. Budget = 4. Second file stops + # after one event, before any GPU signal. + f1 = _write(tmp_path / "a", [log_start(), app_start(), env_update({"spark.master": "local"})]) + f2 = _write( + tmp_path / "b", + [ + sql_exec_start({"spark.master": "still-cpu"}), + sql_exec_start({"spark.plugins": "com.nvidia.spark.SQLPlugin"}), + sql_exec_start({"x": "y"}), + ], + ) + result = _scan_events_across([f1, f2], budget=4) + assert result.termination is Termination.CAP_HIT + + def test_all_files_exhausted_returns_exhausted(self, tmp_path): + f1 = _write(tmp_path / "a", [env_update({"spark.master": "local"})]) + result = _scan_events_across([f1], budget=100) + assert result.termination is Termination.EXHAUSTED From 60dc011d72731a104f073e40971510d3d28b66fc Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 18:23:51 -0700 Subject: [PATCH 15/29] feat(eventlog_detector): top-level detect_spark_runtime entry point Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Sayed Bilal Bari --- .../tools/eventlog_detector/detector.py | 96 +++++++++++ .../tools/eventlog_detector/test_detector.py | 162 ++++++++++++++++++ 2 files changed, 258 insertions(+) create mode 100644 user_tools/src/spark_rapids_tools/tools/eventlog_detector/detector.py create mode 100644 user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/detector.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/detector.py new file mode 100644 index 000000000..e1b9fd91f --- /dev/null +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/detector.py @@ -0,0 +1,96 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Top-level event log runtime detector. 
+ +``detect_spark_runtime(path)`` returns a ``DetectionResult`` carrying the +routing decision and best-effort metadata. On inconclusive input, returns +``Route.UNKNOWN`` rather than raising; callers fall back to the full +Scala pipeline in that case. +""" + +from typing import Optional, Union + +from spark_rapids_tools.storagelib import CspPath +from spark_rapids_tools.tools.eventlog_detector.classifier import _classify_runtime +from spark_rapids_tools.tools.eventlog_detector.resolver import _resolve_event_log_files +from spark_rapids_tools.tools.eventlog_detector.scanner import _scan_events_across +from spark_rapids_tools.tools.eventlog_detector.types import ( + DetectionResult, + Route, + SparkRuntime, + Termination, +) + + +_GPU_FAMILY = frozenset({SparkRuntime.SPARK_RAPIDS, SparkRuntime.PHOTON, SparkRuntime.AURON}) + + +def detect_spark_runtime( + event_log: Union[str, CspPath], + *, + max_events_scanned: int = 500, +) -> DetectionResult: + """Classify a single-app event log into a routing decision. + + Returns ``DetectionResult`` with: + + * ``route`` = ``PROFILING`` for any decisive non-SPARK classification, + * ``QUALIFICATION`` only after the scanner walked the full log with + no GPU-family signal, + * ``UNKNOWN`` when the event budget was hit first or + ``SparkListenerEnvironmentUpdate`` was never seen. + + ``max_events_scanned`` caps CPU/IO cost; large CPU logs routinely end + as ``UNKNOWN`` at the cap. Raise the cap at the call site to trade + cost for decisiveness. + """ + # Preserve the original user-supplied string in the result's source_path + # so callers see their input back unchanged (including cloud URI schemes). + source_path = event_log if isinstance(event_log, str) else str(event_log) + path = event_log if isinstance(event_log, CspPath) else CspPath(str(event_log)) + _, files = _resolve_event_log_files(path) + + scan = _scan_events_across(files, budget=max_events_scanned) + + runtime: Optional[SparkRuntime] + if scan.env_update_seen: + runtime = _classify_runtime(scan.spark_properties) + else: + runtime = None + + if runtime in _GPU_FAMILY: + route = Route.PROFILING + reason = f"decisive: classified as {runtime.value}" + elif scan.termination is Termination.EXHAUSTED and scan.env_update_seen: + route = Route.QUALIFICATION + reason = "walked full log, no GPU-family signal" + else: + route = Route.UNKNOWN + reason = ( + "no decisive signal within bounded scan" + if scan.env_update_seen + else "no SparkListenerEnvironmentUpdate reached" + ) + + resolved_path = str(files[0]) if files else source_path + return DetectionResult( + route=route, + spark_runtime=runtime, + app_id=scan.app_id, + spark_version=scan.spark_version, + event_log_path=resolved_path, + source_path=source_path, + reason=reason, + ) diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py new file mode 100644 index 000000000..77c16998d --- /dev/null +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py @@ -0,0 +1,162 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Integration tests for ``eventlog_detector.detect_spark_runtime``.""" + +import json +from pathlib import Path + +import pytest + +from spark_rapids_tools.storagelib import CspPath +from spark_rapids_tools.tools.eventlog_detector import detect_spark_runtime +from spark_rapids_tools.tools.eventlog_detector.types import ( + Route, + SparkRuntime, + UnsupportedInputError, +) + + +def env_update(props: dict) -> dict: + return { + "Event": "SparkListenerEnvironmentUpdate", + "Spark Properties": props, + "System Properties": {}, + "Classpath Entries": {}, + "JVM Information": {}, + } + + +def _write_plain_log(path: Path, events: list) -> None: + path.write_text( + "\n".join(json.dumps(e) for e in events) + "\n", encoding="utf-8" + ) + + +class TestAcceptsStringPath: + def test_str_input_resolves(self, tmp_path): + log = tmp_path / "eventlog" + _write_plain_log( + log, + [ + {"Event": "SparkListenerLogStart", "Spark Version": "3.5.1"}, + {"Event": "SparkListenerApplicationStart", "App ID": "a", "App Name": "A"}, + env_update({"spark.master": "local"}), + ], + ) + result = detect_spark_runtime(str(log)) + assert result.route is Route.QUALIFICATION + assert result.spark_runtime is SparkRuntime.SPARK + + +class TestGpuLog: + def test_env_update_with_plugin_classifies_as_profiling(self, tmp_path): + log = tmp_path / "eventlog" + _write_plain_log( + log, + [ + {"Event": "SparkListenerLogStart", "Spark Version": "3.5.1"}, + {"Event": "SparkListenerApplicationStart", "App ID": "g", "App Name": "G"}, + env_update({"spark.plugins": "com.nvidia.spark.SQLPlugin"}), + ], + ) + result = detect_spark_runtime(CspPath(str(log))) + assert result.route is Route.PROFILING + assert result.spark_runtime is SparkRuntime.SPARK_RAPIDS + assert result.app_id == "g" + assert result.spark_version == "3.5.1" + + +class TestCapHit: + def test_no_env_update_before_cap_is_unknown(self, tmp_path): + log = tmp_path / "eventlog" + # Many LogStart events, no env-update. Cap hits first. 
+ _write_plain_log( + log, + [{"Event": "SparkListenerLogStart", "Spark Version": "3.5.1"}] * 10, + ) + result = detect_spark_runtime(str(log), max_events_scanned=5) + assert result.route is Route.UNKNOWN + assert result.spark_runtime is None + assert "no decisive signal" in result.reason.lower() or "no sparklistenerenvironmentupdate" in result.reason.lower() + + +class TestDatabricksRolling: + def test_gpu_in_later_rolled_file(self, tmp_path): + d = tmp_path / "dbrolling" + d.mkdir() + _write_plain_log( + d / "eventlog-2021-06-14--18-00", + [ + {"Event": "SparkListenerLogStart", "Spark Version": "3.5.1"}, + {"Event": "SparkListenerApplicationStart", "App ID": "d", "App Name": "D"}, + env_update({"spark.master": "local"}), + ], + ) + _write_plain_log( + d / "eventlog", + [ + { + "Event": "org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart", + "executionId": 0, + "description": "", + "details": "", + "physicalPlanDescription": "", + "sparkPlanInfo": {}, + "time": 0, + "modifiedConfigs": {"spark.plugins": "com.nvidia.spark.SQLPlugin"}, + } + ], + ) + result = detect_spark_runtime(CspPath(str(d))) + assert result.route is Route.PROFILING + assert result.spark_runtime is SparkRuntime.SPARK_RAPIDS + + +class TestUnsupportedInput: + def test_spark_native_rolling_dir_raises(self, tmp_path): + d = tmp_path / "eventlog_v2_local-1623876083964" + d.mkdir() + (d / "events_1_local-1623876083964").write_bytes(b"") + with pytest.raises(UnsupportedInputError): + detect_spark_runtime(CspPath(str(d))) + + +class TestReasonStrings: + def test_reason_mentions_runtime_on_profiling(self, tmp_path): + log = tmp_path / "eventlog" + _write_plain_log( + log, + [ + env_update({"spark.plugins": "com.nvidia.spark.SQLPlugin"}), + ], + ) + result = detect_spark_runtime(str(log)) + assert "SPARK_RAPIDS" in result.reason + + def test_reason_mentions_full_log_on_qualification(self, tmp_path): + log = tmp_path / "eventlog" + _write_plain_log(log, [env_update({"spark.master": "local"})]) + result = detect_spark_runtime(str(log)) + assert result.route is Route.QUALIFICATION + assert "walked full log" in result.reason.lower() + + +class TestSourcePathPreserved: + def test_source_path_equals_input_string(self, tmp_path): + log = tmp_path / "eventlog" + _write_plain_log(log, [env_update({"spark.master": "local"})]) + input_str = str(log) + result = detect_spark_runtime(input_str) + assert result.source_path == input_str From 5f3270e1ccef95ced9c4f0ea973b15e06e9c074b Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 18:27:00 -0700 Subject: [PATCH 16/29] test(eventlog_detector): anchor parity tests on Scala fixtures Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Sayed Bilal Bari --- .../test_detector_fixtures.py | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector_fixtures.py diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector_fixtures.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector_fixtures.py new file mode 100644 index 000000000..c598bb54b --- /dev/null +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector_fixtures.py @@ -0,0 +1,62 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Anchor tests against fixtures already shipped in the Scala core. + +These are not a full parity sweep. They catch regressions on a small +curated set covering each decisive route. +""" + +from pathlib import Path + +import pytest + +from spark_rapids_tools.tools.eventlog_detector import detect_spark_runtime +from spark_rapids_tools.tools.eventlog_detector.types import Route, SparkRuntime + + +REPO_ROOT = Path(__file__).resolve().parents[5] +CORE_FIXTURES = REPO_ROOT / "core" / "src" / "test" / "resources" + + +@pytest.mark.parametrize( + "relative_path,expected_route,expected_runtime", + [ + ( + "spark-events-profiling/eventlog-gpu-dsv2.zstd", + Route.PROFILING, + SparkRuntime.SPARK_RAPIDS, + ), + ( + "spark-events-profiling/eventlog_dsv2.zstd", + Route.QUALIFICATION, + SparkRuntime.SPARK, + ), + ( + "spark-events-qualification/eventlog_same_app_id_1.zstd", + Route.QUALIFICATION, + SparkRuntime.SPARK, + ), + ], +) +def test_detector_matches_expected_route_on_scala_fixture( + relative_path: str, expected_route: Route, expected_runtime: SparkRuntime +) -> None: + fixture = CORE_FIXTURES / relative_path + if not fixture.exists(): + pytest.skip(f"fixture not available: {fixture}") + # Fixtures are ~small; a generous budget keeps this test decisive. + result = detect_spark_runtime(str(fixture), max_events_scanned=5000) + assert result.route is expected_route, result.reason + assert result.spark_runtime is expected_runtime, result.reason From 8c32cf66bd6ac81256f15ae9d0e3678edc0f8950 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 18:39:20 -0700 Subject: [PATCH 17/29] chore(eventlog_detector): resolve lint findings from full-suite run Replace lazy __getattr__ re-exports with direct imports in __init__.py (fixes E0603 undefined-all-variable and downstream E0611 in tests). Remove unused imports, add class docstrings, move inline json imports to module level, replace list comprehensions with list(), use dict literals, add pylint: disable=too-few-public-methods for single-test-method classes. Pylint score: 10.00/10. Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Sayed Bilal Bari --- .../tools/eventlog_detector/__init__.py | 35 ++++++------------- .../eventlog_detector/test_classifier.py | 9 +++++ .../tools/eventlog_detector/test_detector.py | 18 +++++++++- .../tools/eventlog_detector/test_markers.py | 11 ++++++ .../tools/eventlog_detector/test_resolver.py | 9 +++++ .../tools/eventlog_detector/test_scanner.py | 20 ++++------- .../tools/eventlog_detector/test_stream.py | 25 +++++++++---- .../tools/eventlog_detector/test_types.py | 29 ++++++++++----- 8 files changed, 100 insertions(+), 56 deletions(-) diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/__init__.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/__init__.py index 023eb042d..619f03bd3 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/__init__.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/__init__.py @@ -22,7 +22,16 @@ for the full contract and the Scala sources this mirrors. 
""" -from typing import Any +from .detector import detect_spark_runtime +from .types import ( + DetectionResult, + EventLogDetectionError, + EventLogReadError, + Route, + SparkRuntime, + UnsupportedCompressionError, + UnsupportedInputError, +) __all__ = [ "DetectionResult", @@ -34,27 +43,3 @@ "UnsupportedInputError", "detect_spark_runtime", ] - -# Lazy re-exports: the submodules that back these names are added across -# subsequent tasks. Using ``__getattr__`` defers the import until the name -# is actually accessed, which keeps intermediate test suites importable -# while the package is being built out. -_TYPES_NAMES = { - "DetectionResult", - "EventLogDetectionError", - "EventLogReadError", - "Route", - "SparkRuntime", - "UnsupportedCompressionError", - "UnsupportedInputError", -} - - -def __getattr__(name: str) -> Any: - if name == "detect_spark_runtime": - from .detector import detect_spark_runtime as _fn - return _fn - if name in _TYPES_NAMES: - from . import types as _types - return getattr(_types, name) - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py index f3868fe28..dd571fd2e 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py @@ -13,6 +13,7 @@ # limitations under the License. """Unit tests for ``eventlog_detector.classifier``.""" +# pylint: disable=too-few-public-methods # test classes naturally have few methods import pytest @@ -21,11 +22,15 @@ class TestEmptyProperties: + """Test classification with an empty properties dict.""" + def test_empty_props_is_spark(self): assert _classify_runtime({}) is SparkRuntime.SPARK class TestSparkRapids: + """Test SPARK_RAPIDS classification logic.""" + def test_plugin_and_default_enabled(self): props = {"spark.plugins": "foo,com.nvidia.spark.SQLPlugin,bar"} assert _classify_runtime(props) is SparkRuntime.SPARK_RAPIDS @@ -57,6 +62,8 @@ def test_unparseable_enabled_defaults_to_true(self): class TestAuron: + """Test AURON classification logic.""" + def test_extension_and_default_enabled(self): props = {"spark.sql.extensions": "com.bytedance.auron.AuronSparkSessionExtension"} assert _classify_runtime(props) is SparkRuntime.AURON @@ -77,6 +84,8 @@ def test_auron_enabled_case_insensitive(self): class TestDatabricksPhoton: + """Test Databricks PHOTON classification logic.""" + @pytest.fixture def db_precond_props(self): return { diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py index 77c16998d..36f3dadca 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py @@ -13,6 +13,7 @@ # limitations under the License. 
"""Integration tests for ``eventlog_detector.detect_spark_runtime``.""" +# pylint: disable=too-few-public-methods # test classes naturally have few methods import json from pathlib import Path @@ -45,6 +46,8 @@ def _write_plain_log(path: Path, events: list) -> None: class TestAcceptsStringPath: + """Test that detect_spark_runtime accepts plain string paths.""" + def test_str_input_resolves(self, tmp_path): log = tmp_path / "eventlog" _write_plain_log( @@ -61,6 +64,8 @@ def test_str_input_resolves(self, tmp_path): class TestGpuLog: + """Test detection on GPU event logs.""" + def test_env_update_with_plugin_classifies_as_profiling(self, tmp_path): log = tmp_path / "eventlog" _write_plain_log( @@ -79,6 +84,8 @@ def test_env_update_with_plugin_classifies_as_profiling(self, tmp_path): class TestCapHit: + """Test detection when the event budget is exhausted before env-update.""" + def test_no_env_update_before_cap_is_unknown(self, tmp_path): log = tmp_path / "eventlog" # Many LogStart events, no env-update. Cap hits first. @@ -89,10 +96,13 @@ def test_no_env_update_before_cap_is_unknown(self, tmp_path): result = detect_spark_runtime(str(log), max_events_scanned=5) assert result.route is Route.UNKNOWN assert result.spark_runtime is None - assert "no decisive signal" in result.reason.lower() or "no sparklistenerenvironmentupdate" in result.reason.lower() + reason = result.reason.lower() + assert "no decisive signal" in reason or "no sparklistenerenvironmentupdate" in reason class TestDatabricksRolling: + """Test detection on Databricks rolling event log directories.""" + def test_gpu_in_later_rolled_file(self, tmp_path): d = tmp_path / "dbrolling" d.mkdir() @@ -125,6 +135,8 @@ def test_gpu_in_later_rolled_file(self, tmp_path): class TestUnsupportedInput: + """Test that unsupported input shapes raise the expected error.""" + def test_spark_native_rolling_dir_raises(self, tmp_path): d = tmp_path / "eventlog_v2_local-1623876083964" d.mkdir() @@ -134,6 +146,8 @@ def test_spark_native_rolling_dir_raises(self, tmp_path): class TestReasonStrings: + """Test the human-readable reason field on DetectionResult.""" + def test_reason_mentions_runtime_on_profiling(self, tmp_path): log = tmp_path / "eventlog" _write_plain_log( @@ -154,6 +168,8 @@ def test_reason_mentions_full_log_on_qualification(self, tmp_path): class TestSourcePathPreserved: + """Test that source_path echoes the original input string.""" + def test_source_path_equals_input_string(self, tmp_path): log = tmp_path / "eventlog" _write_plain_log(log, [env_update({"spark.master": "local"})]) diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_markers.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_markers.py index 3f225df94..4234b66f4 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_markers.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_markers.py @@ -13,6 +13,7 @@ # limitations under the License. 
"""Unit tests for ``eventlog_detector.markers``.""" +# pylint: disable=too-few-public-methods # test classes naturally have few methods import re @@ -20,6 +21,8 @@ class TestGpuMarkers: + """Test GPU plugin marker constants.""" + def test_plugin_substring_exact(self): assert m.GPU_PLUGIN_CLASS_SUBSTRING == "com.nvidia.spark.SQLPlugin" @@ -28,6 +31,8 @@ def test_gpu_toggle_key_exact(self): class TestAuronMarkers: + """Test Auron extension and enabled marker constants.""" + def test_extension_regex_fullmatches_expected_value(self): # Mirrors AuronParseHelper.extensionRegxMap. pat = re.compile(m.AURON_EXTENSION_REGEX) @@ -42,6 +47,8 @@ def test_auron_enabled_defaults_to_true(self): class TestDatabricksPrecondition: + """Test Databricks precondition key constants.""" + def test_all_three_tag_keys_present(self): assert m.DB_PRECONDITION_KEYS == ( "spark.databricks.clusterUsageTags.clusterAllTags", @@ -51,6 +58,8 @@ def test_all_three_tag_keys_present(self): class TestPhotonMarkers: + """Test Photon marker regex constants.""" + def test_marker_map_fullmatches_expected(self): pats = {k: re.compile(v) for k, v in m.PHOTON_MARKER_REGEX.items()} assert pats[ @@ -73,6 +82,8 @@ def test_all_four_photon_keys(self): class TestDatabricksRollingFileName: + """Test Databricks rolling log file name pattern constants.""" + def test_prefix_is_eventlog(self): assert m.DB_EVENT_LOG_FILE_PREFIX == "eventlog" diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py index 5cd0b770c..41d665fe4 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py @@ -13,6 +13,7 @@ # limitations under the License. """Unit tests for ``eventlog_detector.resolver``.""" +# pylint: disable=too-few-public-methods # test classes naturally have few methods from datetime import datetime from pathlib import Path @@ -28,6 +29,8 @@ class TestDatabricksDateParse: + """Test Databricks rolling log file date parsing.""" + def test_bare_eventlog_is_latest_sentinel(self): # Returns None; caller treats None as "sort last". 
assert _parse_databricks_file_datetime("eventlog") is None @@ -45,6 +48,8 @@ def test_non_eventlog_prefix_returns_none(self): class TestResolveSingleFile: + """Test resolving a single event log file.""" + def test_single_file_returns_single_element_list(self, tmp_path: Path): f = tmp_path / "eventlog.zstd" f.write_bytes(b"x") @@ -54,6 +59,8 @@ def test_single_file_returns_single_element_list(self, tmp_path: Path): class TestResolveDatabricksRollingDir: + """Test resolving a Databricks rolling event log directory.""" + def test_orders_earliest_first_and_bare_eventlog_last(self, tmp_path: Path): d = tmp_path / "dbrolling" d.mkdir() @@ -86,6 +93,8 @@ def test_empty_dir_raises(self, tmp_path: Path): class TestResolveUnsupportedShapes: + """Test that unsupported directory shapes raise UnsupportedInputError.""" + def test_spark_native_rolling_dir_raises(self, tmp_path: Path): d = tmp_path / "eventlog_v2_local-1623876083964" d.mkdir() diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_scanner.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_scanner.py index 5658f17b6..90c243c71 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_scanner.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_scanner.py @@ -14,27 +14,21 @@ """Unit tests for ``eventlog_detector.scanner``.""" +import json from pathlib import Path from typing import List -import pytest - from spark_rapids_tools.storagelib import CspPath from spark_rapids_tools.tools.eventlog_detector.scanner import ( _scan_events, _scan_events_across, ) -from spark_rapids_tools.tools.eventlog_detector.types import ( - SparkRuntime, - Termination, -) +from spark_rapids_tools.tools.eventlog_detector.types import Termination # ---------- Line builders ---------- def env_update(props: dict) -> str: - import json - return json.dumps( { "Event": "SparkListenerEnvironmentUpdate", @@ -47,14 +41,10 @@ def env_update(props: dict) -> str: def log_start(version: str = "3.5.1") -> str: - import json - return json.dumps({"Event": "SparkListenerLogStart", "Spark Version": version}) def app_start(app_id: str = "app-1", app_name: str = "App") -> str: - import json - return json.dumps( { "Event": "SparkListenerApplicationStart", @@ -65,8 +55,6 @@ def app_start(app_id: str = "app-1", app_name: str = "App") -> str: def sql_exec_start(modified_configs: dict) -> str: - import json - return json.dumps( { "Event": "org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart", @@ -84,6 +72,8 @@ def sql_exec_start(modified_configs: dict) -> str: # ---------- Tests for _scan_events (single stream) ---------- class TestScanEvents: + """Tests for _scan_events scanning a single event stream.""" + def test_env_update_with_gpu_is_decisive(self): lines = iter( [ @@ -165,6 +155,8 @@ def _write(path: Path, lines: List[str]) -> CspPath: class TestScanEventsAcross: + """Tests for _scan_events_across scanning across multiple files.""" + def test_gpu_signal_in_second_file_is_decisive(self, tmp_path): f1 = _write( tmp_path / "eventlog-2021-06-14--18-00", diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py index 07cfdfe9c..cc2c0bb5f 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py @@ -13,6 +13,7 @@ # limitations under the License. 
"""Unit tests for ``eventlog_detector.stream``.""" +# pylint: disable=too-few-public-methods # test classes naturally have few methods import gzip from pathlib import Path @@ -72,34 +73,42 @@ def zstd_file(tmp_path: Path) -> CspPath: class TestPlainStream: - def test_yields_all_lines(self, plain_file): + """Test streaming plain-text event logs.""" + + def test_yields_all_lines(self, plain_file): # pylint: disable=redefined-outer-name with _open_event_log_stream(plain_file) as lines: - collected = [ln for ln in lines] + collected = list(lines) assert collected == SAMPLE_LINES class TestGzipStream: - def test_yields_all_lines(self, gz_file): + """Test streaming gzip-compressed event logs.""" + + def test_yields_all_lines(self, gz_file): # pylint: disable=redefined-outer-name with _open_event_log_stream(gz_file) as lines: - collected = [ln for ln in lines] + collected = list(lines) assert collected == SAMPLE_LINES class TestZstdStream: - def test_yields_all_lines(self, zstd_file): + """Test streaming zstd-compressed event logs.""" + + def test_yields_all_lines(self, zstd_file): # pylint: disable=redefined-outer-name with _open_event_log_stream(zstd_file) as lines: - collected = [ln for ln in lines] + collected = list(lines) assert collected == SAMPLE_LINES def test_zst_short_suffix_also_works(self, tmp_path): p = tmp_path / "eventlog.zst" _write_zstd(p) with _open_event_log_stream(CspPath(str(p))) as lines: - collected = [ln for ln in lines] + collected = list(lines) assert collected == SAMPLE_LINES class TestUnsupportedCompression: + """Test that unsupported compression formats raise UnsupportedCompressionError.""" + def test_lz4_raises(self, tmp_path): p = tmp_path / "eventlog.lz4" p.write_bytes(b"not-real-lz4") @@ -123,6 +132,8 @@ def test_lzf_raises(self, tmp_path): class TestIoFailure: + """Test that I/O errors raise EventLogReadError.""" + def test_missing_file_raises_read_error(self, tmp_path): p = tmp_path / "does-not-exist" with pytest.raises(EventLogReadError): diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py index e2c52f028..7e050d6f9 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py @@ -13,6 +13,7 @@ # limitations under the License. """Unit tests for ``eventlog_detector.types``.""" +# pylint: disable=too-few-public-methods # test classes naturally have few methods import pytest @@ -29,6 +30,8 @@ class TestRoute: + """Test the Route string enum.""" + def test_has_three_values(self): assert {r.value for r in Route} == {"QUALIFICATION", "PROFILING", "UNKNOWN"} @@ -38,6 +41,8 @@ def test_is_string_enum(self): class TestSparkRuntime: + """Test the SparkRuntime string enum.""" + def test_values_match_scala_enum_exactly(self): # These strings match org.apache.spark.sql.rapids.tool.util.SparkRuntime # which aether already persists in JobRun.spark_runtime. 
@@ -53,11 +58,15 @@ def test_is_string_enum(self): class TestTermination: + """Test the Termination enum modes.""" + def test_has_three_modes(self): assert {t.name for t in Termination} == {"DECISIVE", "EXHAUSTED", "CAP_HIT"} class TestDetectionResult: + """Test DetectionResult dataclass semantics.""" + def test_frozen_dataclass(self): result = DetectionResult( route=Route.PROFILING, @@ -74,15 +83,15 @@ def test_frozen_dataclass(self): result.route = Route.UNKNOWN # type: ignore[misc] def test_structural_equality(self): - kwargs = dict( - route=Route.QUALIFICATION, - spark_runtime=SparkRuntime.SPARK, - app_id="a", - spark_version="3.5.1", - event_log_path="/tmp/a", - source_path="/tmp/a", - reason="walked full log, no GPU-family signal", - ) + kwargs = { + "route": Route.QUALIFICATION, + "spark_runtime": SparkRuntime.SPARK, + "app_id": "a", + "spark_version": "3.5.1", + "event_log_path": "/tmp/a", + "source_path": "/tmp/a", + "reason": "walked full log, no GPU-family signal", + } assert DetectionResult(**kwargs) == DetectionResult(**kwargs) assert hash(DetectionResult(**kwargs)) == hash(DetectionResult(**kwargs)) # Distinct payloads compare unequal. @@ -104,6 +113,8 @@ def test_accepts_optional_fields_as_none(self): class TestExceptionHierarchy: + """Test that all detector exceptions form a coherent hierarchy.""" + def test_all_errors_subclass_base(self): for cls in ( UnsupportedInputError, From 8906da091c6b44fa9d4dbb62dc272362e617d485 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 18:41:17 -0700 Subject: [PATCH 18/29] docs(spec): record realized fixture inventory for event log detector Replace the two placeholder fixture bullets with the concrete three-item inventory actually used by test_detector_fixtures.py, and clarify that Databricks-rolling coverage is handled via synthesized fixtures in test_detector.py since core/src/test/resources/ has no reusable Databricks-rolling-shape fixture. Co-Authored-By: Claude Opus 4.7 Signed-off-by: Sayed Bilal Bari --- .../specs/2026-04-22-eventlog-runtime-detector-design.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md b/docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md index 12e415af5..e41863bae 100644 --- a/docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md +++ b/docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md @@ -247,9 +247,11 @@ All errors subclass `EventLogDetectionError`: Runs `detect_spark_runtime` against a small curated set of existing fixtures under `core/src/test/resources/spark-events-*`: -- `eventlog-gpu-dsv2.zstd` → `Route.PROFILING`, `SPARK_RAPIDS` -- `eventlog_dsv2.zstd` → `Route.QUALIFICATION`, `SPARK` -- A Databricks rolling fixture (to be identified during plan; if none exists we'll synthesize one) +- `spark-events-profiling/eventlog-gpu-dsv2.zstd` → `Route.PROFILING`, `SPARK_RAPIDS` +- `spark-events-profiling/eventlog_dsv2.zstd` → `Route.QUALIFICATION`, `SPARK` +- `spark-events-qualification/eventlog_same_app_id_1.zstd` → `Route.QUALIFICATION`, `SPARK` + +Fixture inventory above is the realized set used by `test_detector_fixtures.py`. Databricks-rolling-dir coverage is exercised via synthesized fixtures in `test_detector.py` rather than a pre-existing Scala fixture, since `core/src/test/resources/` has no Databricks-rolling-shape fixture we can repurpose here. Not a full parity sweep — just anchor points to catch regressions. 
@@ -261,7 +263,6 @@ Not a full parity sweep — just anchor points to catch regressions. ## 12. Open items for implementation plan -- Confirm `core/src/test/resources/` contains a usable Databricks-rolling fixture; synthesize one if not. - Whether adding `zstandard` as a hard dep needs a `RELEASE.md` note. ## 13. Evolution From 351d9f44ff00a981eff3ec5d5b0748459b952a4b Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 18:45:53 -0700 Subject: [PATCH 19/29] chore(eventlog_detector): fix E302 blank-line lint in stream.py One missing blank line between the module-level constants block and _classify_suffix. Flake8 E302 fix; no behavior change. Co-Authored-By: Claude Opus 4.7 Signed-off-by: Sayed Bilal Bari --- .../src/spark_rapids_tools/tools/eventlog_detector/stream.py | 1 + 1 file changed, 1 insertion(+) diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py index c78a8b0e7..b088a5913 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py @@ -50,6 +50,7 @@ # Suffixes that unambiguously indicate a codec we do not support in V1. _UNSUPPORTED_CODEC_SUFFIXES = {".lz4", ".lzf", ".snappy"} + def _classify_suffix(path: CspPath) -> str: name = path.base_name().lower() dot = name.rfind(".") From 6c81f37244576fee277915351dd7bdbaf66f85a4 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 18:53:46 -0700 Subject: [PATCH 20/29] fix(eventlog_detector): narrow _parse_bool to Scala toBoolean semantics Mirror Scala's ``Try { s.toBoolean }.getOrElse(default)`` exactly: only ``"true"``/``"false"`` (case-insensitive) are valid boolean strings; ``"yes"``, ``"no"``, ``"1"``, ``"0"`` and everything else falls back to the default. Removes the overly broad ``_TRUE_STRINGS``/``_FALSE_STRINGS`` sets that caused false-negatives (e.g. ``spark.rapids.sql.enabled=no`` routed as CPU by us but GPU by Scala). Adds a parametrized test to document parity with Scala behaviour. Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Sayed Bilal Bari --- .../tools/eventlog_detector/classifier.py | 16 +++++++++------- .../tools/eventlog_detector/test_classifier.py | 10 ++++++++++ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py index a67b2f97b..1bb0b9e48 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py @@ -28,16 +28,18 @@ from spark_rapids_tools.tools.eventlog_detector.types import SparkRuntime -_TRUE_STRINGS = {"true", "1", "yes"} -_FALSE_STRINGS = {"false", "0", "no"} - - def _parse_bool(raw: str, default: bool) -> bool: - """Mirror Scala's ``Try { s.toBoolean }.getOrElse(default)``.""" + """Mirror Scala's ``Try { s.toBoolean }.getOrElse(default)``. + + Scala's ``String.toBoolean`` accepts only ``"true"``/``"false"`` + case-insensitively. Everything else (including ``"yes"``, ``"no"``, + ``"1"``, ``"0"``) falls back to ``default`` because the Scala call + would throw ``IllegalArgumentException``. 
+ """ stripped = raw.strip().lower() - if stripped in _TRUE_STRINGS: + if stripped == "true": return True - if stripped in _FALSE_STRINGS: + if stripped == "false": return False return default diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py index dd571fd2e..ac4e7ab05 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py @@ -60,6 +60,16 @@ def test_unparseable_enabled_defaults_to_true(self): } assert _classify_runtime(props) is SparkRuntime.SPARK_RAPIDS + @pytest.mark.parametrize("bogus_value", ["no", "0", "yes", "1", "", "maybe"]) + def test_non_toboolean_values_default_to_true_matching_scala(self, bogus_value): + # Scala: Try { "no".toBoolean }.getOrElse(true) == true because + # "no" is not parseable. The Python classifier must do the same. + props = { + "spark.plugins": "com.nvidia.spark.SQLPlugin", + "spark.rapids.sql.enabled": bogus_value, + } + assert _classify_runtime(props) is SparkRuntime.SPARK_RAPIDS + class TestAuron: """Test AURON classification logic.""" From b2486a3d72266195810f6e49e1da5d1f1ef3853d Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 18:53:55 -0700 Subject: [PATCH 21/29] fix(eventlog_detector): whitelist supported codec suffixes Replace the narrow ``_UNSUPPORTED_CODEC_SUFFIXES`` blacklist with an explicit ``_SUPPORTED_SUFFIXES`` whitelist (plain, .inprogress, .gz, .zstd, .zst). Any unrecognised file suffix now raises ``UnsupportedCompressionError`` rather than silently falling through as plain text, which could cause garbled reads on unknown compressed formats. Adds ``_PLAIN_SUFFIXES`` constant and updates the else-branch comment to reflect that unknown-codec fallthrough is no longer possible. Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Sayed Bilal Bari --- .../tools/eventlog_detector/stream.py | 28 +++++++++++-------- .../tools/eventlog_detector/test_stream.py | 7 +++++ 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py index b088a5913..f400a058c 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py @@ -47,8 +47,15 @@ # PyArrow does NOT recognise ``.zstd`` as a codec suffix, so the byte stream # is raw compressed data that we must decompress ourselves. _ZSTD_MANUAL_SUFFIXES = {".zstd"} -# Suffixes that unambiguously indicate a codec we do not support in V1. -_UNSUPPORTED_CODEC_SUFFIXES = {".lz4", ".lzf", ".snappy"} +# Suffixes treated as plain text (no decompression needed, no scheme check). +_PLAIN_SUFFIXES = {"", ".inprogress"} +# Full whitelist of suffixes the detector accepts. Anything else raises +# ``UnsupportedCompressionError`` — including explicitly-bad codecs like +# ``.lz4`` / ``.lzf`` / ``.snappy`` and any unknown suffix we might +# otherwise fall through as plain text. 
+_SUPPORTED_SUFFIXES = ( + _PYARROW_AUTO_DECOMP_SUFFIXES | _ZSTD_MANUAL_SUFFIXES | _PLAIN_SUFFIXES +) def _classify_suffix(path: CspPath) -> str: @@ -62,10 +69,11 @@ def _classify_suffix(path: CspPath) -> str: @contextlib.contextmanager def _open_event_log_stream(path: CspPath) -> Iterator[Iterator[str]]: suffix = _classify_suffix(path) - if suffix in _UNSUPPORTED_CODEC_SUFFIXES: + if suffix not in _SUPPORTED_SUFFIXES: raise UnsupportedCompressionError( - f"Compression codec '{suffix}' is not supported by the lightweight " - "event log detector. Fall back to the full qualification/profiling " + f"File suffix '{suffix}' is not supported by the lightweight " + "event log detector. Supported: plain, .inprogress, .gz, " + ".zstd, .zst. Fall back to the full qualification/profiling " "pipeline for this log." ) @@ -84,13 +92,9 @@ def _open_event_log_stream(path: CspPath) -> Iterator[Iterator[str]]: decompressed: io.RawIOBase = dctx.stream_reader(byte_stream) close_stack.callback(decompressed.close) else: - # For ``.gz``, ``.zst``, ``.inprogress``, and unknown/plain - # suffixes, PyArrow already handles decompression (or there is - # nothing to decompress). Pass the byte stream straight through. - # If the file is actually compressed with an unknown codec the - # scanner will see garbled lines that don't parse as JSON; those - # will be skipped and the caller will see Route.UNKNOWN — the - # correct failure mode for this lightweight path. + # Plain or PyArrow auto-decompressed (.gz/.zst/.inprogress/no + # extension). Pass the byte stream straight through — PyArrow + # has already handled decompression where needed. decompressed = byte_stream text = io.TextIOWrapper(decompressed, encoding="utf-8", errors="replace", newline="") diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py index cc2c0bb5f..dbcaf94b2 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py @@ -130,6 +130,13 @@ def test_lzf_raises(self, tmp_path): with _open_event_log_stream(CspPath(str(p))) as _: pass + def test_unknown_suffix_raises(self, tmp_path): + p = tmp_path / "eventlog.weirdcodec" + p.write_bytes(b"some-bytes") + with pytest.raises(UnsupportedCompressionError): + with _open_event_log_stream(CspPath(str(p))) as _: + pass + class TestIoFailure: """Test that I/O errors raise EventLogReadError.""" From d8be4645b403ba98d9a215d7e254110a20b82ef1 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 18:54:01 -0700 Subject: [PATCH 22/29] fix(eventlog_detector): track last-scanned file for event_log_path Add ``last_scanned_path`` to ``_ScanResult`` and update it in ``_scan_events_across`` as each file is opened. In ``detector.py``, prefer ``scan.last_scanned_path`` over ``files[0]`` so that ``DetectionResult.event_log_path`` reflects the file that actually contained the decisive signal, not always the first file in a rolling directory. Extends the Databricks rolling-dir test to assert the correct path is reported. 
Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Sayed Bilal Bari --- .../spark_rapids_tools/tools/eventlog_detector/detector.py | 2 +- .../src/spark_rapids_tools/tools/eventlog_detector/scanner.py | 2 ++ .../tools/eventlog_detector/test_detector.py | 4 ++++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/detector.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/detector.py index e1b9fd91f..faba18cb2 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/detector.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/detector.py @@ -84,7 +84,7 @@ def detect_spark_runtime( else "no SparkListenerEnvironmentUpdate reached" ) - resolved_path = str(files[0]) if files else source_path + resolved_path = scan.last_scanned_path or (str(files[0]) if files else source_path) return DetectionResult( route=route, spark_runtime=runtime, diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/scanner.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/scanner.py index c695cea02..80c07a2d7 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/scanner.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/scanner.py @@ -40,6 +40,7 @@ class _ScanResult: env_update_seen: bool = False events_scanned: int = 0 termination: Termination = Termination.EXHAUSTED + last_scanned_path: Optional[str] = None def _scan_events( @@ -119,6 +120,7 @@ def _scan_events_across(files: List[CspPath], *, budget: int) -> _ScanResult: if state.events_scanned >= budget: state.termination = Termination.CAP_HIT return state + state.last_scanned_path = str(path) with _open_event_log_stream(path) as lines: state = _scan_events(lines, budget=budget, state=state) if state.termination in (Termination.DECISIVE, Termination.CAP_HIT): diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py index 36f3dadca..bafd07742 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py @@ -132,6 +132,10 @@ def test_gpu_in_later_rolled_file(self, tmp_path): result = detect_spark_runtime(CspPath(str(d))) assert result.route is Route.PROFILING assert result.spark_runtime is SparkRuntime.SPARK_RAPIDS + # The decisive signal came from the bare `eventlog` file (the latest, + # sorted last after the dated chunk). event_log_path should reflect + # that, not the first-opened file. + assert result.event_log_path.endswith("/eventlog") class TestUnsupportedInput: From 3bebb113550cee5e0f648b6c0bed3fa1a1479d3b Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 19:02:47 -0700 Subject: [PATCH 23/29] docs: remove local implementation plan from the PR The plan document was a development aid for the implementation phase and does not belong in the published change set. The design spec remains under docs/superpowers/specs/ as the authoritative reference. 
Signed-off-by: Sayed Bilal Bari --- .../2026-04-22-eventlog-runtime-detector.md | 2170 ----------------- 1 file changed, 2170 deletions(-) delete mode 100644 docs/superpowers/plans/2026-04-22-eventlog-runtime-detector.md diff --git a/docs/superpowers/plans/2026-04-22-eventlog-runtime-detector.md b/docs/superpowers/plans/2026-04-22-eventlog-runtime-detector.md deleted file mode 100644 index 2daa0dbe1..000000000 --- a/docs/superpowers/plans/2026-04-22-eventlog-runtime-detector.md +++ /dev/null @@ -1,2170 +0,0 @@ -# Event Log Runtime Detector Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Ship a lightweight Python function `detect_spark_runtime(path)` that returns a `Route` (`QUALIFICATION` / `PROFILING` / `UNKNOWN`) and best-effort runtime metadata by scanning a bounded prefix of a Spark event log — single file or Databricks rolling directory — without invoking the Scala tools pipeline. - -**Architecture:** Four cohesive units inside a single module under `user_tools/src/spark_rapids_tools/tools/`: path resolver, stream opener (context manager, codec-aware), bounded event scanner (streaming only, no accumulation), pure-function classifier. A single-source-of-truth markers file pins the key strings/regexes to Scala source references. Public surface is the function plus three dataclasses/enums (`Route`, `SparkRuntime`, `DetectionResult`) plus a small exception hierarchy. No CLI. No output files. - -**Tech Stack:** Python 3.10+ (matches `user_tools/pyproject.toml`), stdlib (`gzip`, `json`, `io`, `re`, `contextlib`, `dataclasses`, `enum`, `pathlib`, `datetime`), `zstandard` (new hard dep), `spark_rapids_tools.storagelib.csppath.CspPath` + `spark_rapids_tools.storagelib.cspfs.CspFs` for cloud/local I/O, `pytest` for tests. - -**Spec:** `docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md` - ---- - -## File Structure - -**New files (all under `user_tools/`):** - -- `src/spark_rapids_tools/tools/eventlog_detector/__init__.py` — re-exports public API. -- `src/spark_rapids_tools/tools/eventlog_detector/markers.py` — single source of truth for property keys, regexes, substrings. Each constant carries a `# Scala source: :` comment. -- `src/spark_rapids_tools/tools/eventlog_detector/types.py` — `Route`, `SparkRuntime`, `DetectionResult`, `Termination` enum, exception hierarchy. -- `src/spark_rapids_tools/tools/eventlog_detector/resolver.py` — `_resolve_event_log_files(path) -> tuple[str, list[CspPath]]`. -- `src/spark_rapids_tools/tools/eventlog_detector/stream.py` — `_open_event_log_stream(path)` context manager yielding `Iterator[str]`. -- `src/spark_rapids_tools/tools/eventlog_detector/scanner.py` — `_scan_events(lines, budget) -> _ScanResult` and `_scan_events_across(paths, budget) -> _ScanResult`. -- `src/spark_rapids_tools/tools/eventlog_detector/classifier.py` — `_classify_runtime(spark_properties) -> SparkRuntime`. -- `src/spark_rapids_tools/tools/eventlog_detector/detector.py` — top-level `detect_spark_runtime()` that ties everything together. - -**New tests (under `user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/`):** - -- `__init__.py` -- `conftest.py` — shared fixtures (synthesized event log files in `tmp_path`). 
-- `test_markers.py` -- `test_types.py` -- `test_classifier.py` -- `test_resolver.py` -- `test_stream.py` -- `test_scanner.py` -- `test_detector.py` — integration tests covering the public API end-to-end. -- `test_detector_fixtures.py` — runs the detector against the existing `core/src/test/resources/spark-events-*` fixtures with expected `Route` labels. - -**Modified files:** - -- `user_tools/pyproject.toml` — add `zstandard` to `dependencies`. -- `user_tools/src/spark_rapids_tools/tools/__init__.py` — add re-export line so `from spark_rapids_tools.tools import detect_spark_runtime` works (if the existing `__init__.py` follows that pattern — verify in Task 2). - -**Rationale for splitting into many small files:** the spec defines four independently testable components plus a shared types/markers layer. Putting each in its own file keeps files small, lets each test file focus on one component, and makes reviewer diffs surgical. This matches existing layout in `user_tools/src/spark_rapids_tools/tools/` (several single-purpose files there already). - ---- - -## Task 1: Add `zstandard` dependency - -**Files:** - -- Modify: `user_tools/pyproject.toml` - -- [ ] **Step 1: Read current dependencies block** - -Run: `sed -n '/^dependencies = \[/,/^\]/p' user_tools/pyproject.toml` -Expected: prints the current `dependencies = [ ... ]` block. `zstandard` must not be present. - -- [ ] **Step 2: Add `zstandard` to dependencies** - -Use Edit to insert a new line after the existing `"pyYAML>=6.0.2",` line (or similar stable anchor — whichever comes last alphabetically near the Z range). Add: - -``` - # Decompresses Spark event logs written with --conf spark.eventLog.compress=true - # spark.io.compression.codec=zstd (the common default). Used by - # spark_rapids_tools.tools.eventlog_detector. - "zstandard>=0.22.0", -``` - -The exact Edit call (using a unique anchor from the current file — pick the existing `# used for retrieving available memory on the host` comment block, which precedes `"psutil==7.0.0"`): - -```python -Edit( - file_path="user_tools/pyproject.toml", - old_string=' # used for retrieving available memory on the host\n "psutil==7.0.0",', - new_string=' # used for retrieving available memory on the host\n "psutil==7.0.0",\n # Decompresses Spark event logs with zstd codec. Used by\n # spark_rapids_tools.tools.eventlog_detector.\n "zstandard>=0.22.0",' -) -``` - -- [ ] **Step 3: Install the updated package locally** - -Run: `pip install -e user_tools/` -Expected: `zstandard-` appears in the output; install completes successfully. - -- [ ] **Step 4: Verify `zstandard` is importable** - -Run: `python -c "import zstandard; print(zstandard.__version__)"` -Expected: prints a version string, no `ModuleNotFoundError`. - -- [ ] **Step 5: Commit** - -```bash -git add user_tools/pyproject.toml -git commit -m "build(user_tools): add zstandard dep for event log detector" -``` - ---- - -## Task 2: Create module skeleton and `__init__.py` - -**Files:** - -- Create: `user_tools/src/spark_rapids_tools/tools/eventlog_detector/__init__.py` -- Read (do not modify yet): `user_tools/src/spark_rapids_tools/tools/__init__.py` - -- [ ] **Step 1: Read the existing `tools/__init__.py` to learn the re-export convention** - -Run: `cat user_tools/src/spark_rapids_tools/tools/__init__.py` -Record: whether the file has explicit re-exports, `__all__`, or is empty. This determines whether Task 9 needs to append anything there. 
- -- [ ] **Step 2: Create the package skeleton** - -Create `user_tools/src/spark_rapids_tools/tools/eventlog_detector/__init__.py`: - -```python -# Copyright (c) 2026, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Lightweight event log runtime detector. - -Public entry point: ``detect_spark_runtime``. Returns a ``DetectionResult`` -carrying a routing decision (``QUALIFICATION`` / ``PROFILING`` / ``UNKNOWN``) -and best-effort metadata, by scanning a bounded prefix of a Spark event log. - -See docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md -for the full contract and the Scala sources this mirrors. -""" - -from spark_rapids_tools.tools.eventlog_detector.detector import detect_spark_runtime -from spark_rapids_tools.tools.eventlog_detector.types import ( - DetectionResult, - EventLogDetectionError, - EventLogReadError, - Route, - SparkRuntime, - UnsupportedCompressionError, - UnsupportedInputError, -) - -__all__ = [ - "DetectionResult", - "EventLogDetectionError", - "EventLogReadError", - "Route", - "SparkRuntime", - "UnsupportedCompressionError", - "UnsupportedInputError", - "detect_spark_runtime", -] -``` - -This will `ImportError` until the submodules exist. That's fine; subsequent tasks fill them in. We create it first so each later task's "does it import?" smoke test covers the integration path. - -- [ ] **Step 3: Create empty test package** - -Create `user_tools/tests/spark_rapids_tools_ut/tools/__init__.py` (empty) and `user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/__init__.py` (empty). First check whether `tools/__init__.py` already exists: - -Run: `test -f user_tools/tests/spark_rapids_tools_ut/tools/__init__.py && echo EXISTS || echo MISSING` - -If `MISSING`, create it with the Apache 2 header as a docstring module: - -```python -# Copyright (c) 2026, NVIDIA CORPORATION. -# SPDX-License-Identifier: Apache-2.0 -"""Tests for ``spark_rapids_tools.tools``.""" -``` - -Then create `user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/__init__.py` with the same header. - -- [ ] **Step 4: Commit** - -```bash -git add user_tools/src/spark_rapids_tools/tools/eventlog_detector/__init__.py \ - user_tools/tests/spark_rapids_tools_ut/tools/__init__.py \ - user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/__init__.py -git commit -m "feat(eventlog_detector): package skeleton and public re-exports" -``` - -Note: the package re-exports will fail to resolve until Task 3/9, but the commit is self-contained as a new empty package. Do not run the imports yet. - ---- - -## Task 3: Types — `Route`, `SparkRuntime`, `DetectionResult`, exceptions - -**Files:** - -- Create: `user_tools/src/spark_rapids_tools/tools/eventlog_detector/types.py` -- Test: `user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py` - -- [ ] **Step 1: Write the failing test** - -Create `user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py`: - -```python -# Copyright (c) 2026, NVIDIA CORPORATION. 
-# SPDX-License-Identifier: Apache-2.0 -"""Unit tests for ``eventlog_detector.types``.""" - -import pytest - -from spark_rapids_tools.tools.eventlog_detector.types import ( - DetectionResult, - EventLogDetectionError, - EventLogReadError, - Route, - SparkRuntime, - Termination, - UnsupportedCompressionError, - UnsupportedInputError, -) - - -class TestRoute: - def test_has_three_values(self): - assert {r.value for r in Route} == {"QUALIFICATION", "PROFILING", "UNKNOWN"} - - def test_is_string_enum(self): - # str subclass means aether can compare against plain strings. - assert Route.PROFILING == "PROFILING" - - -class TestSparkRuntime: - def test_values_match_scala_enum_exactly(self): - # These strings match org.apache.spark.sql.rapids.tool.util.SparkRuntime - # which aether already persists in JobRun.spark_runtime. - assert {r.value for r in SparkRuntime} == { - "SPARK", - "SPARK_RAPIDS", - "PHOTON", - "AURON", - } - - def test_is_string_enum(self): - assert SparkRuntime.SPARK_RAPIDS == "SPARK_RAPIDS" - - -class TestTermination: - def test_has_three_modes(self): - assert {t.name for t in Termination} == {"DECISIVE", "EXHAUSTED", "CAP_HIT"} - - -class TestDetectionResult: - def test_frozen_dataclass(self): - result = DetectionResult( - route=Route.PROFILING, - spark_runtime=SparkRuntime.SPARK_RAPIDS, - app_id="app-1", - spark_version="3.5.1", - event_log_path="/tmp/x", - source_path="/tmp/x", - reason="decisive: classified as SPARK_RAPIDS", - ) - with pytest.raises((AttributeError, Exception)): - result.route = Route.UNKNOWN # type: ignore[misc] - - def test_accepts_optional_fields_as_none(self): - result = DetectionResult( - route=Route.UNKNOWN, - spark_runtime=None, - app_id=None, - spark_version=None, - event_log_path="/tmp/x", - source_path="/tmp/x", - reason="no decisive signal within bounded scan", - ) - assert result.route is Route.UNKNOWN - assert result.spark_runtime is None - - -class TestExceptionHierarchy: - def test_all_errors_subclass_base(self): - for cls in ( - UnsupportedInputError, - UnsupportedCompressionError, - EventLogReadError, - ): - assert issubclass(cls, EventLogDetectionError) - - def test_base_is_value_error(self): - # Using ValueError as the common ancestor so callers who catch - # ValueError (a reasonable default for bad input) still see these. - assert issubclass(EventLogDetectionError, ValueError) -``` - -- [ ] **Step 2: Run the test and confirm it fails** - -Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py -v` -Expected: `ModuleNotFoundError: No module named 'spark_rapids_tools.tools.eventlog_detector.types'`. - -- [ ] **Step 3: Implement `types.py`** - -Create `user_tools/src/spark_rapids_tools/tools/eventlog_detector/types.py`: - -```python -# Copyright (c) 2026, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Types, enums, and exceptions for the event log runtime detector.""" - -from dataclasses import dataclass -from enum import Enum -from typing import Optional - - -class Route(str, Enum): - """Routing decision returned to the caller.""" - - QUALIFICATION = "QUALIFICATION" - PROFILING = "PROFILING" - UNKNOWN = "UNKNOWN" - - -class SparkRuntime(str, Enum): - """Runtime taxonomy. - - Values match ``org.apache.spark.sql.rapids.tool.util.SparkRuntime`` in the - Scala core so aether's ``JobRun.spark_runtime`` column stays compatible. - """ - - SPARK = "SPARK" - SPARK_RAPIDS = "SPARK_RAPIDS" - PHOTON = "PHOTON" - AURON = "AURON" - - -class Termination(Enum): - """How the scanner stopped. Used internally by the detector flow.""" - - DECISIVE = "DECISIVE" # classification returned non-SPARK - EXHAUSTED = "EXHAUSTED" # walked every file to EOF under the budget - CAP_HIT = "CAP_HIT" # hit max_events_scanned before exhausting files - - -@dataclass(frozen=True) -class DetectionResult: - """Result returned by ``detect_spark_runtime``. - - ``spark_runtime`` is best-effort metadata. ``None`` is valid (e.g., when - ``route`` is ``UNKNOWN`` because env-update was never seen). - """ - - route: Route - spark_runtime: Optional[SparkRuntime] - app_id: Optional[str] - spark_version: Optional[str] - event_log_path: str - source_path: str - reason: str - - -class EventLogDetectionError(ValueError): - """Base class for detector errors.""" - - -class UnsupportedInputError(EventLogDetectionError): - """Input shape is outside V1 scope (multi-app dir, wildcard, comma list, ...).""" - - -class UnsupportedCompressionError(EventLogDetectionError): - """File uses a compression codec the V1 detector does not handle.""" - - -class EventLogReadError(EventLogDetectionError): - """Wraps an underlying I/O failure when reading the event log.""" -``` - -- [ ] **Step 4: Run the test and confirm it passes** - -Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py -v` -Expected: all 7 test cases pass. - -- [ ] **Step 5: Commit** - -```bash -git add user_tools/src/spark_rapids_tools/tools/eventlog_detector/types.py \ - user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py -git commit -m "feat(eventlog_detector): routing types and exception hierarchy" -``` - ---- - -## Task 4: Markers module (single source of truth pinned to Scala) - -**Files:** - -- Create: `user_tools/src/spark_rapids_tools/tools/eventlog_detector/markers.py` -- Test: `user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_markers.py` - -- [ ] **Step 1: Write the failing test** - -Create `test_markers.py`: - -```python -# Copyright (c) 2026, NVIDIA CORPORATION. -# SPDX-License-Identifier: Apache-2.0 -"""Unit tests for ``eventlog_detector.markers``.""" - -import re - -from spark_rapids_tools.tools.eventlog_detector import markers as m - - -class TestGpuMarkers: - def test_plugin_substring_exact(self): - assert m.GPU_PLUGIN_CLASS_SUBSTRING == "com.nvidia.spark.SQLPlugin" - - def test_gpu_toggle_key_exact(self): - assert m.GPU_ENABLED_KEY == "spark.rapids.sql.enabled" - - -class TestAuronMarkers: - def test_extension_regex_fullmatches_expected_value(self): - # Mirrors AuronParseHelper.extensionRegxMap. 
- pat = re.compile(m.AURON_EXTENSION_REGEX) - assert pat.fullmatch("org.apache.spark.sql.AuronSparkSessionExtension") - assert pat.fullmatch("whatever.AuronSparkSessionExtension.more") - assert not pat.fullmatch("org.apache.spark.sql.SomeOtherExtension") - - def test_auron_enabled_defaults_to_true(self): - assert m.AURON_ENABLED_DEFAULT == "true" - assert m.AURON_ENABLED_KEY == "spark.auron.enabled" - assert m.AURON_SPARK_EXTENSIONS_KEY == "spark.sql.extensions" - - -class TestDatabricksPrecondition: - def test_all_three_tag_keys_present(self): - assert m.DB_PRECONDITION_KEYS == ( - "spark.databricks.clusterUsageTags.clusterAllTags", - "spark.databricks.clusterUsageTags.clusterId", - "spark.databricks.clusterUsageTags.clusterName", - ) - - -class TestPhotonMarkers: - def test_marker_map_fullmatches_expected(self): - pats = {k: re.compile(v) for k, v in m.PHOTON_MARKER_REGEX.items()} - assert pats[ - "spark.databricks.clusterUsageTags.sparkVersion" - ].fullmatch("11.3.x-photon-scala2.12") - assert pats[ - "spark.databricks.clusterUsageTags.runtimeEngine" - ].fullmatch("PHOTON") - assert not pats[ - "spark.databricks.clusterUsageTags.runtimeEngine" - ].fullmatch("STANDARD") - - def test_all_four_photon_keys(self): - assert set(m.PHOTON_MARKER_REGEX) == { - "spark.databricks.clusterUsageTags.sparkVersion", - "spark.databricks.clusterUsageTags.effectiveSparkVersion", - "spark.databricks.clusterUsageTags.sparkImageLabel", - "spark.databricks.clusterUsageTags.runtimeEngine", - } - - -class TestDatabricksRollingFileName: - def test_prefix_is_eventlog(self): - assert m.DB_EVENT_LOG_FILE_PREFIX == "eventlog" - - def test_date_pattern_parses_scala_format(self): - pat = re.compile(m.DB_EVENT_LOG_DATE_REGEX) - # Scala's getDBEventLogFileDate splits on '--' and parses - # 'eventlog-YYYY-MM-DD--HH-MM[.codec]'. - assert pat.search("eventlog-2021-06-14--20-00.gz") - assert pat.search("eventlog-2021-06-14--20-00") - assert not pat.search("eventlog") # bare eventlog has no date -``` - -- [ ] **Step 2: Run the test and confirm it fails** - -Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_markers.py -v` -Expected: `ModuleNotFoundError`. - -- [ ] **Step 3: Implement `markers.py`** - -Create `user_tools/src/spark_rapids_tools/tools/eventlog_detector/markers.py`: - -```python -# Copyright (c) 2026, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Single source of truth for runtime-detection markers. - -Each constant below is pinned to a specific Scala source location. If -the Scala rule changes, update the constant here and the parity tests -under ``tests/spark_rapids_tools_ut/tools/eventlog_detector/`` will -catch any drift on a re-run. 
-""" - -from typing import Mapping, Tuple - -# --------------------------------------------------------------------------- -# SPARK_RAPIDS (GPU) markers -# Scala source: core/src/main/scala/org/apache/spark/sql/rapids/tool/ToolUtils.scala:114-121 -# --------------------------------------------------------------------------- -GPU_PLUGIN_KEY: str = "spark.plugins" -GPU_PLUGIN_CLASS_SUBSTRING: str = "com.nvidia.spark.SQLPlugin" -GPU_ENABLED_KEY: str = "spark.rapids.sql.enabled" -# Default when GPU_ENABLED_KEY is missing or unparseable as bool. Matches -# Scala ``Try { ... }.getOrElse(true)`` in isPluginEnabled. -GPU_ENABLED_DEFAULT: bool = True - -# --------------------------------------------------------------------------- -# AURON markers -# Scala source: core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/auron/AuronParseHelper.scala:149-172 -# --------------------------------------------------------------------------- -AURON_SPARK_EXTENSIONS_KEY: str = "spark.sql.extensions" -AURON_EXTENSION_REGEX: str = r".*AuronSparkSessionExtension.*" -AURON_ENABLED_KEY: str = "spark.auron.enabled" -AURON_ENABLED_DEFAULT: str = "true" - -# --------------------------------------------------------------------------- -# Databricks precondition (all three keys must be non-empty) -# Scala source: core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/db/DBPlugin.scala:45-58 -# and DatabricksParseHelper.scala:188-190 -# --------------------------------------------------------------------------- -DB_PRECONDITION_KEYS: Tuple[str, str, str] = ( - "spark.databricks.clusterUsageTags.clusterAllTags", - "spark.databricks.clusterUsageTags.clusterId", - "spark.databricks.clusterUsageTags.clusterName", -) - -# --------------------------------------------------------------------------- -# PHOTON markers (any one fullmatches once Databricks precondition holds) -# Scala source: core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/db/DatabricksParseHelper.scala:146-151 -# --------------------------------------------------------------------------- -PHOTON_MARKER_REGEX: Mapping[str, str] = { - "spark.databricks.clusterUsageTags.sparkVersion": r".*-photon-.*", - "spark.databricks.clusterUsageTags.effectiveSparkVersion": r".*-photon-.*", - "spark.databricks.clusterUsageTags.sparkImageLabel": r".*-photon-.*", - "spark.databricks.clusterUsageTags.runtimeEngine": r"PHOTON", -} - -# --------------------------------------------------------------------------- -# Databricks rolling event-log file layout -# Scala source: core/src/main/scala/com/nvidia/spark/rapids/tool/EventLogPathProcessor.scala:57 -# and :458-478 (date parse in getDBEventLogFileDate) -# --------------------------------------------------------------------------- -DB_EVENT_LOG_FILE_PREFIX: str = "eventlog" -# Matches the dated form ``eventlog-YYYY-MM-DD--HH-MM[.codec]`` used by -# ``DatabricksRollingEventLogFilesFileReader``. Bare ``eventlog`` has no -# match and is treated as "latest" (sorted last) by the resolver. 
-DB_EVENT_LOG_DATE_REGEX: str = ( - r"^eventlog-(\d{4})-(\d{2})-(\d{2})--(\d{2})-(\d{2})(?:\.[A-Za-z0-9]+)?$" -) - -# --------------------------------------------------------------------------- -# Supported Spark listener event names -# --------------------------------------------------------------------------- -EVENT_LOG_START: str = "SparkListenerLogStart" -EVENT_APPLICATION_START: str = "SparkListenerApplicationStart" -EVENT_ENVIRONMENT_UPDATE: str = "SparkListenerEnvironmentUpdate" -EVENT_SQL_EXECUTION_START: str = "org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart" -# Spark's actual SQLExecutionStart class name carries a package prefix in -# event logs. The unqualified shortname is sometimes used in test fixtures. -EVENT_SQL_EXECUTION_START_SHORTNAME: str = "SparkListenerSQLExecutionStart" -``` - -- [ ] **Step 4: Run the test and confirm it passes** - -Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_markers.py -v` -Expected: all assertions pass. - -- [ ] **Step 5: Commit** - -```bash -git add user_tools/src/spark_rapids_tools/tools/eventlog_detector/markers.py \ - user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_markers.py -git commit -m "feat(eventlog_detector): Scala-pinned markers module" -``` - ---- - -## Task 5: Classifier (pure function over a property dict) - -**Files:** - -- Create: `user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py` -- Test: `user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py` - -- [ ] **Step 1: Write the failing test** - -```python -# Copyright (c) 2026, NVIDIA CORPORATION. -# SPDX-License-Identifier: Apache-2.0 -"""Unit tests for ``eventlog_detector.classifier``.""" - -import pytest - -from spark_rapids_tools.tools.eventlog_detector.classifier import _classify_runtime -from spark_rapids_tools.tools.eventlog_detector.types import SparkRuntime - - -class TestEmptyProperties: - def test_empty_props_is_spark(self): - assert _classify_runtime({}) is SparkRuntime.SPARK - - -class TestSparkRapids: - def test_plugin_and_default_enabled(self): - props = {"spark.plugins": "foo,com.nvidia.spark.SQLPlugin,bar"} - assert _classify_runtime(props) is SparkRuntime.SPARK_RAPIDS - - def test_plugin_with_enabled_true(self): - props = { - "spark.plugins": "com.nvidia.spark.SQLPlugin", - "spark.rapids.sql.enabled": "true", - } - assert _classify_runtime(props) is SparkRuntime.SPARK_RAPIDS - - def test_plugin_with_enabled_false_demotes_to_spark(self): - props = { - "spark.plugins": "com.nvidia.spark.SQLPlugin", - "spark.rapids.sql.enabled": "false", - } - assert _classify_runtime(props) is SparkRuntime.SPARK - - def test_enabled_true_without_plugin_is_still_spark(self): - props = {"spark.rapids.sql.enabled": "true"} - assert _classify_runtime(props) is SparkRuntime.SPARK - - def test_unparseable_enabled_defaults_to_true(self): - props = { - "spark.plugins": "com.nvidia.spark.SQLPlugin", - "spark.rapids.sql.enabled": "not-a-bool", - } - assert _classify_runtime(props) is SparkRuntime.SPARK_RAPIDS - - -class TestAuron: - def test_extension_and_default_enabled(self): - props = {"spark.sql.extensions": "com.bytedance.auron.AuronSparkSessionExtension"} - assert _classify_runtime(props) is SparkRuntime.AURON - - def test_extension_and_enabled_false_demotes_to_spark(self): - props = { - "spark.sql.extensions": "com.bytedance.auron.AuronSparkSessionExtension", - "spark.auron.enabled": "FALSE", - } - assert _classify_runtime(props) is SparkRuntime.SPARK - - def 
test_auron_enabled_case_insensitive(self): - props = { - "spark.sql.extensions": "AuronSparkSessionExtension", - "spark.auron.enabled": " TrUe ", - } - assert _classify_runtime(props) is SparkRuntime.AURON - - -class TestDatabricksPhoton: - @pytest.fixture - def db_precond_props(self): - return { - "spark.databricks.clusterUsageTags.clusterAllTags": "[{...}]", - "spark.databricks.clusterUsageTags.clusterId": "1234", - "spark.databricks.clusterUsageTags.clusterName": "dev-cluster", - } - - def test_precondition_only_is_spark(self, db_precond_props): - assert _classify_runtime(db_precond_props) is SparkRuntime.SPARK - - def test_precondition_plus_photon_version(self, db_precond_props): - props = { - **db_precond_props, - "spark.databricks.clusterUsageTags.sparkVersion": "11.3.x-photon-scala2.12", - } - assert _classify_runtime(props) is SparkRuntime.PHOTON - - def test_precondition_plus_photon_engine(self, db_precond_props): - props = {**db_precond_props, "spark.databricks.clusterUsageTags.runtimeEngine": "PHOTON"} - assert _classify_runtime(props) is SparkRuntime.PHOTON - - def test_photon_marker_without_precondition_is_spark(self): - props = {"spark.databricks.clusterUsageTags.runtimeEngine": "PHOTON"} - assert _classify_runtime(props) is SparkRuntime.SPARK - - def test_photon_engine_other_value_is_spark(self, db_precond_props): - props = {**db_precond_props, "spark.databricks.clusterUsageTags.runtimeEngine": "STANDARD"} - assert _classify_runtime(props) is SparkRuntime.SPARK - - -class TestPriority: - """PHOTON > AURON > SPARK_RAPIDS > SPARK when markers coexist.""" - - def test_photon_beats_spark_rapids(self): - props = { - "spark.plugins": "com.nvidia.spark.SQLPlugin", - "spark.databricks.clusterUsageTags.clusterAllTags": "[{...}]", - "spark.databricks.clusterUsageTags.clusterId": "1", - "spark.databricks.clusterUsageTags.clusterName": "c", - "spark.databricks.clusterUsageTags.runtimeEngine": "PHOTON", - } - assert _classify_runtime(props) is SparkRuntime.PHOTON - - def test_auron_beats_spark_rapids(self): - props = { - "spark.plugins": "com.nvidia.spark.SQLPlugin", - "spark.sql.extensions": "AuronSparkSessionExtension", - } - assert _classify_runtime(props) is SparkRuntime.AURON -``` - -- [ ] **Step 2: Run the test and confirm it fails** - -Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py -v` -Expected: `ModuleNotFoundError`. - -- [ ] **Step 3: Implement the classifier** - -Create `user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py`: - -```python -# Copyright (c) 2026, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Pure-function runtime classifier. - -``_classify_runtime`` maps a merged Spark properties dict to a -``SparkRuntime``. Priority order when multiple markers are present: -``PHOTON > AURON > SPARK_RAPIDS > SPARK``. This is a deterministic Python -choice; Scala's plugin iteration order is undefined when multiple -plugins claim a runtime, but in practice markers do not overlap. 
-""" - -import re -from typing import Mapping - -from spark_rapids_tools.tools.eventlog_detector import markers as m -from spark_rapids_tools.tools.eventlog_detector.types import SparkRuntime - - -_TRUE_STRINGS = {"true", "1", "yes"} -_FALSE_STRINGS = {"false", "0", "no"} - - -def _parse_bool(raw: str, default: bool) -> bool: - """Mirror Scala's ``Try { s.toBoolean }.getOrElse(default)``.""" - stripped = raw.strip().lower() - if stripped in _TRUE_STRINGS: - return True - if stripped in _FALSE_STRINGS: - return False - return default - - -def _is_spark_rapids(props: Mapping[str, str]) -> bool: - plugins = props.get(m.GPU_PLUGIN_KEY, "") - if m.GPU_PLUGIN_CLASS_SUBSTRING not in plugins: - return False - raw = props.get(m.GPU_ENABLED_KEY) - if raw is None: - return m.GPU_ENABLED_DEFAULT - return _parse_bool(raw, default=m.GPU_ENABLED_DEFAULT) - - -def _is_auron(props: Mapping[str, str]) -> bool: - extensions = props.get(m.AURON_SPARK_EXTENSIONS_KEY) - if extensions is None or not re.fullmatch(m.AURON_EXTENSION_REGEX, extensions): - return False - enabled_raw = props.get(m.AURON_ENABLED_KEY, m.AURON_ENABLED_DEFAULT) - return enabled_raw.strip().lower() == m.AURON_ENABLED_DEFAULT - - -def _is_databricks(props: Mapping[str, str]) -> bool: - return all(props.get(k, "").strip() for k in m.DB_PRECONDITION_KEYS) - - -def _is_photon(props: Mapping[str, str]) -> bool: - if not _is_databricks(props): - return False - for key, pattern in m.PHOTON_MARKER_REGEX.items(): - value = props.get(key) - if value is not None and re.fullmatch(pattern, value): - return True - return False - - -def _classify_runtime(props: Mapping[str, str]) -> SparkRuntime: - # Priority: PHOTON > AURON > SPARK_RAPIDS > SPARK. - if _is_photon(props): - return SparkRuntime.PHOTON - if _is_auron(props): - return SparkRuntime.AURON - if _is_spark_rapids(props): - return SparkRuntime.SPARK_RAPIDS - return SparkRuntime.SPARK -``` - -- [ ] **Step 4: Run the test and confirm it passes** - -Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py -v` -Expected: all tests pass. - -- [ ] **Step 5: Commit** - -```bash -git add user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py \ - user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py -git commit -m "feat(eventlog_detector): classifier mirroring Scala priority" -``` - ---- - -## Task 6: Stream opener (context-managed codec-aware line iterator) - -**Files:** - -- Create: `user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py` -- Test: `user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py` - -- [ ] **Step 1: Write the failing test** - -Create `test_stream.py`: - -```python -# Copyright (c) 2026, NVIDIA CORPORATION. 
-# SPDX-License-Identifier: Apache-2.0 -"""Unit tests for ``eventlog_detector.stream``.""" - -import gzip -from pathlib import Path - -import pytest -import zstandard as zstd - -from spark_rapids_tools.storagelib import CspPath -from spark_rapids_tools.tools.eventlog_detector.stream import _open_event_log_stream -from spark_rapids_tools.tools.eventlog_detector.types import ( - EventLogReadError, - UnsupportedCompressionError, -) - - -SAMPLE_LINES = [ - '{"Event":"SparkListenerLogStart","Spark Version":"3.5.1"}', - '{"Event":"SparkListenerApplicationStart","App ID":"app-1"}', - '{"Event":"SparkListenerEnvironmentUpdate","Spark Properties":{}}', -] - - -def _write_plain(path: Path) -> None: - path.write_text("\n".join(SAMPLE_LINES) + "\n", encoding="utf-8") - - -def _write_gz(path: Path) -> None: - with gzip.open(path, "wt", encoding="utf-8") as fh: - fh.write("\n".join(SAMPLE_LINES) + "\n") - - -def _write_zstd(path: Path) -> None: - cctx = zstd.ZstdCompressor() - raw = ("\n".join(SAMPLE_LINES) + "\n").encode("utf-8") - path.write_bytes(cctx.compress(raw)) - - -@pytest.fixture -def plain_file(tmp_path: Path) -> CspPath: - p = tmp_path / "eventlog.inprogress" - _write_plain(p) - return CspPath(str(p)) - - -@pytest.fixture -def gz_file(tmp_path: Path) -> CspPath: - p = tmp_path / "eventlog.gz" - _write_gz(p) - return CspPath(str(p)) - - -@pytest.fixture -def zstd_file(tmp_path: Path) -> CspPath: - p = tmp_path / "eventlog.zstd" - _write_zstd(p) - return CspPath(str(p)) - - -class TestPlainStream: - def test_yields_all_lines(self, plain_file): - with _open_event_log_stream(plain_file) as lines: - collected = [ln for ln in lines] - assert collected == SAMPLE_LINES - - -class TestGzipStream: - def test_yields_all_lines(self, gz_file): - with _open_event_log_stream(gz_file) as lines: - collected = [ln for ln in lines] - assert collected == SAMPLE_LINES - - -class TestZstdStream: - def test_yields_all_lines(self, zstd_file): - with _open_event_log_stream(zstd_file) as lines: - collected = [ln for ln in lines] - assert collected == SAMPLE_LINES - - def test_zst_short_suffix_also_works(self, tmp_path): - p = tmp_path / "eventlog.zst" - _write_zstd(p) - with _open_event_log_stream(CspPath(str(p))) as lines: - collected = [ln for ln in lines] - assert collected == SAMPLE_LINES - - -class TestUnsupportedCompression: - def test_lz4_raises(self, tmp_path): - p = tmp_path / "eventlog.lz4" - p.write_bytes(b"not-real-lz4") - with pytest.raises(UnsupportedCompressionError): - with _open_event_log_stream(CspPath(str(p))) as _: - pass - - def test_snappy_raises(self, tmp_path): - p = tmp_path / "eventlog.snappy" - p.write_bytes(b"not-real-snappy") - with pytest.raises(UnsupportedCompressionError): - with _open_event_log_stream(CspPath(str(p))) as _: - pass - - def test_lzf_raises(self, tmp_path): - p = tmp_path / "eventlog.lzf" - p.write_bytes(b"not-real-lzf") - with pytest.raises(UnsupportedCompressionError): - with _open_event_log_stream(CspPath(str(p))) as _: - pass - - -class TestIoFailure: - def test_missing_file_raises_read_error(self, tmp_path): - p = tmp_path / "does-not-exist" - with pytest.raises(EventLogReadError): - with _open_event_log_stream(CspPath(str(p))) as lines: - next(iter(lines)) -``` - -- [ ] **Step 2: Run the test and confirm it fails** - -Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py -v` -Expected: `ModuleNotFoundError`. 
- -- [ ] **Step 3: Implement the stream opener** - -Create `user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py`: - -```python -# Copyright (c) 2026, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Codec-aware context-managed line streamer for Spark event logs. - -Opens the file via ``CspPath.open_input_stream()``, applies the right -decompression layer based on extension, wraps in a text decoder, and -yields an ``Iterator[str]``. On exit the context manager closes every -layer in reverse order. Streaming only — no buffering of the full file. -""" - -import contextlib -import gzip -import io -from typing import Iterator - -import zstandard as zstd - -from spark_rapids_tools.storagelib import CspPath -from spark_rapids_tools.tools.eventlog_detector.types import ( - EventLogReadError, - UnsupportedCompressionError, -) - - -_GZIP_SUFFIXES = {".gz"} -_ZSTD_SUFFIXES = {".zstd", ".zst"} -# Suffixes that unambiguously indicate a codec we do not support in V1. -_UNSUPPORTED_CODEC_SUFFIXES = {".lz4", ".lzf", ".snappy"} -# Treated as plain text. -_PLAIN_SUFFIXES = {".inprogress", ""} - - -def _classify_suffix(path: CspPath) -> str: - name = path.base_name().lower() - dot = name.rfind(".") - if dot < 0: - return "" - return name[dot:] - - -@contextlib.contextmanager -def _open_event_log_stream(path: CspPath) -> Iterator[Iterator[str]]: - suffix = _classify_suffix(path) - if suffix in _UNSUPPORTED_CODEC_SUFFIXES: - raise UnsupportedCompressionError( - f"Compression codec '{suffix}' is not supported by the lightweight " - "event log detector. Fall back to the full qualification/profiling " - "pipeline for this log." - ) - - try: - byte_stream = path.open_input_stream() - except Exception as exc: - raise EventLogReadError(f"Failed to open event log {path}: {exc}") from exc - - close_stack = contextlib.ExitStack() - close_stack.callback(byte_stream.close) - try: - if suffix in _GZIP_SUFFIXES: - decompressed: io.IOBase = gzip.GzipFile(fileobj=byte_stream, mode="rb") - close_stack.callback(decompressed.close) - elif suffix in _ZSTD_SUFFIXES: - dctx = zstd.ZstdDecompressor() - # stream_reader supports .read(); we need a readable binary layer - # below TextIOWrapper. read1 emulation is good enough for line iter. - decompressed = dctx.stream_reader(byte_stream) - close_stack.callback(decompressed.close) - elif suffix in _PLAIN_SUFFIXES or suffix not in _UNSUPPORTED_CODEC_SUFFIXES: - # Unknown/empty suffix → best-effort treat as plain text. If the - # file is actually compressed with an unknown codec the scanner - # will simply see garbled lines that don't parse as JSON and be - # skipped; env-update will never be reached and the caller will - # see Route.UNKNOWN. That is the right failure mode here. - decompressed = byte_stream - else: # pragma: no cover — every branch covered above - raise UnsupportedCompressionError(f"Unsupported suffix: {suffix}") - - # Line-at-a-time text iterator over the decompressed stream. 
- text = io.TextIOWrapper(decompressed, encoding="utf-8", errors="replace", newline="") - close_stack.callback(text.close) - - def line_iter() -> Iterator[str]: - for raw in text: - # Strip the trailing newline to match the "one event per line" - # contract. Empty lines are legal and skipped by the caller. - yield raw.rstrip("\r\n") - - try: - yield line_iter() - except Exception as exc: - # Convert any read-time I/O error into a typed domain error. - raise EventLogReadError(f"Error reading event log {path}: {exc}") from exc - finally: - close_stack.close() -``` - -Note on the suffix decision tree: `_PLAIN_SUFFIXES` intentionally includes `""` (no extension) so paths like `eventlog` or `foo/events_1_app-xyz` are read as plain text. - -- [ ] **Step 4: Run the test and confirm it passes** - -Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py -v` -Expected: all tests pass. - -- [ ] **Step 5: Commit** - -```bash -git add user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py \ - user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py -git commit -m "feat(eventlog_detector): context-managed codec-aware line streamer" -``` - ---- - -## Task 7: Path resolver (single file + Databricks rolling dir) - -**Files:** - -- Create: `user_tools/src/spark_rapids_tools/tools/eventlog_detector/resolver.py` -- Test: `user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py` - -- [ ] **Step 1: Write the failing test** - -```python -# Copyright (c) 2026, NVIDIA CORPORATION. -# SPDX-License-Identifier: Apache-2.0 -"""Unit tests for ``eventlog_detector.resolver``.""" - -from datetime import datetime -from pathlib import Path - -import pytest - -from spark_rapids_tools.storagelib import CspPath -from spark_rapids_tools.tools.eventlog_detector.resolver import ( - _parse_databricks_file_datetime, - _resolve_event_log_files, -) -from spark_rapids_tools.tools.eventlog_detector.types import UnsupportedInputError - - -class TestDatabricksDateParse: - def test_bare_eventlog_is_latest_sentinel(self): - # Returns None; caller treats None as "sort last". - assert _parse_databricks_file_datetime("eventlog") is None - - def test_dated_file_parses(self): - dt = _parse_databricks_file_datetime("eventlog-2021-06-14--20-00.gz") - assert dt == datetime(2021, 6, 14, 20, 0) - - def test_dated_file_without_codec_parses(self): - dt = _parse_databricks_file_datetime("eventlog-2022-01-02--03-04") - assert dt == datetime(2022, 1, 2, 3, 4) - - def test_non_eventlog_prefix_returns_none(self): - assert _parse_databricks_file_datetime("application_1234.log") is None - - -class TestResolveSingleFile: - def test_single_file_returns_single_element_list(self, tmp_path: Path): - f = tmp_path / "eventlog.zstd" - f.write_bytes(b"x") - source, files = _resolve_event_log_files(CspPath(str(f))) - assert source == str(f) - assert [p.base_name() for p in files] == ["eventlog.zstd"] - - -class TestResolveDatabricksRollingDir: - def test_orders_earliest_first_and_bare_eventlog_last(self, tmp_path: Path): - d = tmp_path / "dbrolling" - d.mkdir() - (d / "eventlog").write_bytes(b"") - (d / "eventlog-2021-06-14--20-00.gz").write_bytes(b"") - (d / "eventlog-2021-06-14--18-00.gz").write_bytes(b"") - source, files = _resolve_event_log_files(CspPath(str(d))) - assert source == str(d) - names = [p.base_name() for p in files] - # Earliest dated file first; bare `eventlog` sorts last (treated as - # "current/latest" per Scala). 
- assert names == [ - "eventlog-2021-06-14--18-00.gz", - "eventlog-2021-06-14--20-00.gz", - "eventlog", - ] - - def test_dir_with_no_eventlog_prefix_raises(self, tmp_path: Path): - d = tmp_path / "empty" - d.mkdir() - (d / "application_1.log").write_bytes(b"") - with pytest.raises(UnsupportedInputError): - _resolve_event_log_files(CspPath(str(d))) - - def test_empty_dir_raises(self, tmp_path: Path): - d = tmp_path / "blank" - d.mkdir() - with pytest.raises(UnsupportedInputError): - _resolve_event_log_files(CspPath(str(d))) - - -class TestResolveUnsupportedShapes: - def test_spark_native_rolling_dir_raises(self, tmp_path: Path): - d = tmp_path / "eventlog_v2_local-1623876083964" - d.mkdir() - (d / "events_1_local-1623876083964").write_bytes(b"") - with pytest.raises(UnsupportedInputError): - _resolve_event_log_files(CspPath(str(d))) - - def test_generic_multi_app_dir_raises(self, tmp_path: Path): - d = tmp_path / "multi" - d.mkdir() - (d / "app-1.zstd").write_bytes(b"") - (d / "app-2.zstd").write_bytes(b"") - with pytest.raises(UnsupportedInputError): - _resolve_event_log_files(CspPath(str(d))) -``` - -- [ ] **Step 2: Run the test and confirm it fails** - -Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py -v` -Expected: `ModuleNotFoundError`. - -- [ ] **Step 3: Implement the resolver** - -Create `user_tools/src/spark_rapids_tools/tools/eventlog_detector/resolver.py`: - -```python -# Copyright (c) 2026, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Input-path resolution for the event log detector. - -Resolves one user-supplied path into a list of one or more concrete files -to read, in scan order. Supports a single file or a Databricks rolling -directory. Spark-native rolling, generic multi-app directories, wildcards, -and comma lists raise ``UnsupportedInputError`` and are expected to fall -back to the full Scala pipeline. -""" - -import re -from datetime import datetime -from typing import List, Optional, Tuple - -from spark_rapids_tools.storagelib import CspFs, CspPath -from spark_rapids_tools.tools.eventlog_detector import markers as m -from spark_rapids_tools.tools.eventlog_detector.types import UnsupportedInputError - - -_DB_DATE_PATTERN = re.compile(m.DB_EVENT_LOG_DATE_REGEX) - - -def _parse_databricks_file_datetime(name: str) -> Optional[datetime]: - """Parse ``eventlog-YYYY-MM-DD--HH-MM[.codec]`` to a datetime. - - Returns ``None`` for bare ``eventlog`` and any name that does not match - the dated pattern. The caller sorts ``None`` as "latest/current" to - mirror Scala's ``getDBEventLogFileDate`` which defaults to ``now()``. 
- """ - if not name.startswith(m.DB_EVENT_LOG_FILE_PREFIX): - return None - match = _DB_DATE_PATTERN.match(name) - if match is None: - return None - year, month, day, hour, minute = (int(g) for g in match.groups()) - return datetime(year, month, day, hour, minute) - - -def _is_databricks_event_log_filename(name: str) -> bool: - return name.startswith(m.DB_EVENT_LOG_FILE_PREFIX) - - -def _resolve_event_log_files(path: CspPath) -> Tuple[str, List[CspPath]]: - """Resolve ``path`` to an ordered list of files to scan. - - Returns ``(source, files)`` where ``source`` is the original input - rendered as a string (preserved for the ``DetectionResult``) and - ``files`` is the scan order. - """ - source = str(path) - - if path.is_file(): - return source, [path] - - if not path.is_dir(): - raise UnsupportedInputError( - f"Path is neither a file nor a directory: {source}" - ) - - # Directory: must be a Databricks rolling dir. Spark-native rolling - # (eventlog_v2_*) and generic multi-app directories are out of scope. - children = CspFs.list_all_files(path) - db_files = [c for c in children if _is_databricks_event_log_filename(c.base_name())] - if not db_files: - raise UnsupportedInputError( - f"Directory {source} is not a supported input shape. The detector " - "handles single files or Databricks rolling directories only; fall " - "back to the full pipeline for Spark-native rolling, multi-app " - "directories, wildcards, or comma-separated inputs." - ) - - # Sort mirroring DatabricksRollingEventLogFilesFileReader: dated files - # ascending by parsed datetime, bare `eventlog` last (treated as - # "latest/current"). Stable sort on filename first to keep ordering - # deterministic among equal-date files (extremely unlikely in practice - # but cheap insurance for tests). - db_files.sort(key=lambda f: f.base_name()) - db_files.sort( - key=lambda f: ( - _parse_databricks_file_datetime(f.base_name()) or datetime.max, - ) - ) - return source, db_files -``` - -- [ ] **Step 4: Run the test and confirm it passes** - -Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py -v` -Expected: all tests pass. - -- [ ] **Step 5: Commit** - -```bash -git add user_tools/src/spark_rapids_tools/tools/eventlog_detector/resolver.py \ - user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py -git commit -m "feat(eventlog_detector): single-file and Databricks rolling-dir resolver" -``` - ---- - -## Task 8: Event scanner (streaming, bounded, multi-file) - -**Files:** - -- Create: `user_tools/src/spark_rapids_tools/tools/eventlog_detector/scanner.py` -- Test: `user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_scanner.py` - -- [ ] **Step 1: Write the failing test** - -```python -# Copyright (c) 2026, NVIDIA CORPORATION. 
-# SPDX-License-Identifier: Apache-2.0 -"""Unit tests for ``eventlog_detector.scanner``.""" - -from pathlib import Path -from typing import List - -import pytest - -from spark_rapids_tools.storagelib import CspPath -from spark_rapids_tools.tools.eventlog_detector.scanner import ( - _scan_events, - _scan_events_across, -) -from spark_rapids_tools.tools.eventlog_detector.types import ( - SparkRuntime, - Termination, -) - - -# ---------- Line builders ---------- - -def env_update(props: dict) -> str: - import json - - return json.dumps( - { - "Event": "SparkListenerEnvironmentUpdate", - "Spark Properties": props, - "System Properties": {}, - "Classpath Entries": {}, - "JVM Information": {}, - } - ) - - -def log_start(version: str = "3.5.1") -> str: - import json - - return json.dumps({"Event": "SparkListenerLogStart", "Spark Version": version}) - - -def app_start(app_id: str = "app-1", app_name: str = "App") -> str: - import json - - return json.dumps( - { - "Event": "SparkListenerApplicationStart", - "App ID": app_id, - "App Name": app_name, - } - ) - - -def sql_exec_start(modified_configs: dict) -> str: - import json - - return json.dumps( - { - "Event": "org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart", - "executionId": 0, - "description": "x", - "details": "", - "physicalPlanDescription": "", - "sparkPlanInfo": {}, - "time": 0, - "modifiedConfigs": modified_configs, - } - ) - - -# ---------- Tests for _scan_events (single stream) ---------- - -class TestScanEvents: - def test_env_update_with_gpu_is_decisive(self): - lines = iter( - [ - log_start(), - app_start(), - env_update({"spark.plugins": "com.nvidia.spark.SQLPlugin"}), - ] - ) - result = _scan_events(lines, budget=100) - assert result.env_update_seen is True - assert result.app_id == "app-1" - assert result.spark_version == "3.5.1" - assert result.termination is Termination.DECISIVE - - def test_env_update_cpu_then_sql_start_flips_to_gpu(self): - lines = iter( - [ - log_start(), - app_start(), - env_update({"spark.master": "local"}), - sql_exec_start({"spark.plugins": "com.nvidia.spark.SQLPlugin"}), - ] - ) - result = _scan_events(lines, budget=100) - assert result.termination is Termination.DECISIVE - - def test_cpu_only_to_eof_is_exhausted(self): - lines = iter([log_start(), app_start(), env_update({"spark.master": "local"})]) - result = _scan_events(lines, budget=100) - assert result.env_update_seen is True - assert result.termination is Termination.EXHAUSTED - - def test_no_env_update_within_budget_is_cap_hit(self): - # Budget less than the number of events, none of them env-update. - lines = iter([log_start()] * 5) - result = _scan_events(lines, budget=2) - assert result.env_update_seen is False - assert result.termination is Termination.CAP_HIT - - def test_no_env_update_to_eof_is_exhausted_without_env(self): - lines = iter([log_start(), app_start()]) - result = _scan_events(lines, budget=100) - assert result.env_update_seen is False - assert result.termination is Termination.EXHAUSTED - - def test_malformed_json_lines_are_skipped(self): - lines = iter( - [ - "not-json-at-all", - log_start(), - "", - app_start(), - env_update({"spark.master": "local"}), - ] - ) - result = _scan_events(lines, budget=100) - assert result.env_update_seen is True - assert result.app_id == "app-1" - - def test_later_sql_start_overwrites_earlier_property(self): - # last-write-wins merge, matching CacheablePropsHandler.mergeModifiedConfigs. 
- lines = iter( - [ - env_update({"spark.rapids.sql.enabled": "false", "spark.plugins": "com.nvidia.spark.SQLPlugin"}), - sql_exec_start({"spark.rapids.sql.enabled": "true"}), - ] - ) - result = _scan_events(lines, budget=100) - assert result.termination is Termination.DECISIVE - # Final accumulated props reflect the merge. - assert result.spark_properties["spark.rapids.sql.enabled"] == "true" - - -# ---------- Tests for _scan_events_across (multi-file) ---------- - -def _write(path: Path, lines: List[str]) -> CspPath: - path.write_text("\n".join(lines) + "\n", encoding="utf-8") - return CspPath(str(path)) - - -class TestScanEventsAcross: - def test_gpu_signal_in_second_file_is_decisive(self, tmp_path): - f1 = _write( - tmp_path / "eventlog-2021-06-14--18-00", - [log_start(), app_start(), env_update({"spark.master": "local"})], - ) - f2 = _write( - tmp_path / "eventlog-2021-06-14--20-00", - [sql_exec_start({"spark.plugins": "com.nvidia.spark.SQLPlugin"})], - ) - result = _scan_events_across([f1, f2], budget=100) - assert result.termination is Termination.DECISIVE - - def test_shared_budget_applied_across_files(self, tmp_path): - # 3 events in first file, 3 in second. Budget = 4. Second file stops - # after one event, before any GPU signal. - f1 = _write(tmp_path / "a", [log_start(), app_start(), env_update({"spark.master": "local"})]) - f2 = _write( - tmp_path / "b", - [ - sql_exec_start({"spark.master": "still-cpu"}), - sql_exec_start({"spark.plugins": "com.nvidia.spark.SQLPlugin"}), - sql_exec_start({"x": "y"}), - ], - ) - result = _scan_events_across([f1, f2], budget=4) - assert result.termination is Termination.CAP_HIT - - def test_all_files_exhausted_returns_exhausted(self, tmp_path): - f1 = _write(tmp_path / "a", [env_update({"spark.master": "local"})]) - result = _scan_events_across([f1], budget=100) - assert result.termination is Termination.EXHAUSTED -``` - -- [ ] **Step 2: Run the test and confirm it fails** - -Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_scanner.py -v` -Expected: `ModuleNotFoundError`. - -- [ ] **Step 3: Implement the scanner** - -Create `user_tools/src/spark_rapids_tools/tools/eventlog_detector/scanner.py`: - -```python -# Copyright (c) 2026, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Bounded streaming event scanner. - -Walks JSON-per-line event logs under a shared event budget, merges -properties from startup and per-SQL events into a single mutable dict, -and reports early-stop/exhausted/cap-hit termination. Strictly streaming: -no slurping, no accumulating raw events. 
-""" - -import json -from dataclasses import dataclass, field -from typing import Dict, Iterable, Iterator, List, Optional - -from spark_rapids_tools.storagelib import CspPath -from spark_rapids_tools.tools.eventlog_detector import markers as m -from spark_rapids_tools.tools.eventlog_detector.classifier import _classify_runtime -from spark_rapids_tools.tools.eventlog_detector.stream import _open_event_log_stream -from spark_rapids_tools.tools.eventlog_detector.types import SparkRuntime, Termination - - -@dataclass -class _ScanResult: - spark_properties: Dict[str, str] = field(default_factory=dict) - app_id: Optional[str] = None - app_name: Optional[str] = None - spark_version: Optional[str] = None - env_update_seen: bool = False - events_scanned: int = 0 - termination: Termination = Termination.EXHAUSTED - - -def _scan_events( - lines: Iterable[str], - *, - budget: int, - state: Optional[_ScanResult] = None, -) -> _ScanResult: - """Scan one stream of lines, optionally continuing from a prior state. - - Returns the updated ``_ScanResult``. Terminates as soon as classification - turns non-SPARK (``DECISIVE``), or when the budget is exhausted - (``CAP_HIT``), or when ``lines`` is fully consumed (``EXHAUSTED``). - """ - result = state if state is not None else _ScanResult() - - for raw in lines: - if result.events_scanned >= budget: - result.termination = Termination.CAP_HIT - return result - - if not raw: - continue - - try: - event = json.loads(raw) - except (json.JSONDecodeError, ValueError): - # Malformed JSON line (Spark tolerates trailing partials). Count - # it against the budget so a pathological log can't stall us. - result.events_scanned += 1 - continue - - result.events_scanned += 1 - name = event.get("Event") - if name == m.EVENT_LOG_START: - version = event.get("Spark Version") - if isinstance(version, str): - result.spark_version = version - elif name == m.EVENT_APPLICATION_START: - app_id = event.get("App ID") - app_name = event.get("App Name") - if isinstance(app_id, str): - result.app_id = app_id - if isinstance(app_name, str): - result.app_name = app_name - elif name == m.EVENT_ENVIRONMENT_UPDATE: - props = event.get("Spark Properties") or {} - if isinstance(props, dict): - for k, v in props.items(): - if isinstance(k, str) and isinstance(v, str): - result.spark_properties[k] = v - result.env_update_seen = True - if _classify_runtime(result.spark_properties) is not SparkRuntime.SPARK: - result.termination = Termination.DECISIVE - return result - elif name in (m.EVENT_SQL_EXECUTION_START, m.EVENT_SQL_EXECUTION_START_SHORTNAME): - modified = event.get("modifiedConfigs") or {} - if isinstance(modified, dict) and modified: - for k, v in modified.items(): - if isinstance(k, str) and isinstance(v, str): - result.spark_properties[k] = v - if result.env_update_seen and ( - _classify_runtime(result.spark_properties) is not SparkRuntime.SPARK - ): - result.termination = Termination.DECISIVE - return result - - # Fully consumed without early-stop or budget exhaustion. 
- result.termination = Termination.EXHAUSTED - return result - - -def _scan_events_across(files: List[CspPath], *, budget: int) -> _ScanResult: - """Walk ``files`` in order under a single shared ``budget``.""" - state = _ScanResult() - for path in files: - if state.events_scanned >= budget: - state.termination = Termination.CAP_HIT - return state - with _open_event_log_stream(path) as lines: - state = _scan_events(lines, budget=budget, state=state) - if state.termination in (Termination.DECISIVE, Termination.CAP_HIT): - return state - # All files consumed. - state.termination = Termination.EXHAUSTED - return state -``` - -- [ ] **Step 4: Run the test and confirm it passes** - -Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_scanner.py -v` -Expected: all tests pass. - -- [ ] **Step 5: Commit** - -```bash -git add user_tools/src/spark_rapids_tools/tools/eventlog_detector/scanner.py \ - user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_scanner.py -git commit -m "feat(eventlog_detector): bounded streaming scanner across files" -``` - ---- - -## Task 9: Top-level `detect_spark_runtime` - -**Files:** - -- Create: `user_tools/src/spark_rapids_tools/tools/eventlog_detector/detector.py` -- Test: `user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py` - -- [ ] **Step 1: Write the failing integration test** - -```python -# Copyright (c) 2026, NVIDIA CORPORATION. -# SPDX-License-Identifier: Apache-2.0 -"""Integration tests for ``eventlog_detector.detect_spark_runtime``.""" - -import gzip -import json -from pathlib import Path - -import pytest -import zstandard as zstd - -from spark_rapids_tools.storagelib import CspPath -from spark_rapids_tools.tools.eventlog_detector import detect_spark_runtime -from spark_rapids_tools.tools.eventlog_detector.types import ( - Route, - SparkRuntime, - UnsupportedInputError, -) - - -def env_update(props: dict) -> dict: - return { - "Event": "SparkListenerEnvironmentUpdate", - "Spark Properties": props, - "System Properties": {}, - "Classpath Entries": {}, - "JVM Information": {}, - } - - -def _write_plain_log(path: Path, events: list) -> None: - path.write_text( - "\n".join(json.dumps(e) for e in events) + "\n", encoding="utf-8" - ) - - -class TestAcceptsStringPath: - def test_str_input_resolves(self, tmp_path): - log = tmp_path / "eventlog" - _write_plain_log( - log, - [ - {"Event": "SparkListenerLogStart", "Spark Version": "3.5.1"}, - {"Event": "SparkListenerApplicationStart", "App ID": "a", "App Name": "A"}, - env_update({"spark.master": "local"}), - ], - ) - result = detect_spark_runtime(str(log)) - assert result.route is Route.QUALIFICATION - assert result.spark_runtime is SparkRuntime.SPARK - - -class TestGpuLog: - def test_env_update_with_plugin_classifies_as_profiling(self, tmp_path): - log = tmp_path / "eventlog" - _write_plain_log( - log, - [ - {"Event": "SparkListenerLogStart", "Spark Version": "3.5.1"}, - {"Event": "SparkListenerApplicationStart", "App ID": "g", "App Name": "G"}, - env_update({"spark.plugins": "com.nvidia.spark.SQLPlugin"}), - ], - ) - result = detect_spark_runtime(CspPath(str(log))) - assert result.route is Route.PROFILING - assert result.spark_runtime is SparkRuntime.SPARK_RAPIDS - assert result.app_id == "g" - assert result.spark_version == "3.5.1" - - -class TestCapHit: - def test_no_env_update_before_cap_is_unknown(self, tmp_path): - log = tmp_path / "eventlog" - # Many LogStart events, no env-update. Cap hits first. 
- _write_plain_log( - log, - [{"Event": "SparkListenerLogStart", "Spark Version": "3.5.1"}] * 10, - ) - result = detect_spark_runtime(str(log), max_events_scanned=5) - assert result.route is Route.UNKNOWN - assert result.spark_runtime is None - assert "no decisive signal" in result.reason - - -class TestDatabricksRolling: - def test_gpu_in_later_rolled_file(self, tmp_path): - d = tmp_path / "dbrolling" - d.mkdir() - _write_plain_log( - d / "eventlog-2021-06-14--18-00", - [ - {"Event": "SparkListenerLogStart", "Spark Version": "3.5.1"}, - {"Event": "SparkListenerApplicationStart", "App ID": "d", "App Name": "D"}, - env_update({"spark.master": "local"}), - ], - ) - _write_plain_log( - d / "eventlog", - [ - { - "Event": "org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart", - "executionId": 0, - "description": "", - "details": "", - "physicalPlanDescription": "", - "sparkPlanInfo": {}, - "time": 0, - "modifiedConfigs": {"spark.plugins": "com.nvidia.spark.SQLPlugin"}, - } - ], - ) - result = detect_spark_runtime(CspPath(str(d))) - assert result.route is Route.PROFILING - assert result.spark_runtime is SparkRuntime.SPARK_RAPIDS - - -class TestUnsupportedInput: - def test_spark_native_rolling_dir_raises(self, tmp_path): - d = tmp_path / "eventlog_v2_local-1623876083964" - d.mkdir() - (d / "events_1_local-1623876083964").write_bytes(b"") - with pytest.raises(UnsupportedInputError): - detect_spark_runtime(CspPath(str(d))) - - -class TestReasonStrings: - def test_reason_mentions_runtime_on_profiling(self, tmp_path): - log = tmp_path / "eventlog" - _write_plain_log( - log, - [ - env_update({"spark.plugins": "com.nvidia.spark.SQLPlugin"}), - ], - ) - result = detect_spark_runtime(str(log)) - assert "SPARK_RAPIDS" in result.reason - - def test_reason_mentions_full_log_on_qualification(self, tmp_path): - log = tmp_path / "eventlog" - _write_plain_log(log, [env_update({"spark.master": "local"})]) - result = detect_spark_runtime(str(log)) - assert result.route is Route.QUALIFICATION - assert "walked full log" in result.reason -``` - -- [ ] **Step 2: Run the test and confirm it fails** - -Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py -v` -Expected: `ModuleNotFoundError` (detector module missing). - -- [ ] **Step 3: Implement the detector** - -Create `user_tools/src/spark_rapids_tools/tools/eventlog_detector/detector.py`: - -```python -# Copyright (c) 2026, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Top-level event log runtime detector. - -``detect_spark_runtime(path)`` returns a ``DetectionResult`` carrying the -routing decision and best-effort metadata. On inconclusive input, returns -``Route.UNKNOWN`` rather than raising; callers fall back to the full -Scala pipeline in that case. 
-""" - -from typing import Optional, Union - -from spark_rapids_tools.storagelib import CspPath -from spark_rapids_tools.tools.eventlog_detector.classifier import _classify_runtime -from spark_rapids_tools.tools.eventlog_detector.resolver import _resolve_event_log_files -from spark_rapids_tools.tools.eventlog_detector.scanner import _scan_events_across -from spark_rapids_tools.tools.eventlog_detector.types import ( - DetectionResult, - Route, - SparkRuntime, - Termination, -) - - -_GPU_FAMILY = frozenset({SparkRuntime.SPARK_RAPIDS, SparkRuntime.PHOTON, SparkRuntime.AURON}) - - -def detect_spark_runtime( - event_log: Union[str, CspPath], - *, - max_events_scanned: int = 500, -) -> DetectionResult: - """Classify a single-app event log into a routing decision. - - Returns ``DetectionResult`` with: - - * ``route`` = ``PROFILING`` for any decisive non-SPARK classification, - * ``QUALIFICATION`` only after the scanner walked the full log with - no GPU-family signal, - * ``UNKNOWN`` when the event budget was hit first or - ``SparkListenerEnvironmentUpdate`` was never seen. - - ``max_events_scanned`` caps CPU/IO cost; large CPU logs routinely end - as ``UNKNOWN`` at the cap. Raise the cap at the call site to trade - cost for decisiveness. - """ - path = event_log if isinstance(event_log, CspPath) else CspPath(str(event_log)) - source, files = _resolve_event_log_files(path) - - scan = _scan_events_across(files, budget=max_events_scanned) - - # Classify from whatever we accumulated. - runtime: Optional[SparkRuntime] - if scan.env_update_seen: - runtime = _classify_runtime(scan.spark_properties) - else: - runtime = None - - # Apply the asymmetric decision rule. - if runtime in _GPU_FAMILY: - route = Route.PROFILING - reason = f"decisive: classified as {runtime.value}" - elif scan.termination is Termination.EXHAUSTED and scan.env_update_seen: - route = Route.QUALIFICATION - reason = "walked full log, no GPU-family signal" - else: - route = Route.UNKNOWN - reason = ( - "no decisive signal within bounded scan" - if scan.env_update_seen - else "no SparkListenerEnvironmentUpdate reached" - ) - - resolved_path = str(files[0]) if files else source - return DetectionResult( - route=route, - spark_runtime=runtime, - app_id=scan.app_id, - spark_version=scan.spark_version, - event_log_path=resolved_path, - source_path=source, - reason=reason, - ) -``` - -- [ ] **Step 4: Run the tests** - -Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py -v` -Expected: all tests pass. - -- [ ] **Step 5: Sanity-check the package-level re-exports** - -Run: `python -c "from spark_rapids_tools.tools.eventlog_detector import detect_spark_runtime, Route, DetectionResult; print('ok')"` -Expected: prints `ok`. 
- -- [ ] **Step 6: Commit** - -```bash -git add user_tools/src/spark_rapids_tools/tools/eventlog_detector/detector.py \ - user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py -git commit -m "feat(eventlog_detector): top-level detect_spark_runtime entry point" -``` - ---- - -## Task 10: Fixture tests against existing Scala event logs - -**Files:** - -- Create: `user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector_fixtures.py` - -- [ ] **Step 1: Confirm the fixtures exist and are readable** - -Run these sanity checks: - -```bash -ls core/src/test/resources/spark-events-profiling/eventlog-gpu-dsv2.zstd -ls core/src/test/resources/spark-events-profiling/eventlog_dsv2.zstd -ls core/src/test/resources/spark-events-qualification/eventlog_same_app_id_1.zstd -``` - -Expected: all three paths print. If any is missing, add it to the `pytest.mark.skip` decorator in the test below and note it in Section 12 of the spec (open items). - -- [ ] **Step 2: Write the fixture tests** - -Create `test_detector_fixtures.py`: - -```python -# Copyright (c) 2026, NVIDIA CORPORATION. -# SPDX-License-Identifier: Apache-2.0 -"""Anchor tests against fixtures already shipped in the Scala core. - -These are not a full parity sweep. They catch regressions on a small -curated set covering each decisive route. -""" - -from pathlib import Path - -import pytest - -from spark_rapids_tools.tools.eventlog_detector import detect_spark_runtime -from spark_rapids_tools.tools.eventlog_detector.types import Route, SparkRuntime - - -REPO_ROOT = Path(__file__).resolve().parents[5] -CORE_FIXTURES = REPO_ROOT / "core" / "src" / "test" / "resources" - - -@pytest.mark.parametrize( - "relative_path,expected_route,expected_runtime", - [ - ( - "spark-events-profiling/eventlog-gpu-dsv2.zstd", - Route.PROFILING, - SparkRuntime.SPARK_RAPIDS, - ), - ( - "spark-events-profiling/eventlog_dsv2.zstd", - Route.QUALIFICATION, - SparkRuntime.SPARK, - ), - ( - "spark-events-qualification/eventlog_same_app_id_1.zstd", - Route.QUALIFICATION, - SparkRuntime.SPARK, - ), - ], -) -def test_detector_matches_expected_route_on_scala_fixture( - relative_path: str, expected_route: Route, expected_runtime: SparkRuntime -) -> None: - fixture = CORE_FIXTURES / relative_path - if not fixture.exists(): - pytest.skip(f"fixture not available: {fixture}") - # Fixtures are ~small; a generous budget keeps this test decisive. - result = detect_spark_runtime(str(fixture), max_events_scanned=5000) - assert result.route is expected_route, result.reason - assert result.spark_runtime is expected_runtime, result.reason -``` - -- [ ] **Step 3: Run the fixture tests** - -Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector_fixtures.py -v` -Expected: all cases pass (or skip with a clear reason if a fixture is missing). - -- [ ] **Step 4: Commit** - -```bash -git add user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector_fixtures.py -git commit -m "test(eventlog_detector): anchor parity tests on Scala fixtures" -``` - ---- - -## Task 11: Run the full test suite and linter - -**Files:** none modified. - -- [ ] **Step 1: Run every detector test** - -Run: `pytest user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/ -v` -Expected: all tests pass (ignore tests that skipped with "fixture not available"). 
- -- [ ] **Step 2: Run the full user_tools test suite to catch unintended regressions** - -Run: `pytest user_tools/tests/spark_rapids_tools_ut/ -x` -Expected: same pass/fail baseline as before this branch (nothing that was green turns red). - -- [ ] **Step 3: Run the project's linter against the new files** - -Run (adjust if the project uses a different runner): - -``` -pylint user_tools/src/spark_rapids_tools/tools/eventlog_detector/ \ - user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/ -``` - -Expected: no new errors or warnings above the project's existing baseline. If the project pins a specific pylint config in `pyproject.toml` or `.pylintrc`, read that file first and rerun with the matching options. If `pylint` isn't configured here, substitute the tool the repo actually uses (check `pyproject.toml` or `user_tools/dev/` for hooks). - -- [ ] **Step 4: Commit any docstring/type-hint fixups surfaced by the linter** - -If fixups were needed, stage only the files the linter touched and commit: - -```bash -git add -u -git commit -m "chore(eventlog_detector): resolve lint findings from full-suite run" -``` - -If no changes were required, skip the commit. - ---- - -## Task 12: Update plan doc with the realized inventory - -**Files:** - -- Modify: `docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md` - -- [ ] **Step 1: Replace the placeholder in Section 12 (Open items)** - -The spec currently says: - -``` -- Parity-test fixture inventory: enumerate every file under `core/src/test/resources/spark-events-*` and record the expected `SparkRuntime` label, derived from existing Scala test expectations. The plan step owns this list. -``` - -Replace with the actual list used in Task 10: - -``` -- Parity-test fixture inventory (realized): - - `spark-events-profiling/eventlog-gpu-dsv2.zstd` → `PROFILING` / `SPARK_RAPIDS` - - `spark-events-profiling/eventlog_dsv2.zstd` → `QUALIFICATION` / `SPARK` - - `spark-events-qualification/eventlog_same_app_id_1.zstd` → `QUALIFICATION` / `SPARK` -``` - -Use Edit with the old text as `old_string`. - -- [ ] **Step 2: Commit** - -```bash -git add docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md -git commit -m "docs(spec): record realized fixture inventory for event log detector" -``` - ---- - -## Self-Review - -**Spec coverage:** - -- Sections 1–4 (problem, goal, non-goals, consumers) → no code; covered by the plan's Goal/Architecture header. -- Section 5 (public API) → Tasks 2, 3, 9 (re-exports, types, entry point). -- Section 6 (input shapes, codecs) → Tasks 6, 7. -- Section 7.1 (resolver) → Task 7. -- Section 7.2 (stream opener) → Task 6. -- Section 7.3 (scanner + termination) → Task 8. -- Section 7.4 (classifier) → Task 5. -- Section 7.5 (top-level flow + asymmetric rule) → Task 9. -- Section 7.6 (markers) → Task 4. -- Section 8 (classification rules) → Task 5 (classifier) + Task 4 (markers). -- Section 9 (error model) → Task 3 (types) + all tasks that raise. -- Section 10 (testing) → every task's test plus Task 10 (fixture anchor) and Task 11 (full-suite run). -- Section 11 (rollout) → this plan produces one PR. -- Section 12 open item (`zstandard` RELEASE.md note) → not addressed; treat as a follow-up for the release engineer. Section 12 open item (fixture inventory) → Task 12. -- Sections 13 (evolution) → doc-only, no code. -- Memory contract (section 7 header) → Task 6 (no `read()`/`readlines()`; streaming only) + Task 8 (no raw-event accumulation). -- Decision rule (section 2) → Task 9. 
- -**Placeholder scan:** every step shows exact paths, full file contents for new files, and specific commands. No "TODO", "fill in later", "similar to Task N", or bare "add appropriate handling" phrases. - -**Type/signature consistency:** - -- `Route`, `SparkRuntime`, `Termination`, `DetectionResult`, and the exception classes are defined in Task 3 and used verbatim in Tasks 5, 6, 7, 8, 9, 10 — same names and field orders throughout. -- `_resolve_event_log_files` (plural) is defined in Task 7 and called in Task 9. -- `_scan_events` / `_scan_events_across` signatures match between Task 8's implementation and Task 9's caller. -- `_open_event_log_stream` is a context manager in Task 6; Tasks 8 and 9 use it with `with`. -- `_ScanResult` fields (`spark_properties`, `app_id`, `app_name`, `spark_version`, `env_update_seen`, `events_scanned`, `termination`) are consistent between Task 8 and the read-sites in Task 9. - -No gaps found. \ No newline at end of file From 3871a14cd3295b5fc9a36e43bd054155856bfe09 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 19:03:54 -0700 Subject: [PATCH 24/29] docs: remove local design spec from the PR The design spec was a development artifact and does not belong in the published change set. Module- and function-level docstrings describe the detector contract in the code itself. Signed-off-by: Sayed Bilal Bari --- ...-04-22-eventlog-runtime-detector-design.md | 288 ------------------ 1 file changed, 288 deletions(-) delete mode 100644 docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md diff --git a/docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md b/docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md deleted file mode 100644 index e41863bae..000000000 --- a/docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md +++ /dev/null @@ -1,288 +0,0 @@ -# Lightweight Event Log Runtime Detector — Design - -- **Issue:** [NVIDIA/spark-rapids-tools#2082](https://github.com/NVIDIA/spark-rapids-tools/issues/2082) -- **Branch:** `sbari-issue-2082` -- **Date:** 2026-04-22 -- **Status:** Design approved, ready for implementation plan - -## 1. Problem - -Determining whether a Spark event log came from a CPU or GPU run currently requires running the full qualification or profiling tool. That is expensive, and it is the wrong way to make the pre-flight decision of *which* tool to run. - -aether (a consumer of spark-rapids-tools) needs that decision per job. Its current workaround is running one tool, reading `application_information.csv`'s `sparkRuntime`, and picking the right tool next — meaning every routing decision costs a full qualification run. - -## 2. Goal - -Provide a lightweight Python function that reads a bounded prefix of a Spark event log and returns a **routing decision** (`QUALIFICATION`, `PROFILING`, or `UNKNOWN`) plus best-effort metadata (`spark_runtime`, `app_id`, `spark_version`). - -**This is best-effort early routing, not exact Scala parity.** On inconclusive input, the caller falls back to the full tool. - -**Asymmetric decision rule:** positive GPU evidence is decisive; absence of evidence in a bounded prefix is not. Concretely: - -- Decisive non-SPARK signal seen → `PROFILING`. -- No signal seen, but the scanner walked the whole log (or the whole ordered file list in a rolling dir) to EOF → `QUALIFICATION`. -- No signal seen and the `max_events_scanned` cap is hit first → `UNKNOWN`. 
- -This protects against the one dangerous failure mode (routing a GPU log to qualification output that gets fed into pipelines expecting CPU results) by never saying `QUALIFICATION` on truncated evidence. - -## 3. Non-goals (V1) - -- Replacing the Scala event log reader. Multi-app directories, wildcards, comma lists, malformed logs, the full codec set, and most CSP quirks stay in Scala. -- Exact classification parity with Scala. Scala can adjust runtime state from events far into the log (job-level plugin re-evaluation, SQL-level `modifiedConfigs`, per-SQL merges). The Python detector only looks at a bounded prefix; the caller must be comfortable with `UNKNOWN` for genuinely ambiguous inputs. -- Public user-facing CLI. Dev/library scope only. -- Writing any output files. Result is a Python object returned in-process. - -## 4. Consumers - -- **Primary:** aether (`aether-services`). aether already consumes `spark_rapids_tools` as a library. It will import `detect_spark_runtime` and branch on the returned `Route` before deciding to run qualification or profiling. On `Route.UNKNOWN` or any raised error, aether falls back to running qualification (its current default). -- **Secondary:** internal scripting / debugging. Devs can `python -c "..."` to inspect a log. - -## 5. Public API - -Module: `user_tools/src/spark_rapids_tools/tools/eventlog_detector.py`. - -```python -from enum import Enum -from dataclasses import dataclass -from typing import Optional, Union -from spark_rapids_tools.storagelib.csppath import CspPath - -class Route(str, Enum): - QUALIFICATION = "QUALIFICATION" # CPU log → run qualification - PROFILING = "PROFILING" # GPU/Photon/Auron log → run profiling - UNKNOWN = "UNKNOWN" # not enough signal; caller falls back - -class SparkRuntime(str, Enum): - SPARK = "SPARK" - SPARK_RAPIDS = "SPARK_RAPIDS" - PHOTON = "PHOTON" - AURON = "AURON" - -@dataclass(frozen=True) -class DetectionResult: - route: Route - spark_runtime: Optional[SparkRuntime] # best-effort; may be None when route is UNKNOWN - app_id: Optional[str] - spark_version: Optional[str] - event_log_path: str # concrete file actually read - source_path: str # caller input (preserves rolling-dir input) - reason: str # short human-readable explanation - -def detect_spark_runtime( - event_log: Union[str, CspPath], - *, - max_events_scanned: int = 500, -) -> DetectionResult: ... -``` - -`SparkRuntime` string values match the Scala enum so aether's existing `JobRun.spark_runtime` column stays compatible. - -## 6. Scope of input shapes (V1) - -| Input shape | Supported | Notes | -| --- | --- | --- | -| Single event log file (plain or compressed) | ✅ | Primary case | -| Databricks rolling dir (`eventlog` + optional `eventlog-` files) | ✅ | Required by aether — see 7.1 for the selection rule | -| Spark native rolling dir (`eventlog_v2_*`) | ❌ | Out of scope for V1. Raises `UnsupportedInputError`. Caller falls back. | -| Generic directory of independent logs | ❌ | Same | -| Wildcard path | ❌ | Same | -| Comma-separated list | ❌ | Same | - -**Supported compression codecs:** plain, `.gz`, `.zstd`/`.zst`. Anything else (`.lz4`, `.lzf`, `.snappy`, unknown) raises `UnsupportedCompressionError` → caller falls back. - -`zstandard` becomes a required dependency (existing test fixtures use `.zstd`; the common Spark default). No new optional extras. - -## 7. Module structure - -Four small, independently testable components inside `eventlog_detector.py` (plus a markers file). 
- -**Memory contract (applies to every component below):** - -- No full-file reads. No `read()`, no `readlines()`, no slurping a log into memory. All I/O is line-at-a-time streaming via `_open_event_log_stream`'s iterator. -- No accumulation of raw events. Each parsed JSON line is inspected, the relevant fields are merged into state, and the line is discarded before moving on. -- Retained state per invocation is bounded and small: a handful of scalars (`app_id`, `app_name`, `spark_version`, `env_update_seen`, a running event counter, a termination enum) plus one mutable `spark_properties: dict[str, str]` that grows only with env-update values and later `modifiedConfigs` merges. Nothing else is held across iterations. -- The scanner must never buffer the full list of seen events; it walks, updates state, and moves on. - -This is what the "lightweight" claim actually rests on. Any implementation change that accumulates per-event data must be reviewed against this contract. - -### 7.1 `_resolve_event_log_files(path) -> (source, ordered_files)` - -Path resolver. Turns user input into an ordered list of one or more concrete files to read. - -- File input → return `[file]`. -- Directory input → Databricks-rolling shape only: - - Use `CspFs.list_all_files(dir_path)` to list children. - - Recognize Databricks files by the same prefix Scala uses in `EventLogPathProcessor.isDBEventLogFile` (`DB_EVENT_LOG_FILE_NAME_PREFIX = "eventlog"`). - - Sort them exactly like Scala's `DatabricksRollingEventLogFilesFileReader` (`core/src/main/scala/com/nvidia/spark/rapids/tool/EventLogPathProcessor.scala:458-478, 496-500`): parse `LocalDateTime` from the `eventlog-YYYY-MM-DD--HH-MM[.codec]` pattern; bare `eventlog` (no `--date`) is treated as the latest and sorted last. - - Return the full ordered list. The scanner walks them in order under one shared event budget; earliest file carries startup events, but GPU markers that appear via later `SQLExecutionStart.modifiedConfigs` can live in subsequent files. - - If the directory contains no Databricks-pattern files → raise `UnsupportedInputError`. -- Anything else (Spark-native rolling, generic multi-app dir, wildcard, comma list) → raise `UnsupportedInputError`. -- Pattern matching only — no file reads for the shape decision. - -### 7.2 `_open_event_log_stream(resolved_path)` — context manager yielding `Iterator[str]` - -Stream opener. Context manager (implemented with `@contextlib.contextmanager`) that opens the file with the right codec, yields an iterator of decoded text lines, and closes the underlying stream on exit. - -Usage shape: - -```python -with _open_event_log_stream(resolved_path) as lines: - for line in lines: - ... -``` - -- Codec chosen by extension: plain / `.inprogress` / `.gz` / `.zstd` / `.zst`. Anything else → `UnsupportedCompressionError`. -- Cloud paths use `CspPath.open_input_stream()` (returns a closable byte stream); the byte stream is wrapped by the codec reader, then by a text decoder (`io.TextIOWrapper` or equivalent) yielding lines. The context-manager wrapper owns closing all three layers. - -### 7.3 `_scan_events(lines, max_events) -> _ScanResult` - -Event scanner. Parses lines as JSON and accumulates classification-relevant properties. 
- -**Events consumed:** - -| Event | What it contributes | -| --- | --- | -| `SparkListenerLogStart` | Spark version | -| `SparkListenerApplicationStart` | appId, appName | -| `SparkListenerEnvironmentUpdate` | Seeds `spark_properties` from its `Spark Properties` section | -| `SparkListenerSQLExecutionStart` | Merges `modifiedConfigs` into `spark_properties` (last-write-wins) — covers logs where `spark.plugins` / `spark.rapids.sql.enabled` are set per-SQL rather than at startup | - -We intentionally do not track `SparkListenerJobStart` job-level properties. That would catch the DB plugin's job-level re-evaluation path, but that is a narrow Scala case and pushing further into the log moves us away from "lightweight." If a log is truly Databricks-only-detectable at job-start time, the caller's fallback path handles it. - -**Stop conditions and termination mode:** the scanner walks the ordered file list (one file for a plain input, multiple for a Databricks rolling dir) under a single shared budget `max_events_scanned`. - -- Early-stop (`Termination.DECISIVE`): as soon as `_classify_runtime(spark_properties)` returns anything other than `SPARK`. The signal is decisive; plugins are sticky-true in Scala (once set, stay set — `AppPropPlugTrait:66-68`). -- Walked-to-end (`Termination.EXHAUSTED`): the final file's EOF is reached before the cap. We have seen the entire log. -- Cap-reached (`Termination.CAP_HIT`): the cap hit before exhausting the files. -- `max_events_scanned` default `500`. Startup events land in the first ~20; the rest is headroom for the first few `SQLExecutionStart` merges and any tail files. - -**What the cap is actually for:** `max_events_scanned` is the primary protection against CPU-time and I/O blowups on large logs, not just a tie-breaker for ambiguity. Big CPU logs will routinely hit the cap before EOF and therefore terminate as `CAP_HIT`, which maps to `Route.UNKNOWN`. That is intentional — the detector refuses to speculate, and the caller falls back to the full tool. Users who want to convert more of their `UNKNOWN`s to `QUALIFICATION` can raise the cap at the call site, accepting the proportional increase in cost. - -**Malformed input:** lines that aren't valid JSON are skipped. - -**Returned state:** `(spark_properties, app_id, app_name, spark_version, env_update_seen, termination)` where `termination` is one of the three modes above. - -### 7.4 `_classify_runtime(spark_properties) -> SparkRuntime` - -Pure function over the accumulated properties dict. See section 8 for rules. - -### 7.5 Top-level flow - -``` -detect_spark_runtime(path): - source, ordered_files = _resolve_event_log_files(path) - scan = _scan_events_across(ordered_files, max_events_scanned) - - runtime = _classify_runtime(scan.spark_properties) if scan.env_update_seen else None - - # Decision rule (asymmetric — see section 2): - if runtime in {SPARK_RAPIDS, PHOTON, AURON}: - route, reason = PROFILING, f"decisive: classified as {runtime.value}" - elif scan.termination == EXHAUSTED and scan.env_update_seen: - route, reason = QUALIFICATION, "walked full log, no GPU signal" - else: - # CAP_HIT, or env-update never seen. Do not promote absence to CPU. - route, reason = UNKNOWN, "no decisive signal within bounded scan" - - return DetectionResult(route=route, spark_runtime=runtime, ..., reason=reason) -``` - -`_scan_events_across(ordered_files, budget)` is the thin wrapper that opens each file (via `_open_event_log_stream`) and feeds its lines into `_scan_events` while tracking the remaining global budget. 
It stops and returns as soon as the scanner reports `DECISIVE`, or when the budget is exhausted, or when the last file's EOF is reached. - -### 7.6 `eventlog_detector_markers.py` - -Single source of truth for keys/regex/substrings. Every constant carries a `# Scala source: :` comment. - -## 8. Classification rules - -Mapped from the same Scala sources. Priority order in Python: **PHOTON > AURON > SPARK_RAPIDS > SPARK**. - -**SPARK_RAPIDS** — from `ToolUtils.isPluginEnabled` (`core/src/main/scala/org/apache/spark/sql/rapids/tool/ToolUtils.scala:114-121`): - -- `spark.plugins` contains substring `com.nvidia.spark.SQLPlugin` -- AND `spark.rapids.sql.enabled` parses as boolean true (default true if missing/unparseable) - -**AURON** — from `AuronParseHelper.eval` (`core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/auron/AuronParseHelper.scala:149-172`): - -- `spark.sql.extensions` fullmatches `.*AuronSparkSessionExtension.*` -- AND `spark.auron.enabled` trimmed equals `"true"` case-insensitively (default `"true"`) - -**PHOTON** — combined precondition + marker: - -- Databricks precondition (`DBConditionImpl.eval`, `core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/db/DBPlugin.scala:45-58`): all three of `spark.databricks.clusterUsageTags.clusterAllTags`, `.clusterId`, `.clusterName` are non-empty. -- AND any Photon marker (`PhotonParseHelper`, `core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/db/DatabricksParseHelper.scala:146-151`) fullmatches: - - `spark.databricks.clusterUsageTags.sparkVersion` ~ `.*-photon-.*` - - `spark.databricks.clusterUsageTags.effectiveSparkVersion` ~ `.*-photon-.*` - - `spark.databricks.clusterUsageTags.sparkImageLabel` ~ `.*-photon-.*` - - `spark.databricks.clusterUsageTags.runtimeEngine` ~ `PHOTON` - -**SPARK** — none of the above matched on the accumulated properties. - -Python uses `re.fullmatch` (matches Scala's `String.matches` semantics). - -## 9. Error model - -All errors subclass `EventLogDetectionError`: - -| Exception | Meaning | aether action | -| --- | --- | --- | -| `UnsupportedInputError` | Input shape not supported (Spark-native rolling, multi-app dir, wildcard, comma list, empty dir) | Fall back to running the full tool | -| `UnsupportedCompressionError` | Codec outside the supported set | Fall back | -| `EventLogReadError` | I/O failure (wraps underlying error) | Fall back | - -**Note:** "scanner never saw env-update" is **not** an exception — it's a `DetectionResult` with `route=UNKNOWN`. This keeps the caller's happy path free of exception handling for the common "inconclusive log" case. - -## 10. Testing - -### 10.1 Unit tests — `tests/spark_rapids_tools_ut/tools/test_eventlog_detector.py` - -- Path resolver: plain file → single-element list; Databricks rolling dir (multi-file with dated + bare `eventlog`) → ordered list with earliest first and bare `eventlog` last; Spark-native rolling dir raises; multi-app dir raises; wildcard raises. -- Stream opener: plain / gz / zstd each works and closes on exit; `.lz4`/`.snappy` raises. -- Event scanner: env-update only → classifies from it; env-update + later SQLExecutionStart that sets `spark.plugins` → classification updates to `SPARK_RAPIDS` and terminates DECISIVE; no env-update within cap → termination `CAP_HIT`; full-log scan with no GPU signal → termination `EXHAUSTED`; malformed JSON lines skipped. -- Multi-file scan: GPU marker in a later Databricks-rolling file → picked up under the shared budget; budget exhausted across files → `CAP_HIT`. 
-- Classifier: each of the four runtime outcomes, priority when multiple markers coexist, `spark.rapids.sql.enabled=false` override. -- Routing rule: DECISIVE + non-SPARK → `PROFILING`; EXHAUSTED with env-update + SPARK → `QUALIFICATION`; CAP_HIT → `UNKNOWN`; env-update never seen → `UNKNOWN`. - -### 10.2 Fixture tests — `tests/spark_rapids_tools_ut/tools/test_eventlog_detector_fixtures.py` - -Runs `detect_spark_runtime` against a small curated set of existing fixtures under `core/src/test/resources/spark-events-*`: - -- `spark-events-profiling/eventlog-gpu-dsv2.zstd` → `Route.PROFILING`, `SPARK_RAPIDS` -- `spark-events-profiling/eventlog_dsv2.zstd` → `Route.QUALIFICATION`, `SPARK` -- `spark-events-qualification/eventlog_same_app_id_1.zstd` → `Route.QUALIFICATION`, `SPARK` - -Fixture inventory above is the realized set used by `test_detector_fixtures.py`. Databricks-rolling-dir coverage is exercised via synthesized fixtures in `test_detector.py` rather than a pre-existing Scala fixture, since `core/src/test/resources/` has no Databricks-rolling-shape fixture we can repurpose here. - -Not a full parity sweep — just anchor points to catch regressions. - -## 11. Rollout - -- Single PR, `[FEA]` tag, references issue #2082. -- Additive only — no breaking changes. -- aether integration lands as a separate PR in `aether-services` after this merges. - -## 12. Open items for implementation plan - -- Whether adding `zstandard` as a hard dep needs a `RELEASE.md` note. - -## 13. Evolution - -This spec was reshaped once after review feedback. Earlier drafts attempted: - -- Full scan-scope parity with Scala (including `SparkListenerJobStart.properties` job-level plugin re-evaluation) — dropped. Documented as a known divergence case; caller handles it via fallback on `Route.UNKNOWN`. -- Spark-native rolling-dir support — dropped from V1. Aether's primary input is single files or Databricks rolling dirs. -- `.lz4` / `.lzf` / `.snappy` codec support with a new `[compression]` extra — dropped. Added scope without matching a real need. -- 4-way `SparkRuntime` return as the primary contract — kept as auxiliary metadata; primary contract is now the `Route` enum (the actual decision the caller makes). - -The narrower V1 keeps the detector honest about what it is: a best-effort fast path that gets out of the way when the log doesn't give it enough signal. - -**Fourth review pass (2026-04-22):** - -- **Explicit memory contract** (section 7). Stated up front that the detector is strictly streaming with no full-file reads, no raw-event accumulation, and a bounded per-invocation state (a few scalars plus one mutable `spark_properties` dict). Closes the door on a well-meaning implementation drifting into `read()` / `readlines()` / full-log buffering. -- **Cap framing** (section 7.3). `max_events_scanned` is documented as the primary cost cap, not just an ambiguity tie-breaker. Large CPU logs intentionally end as `UNKNOWN` at cap, which is expected behavior — callers that want a higher conversion to `QUALIFICATION` can raise the cap and accept the cost. - -**Third review pass (2026-04-22):** - -- **Asymmetric decision rule** (sections 2, 7.5). Previously the spec promoted "no GPU signal in prefix" to `QUALIFICATION`. Under Scala's late-promotion paths (`SQLExecutionStart.modifiedConfigs`, job-level plugin re-eval), that is unsafe. The rule now requires either a decisive GPU signal (→ `PROFILING`) or a fully-walked log with no GPU signal (→ `QUALIFICATION`). Cap-hit returns `UNKNOWN`. 
-- **Databricks rolling dir scans the full ordered list** (section 7.1, 7.5). Picking only the earliest file contradicted the scanner's expansion to handle `modifiedConfigs` (which can land in later rolled files). The resolver now returns the ordered file list and the scanner walks it under one shared event budget. -- **Stream opener is a context manager** (section 7.2). Previous signature said `Iterator[str]` while the top-level flow used it with `with`. Clarified as a `@contextmanager` that yields the iterator and owns closing the underlying streams. \ No newline at end of file From 4c1de131b9d608330345daa8a5008852e70c6338 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 19:15:12 -0700 Subject: [PATCH 25/29] docs(eventlog_detector): trim docstrings and drop internal references Tighten module/function docstrings and inline comments across the detector package: - Drop references to internal planning docs and consumer-specific projects. - Shorten verbose narrative in module docstrings to the essentials. - Keep the Scala source pointers in markers.py (they are the actionable pin-points for keeping the two sides in sync). - Collapse long "why we do X" comments where the surrounding code already makes the intent obvious. Signed-off-by: Sayed Bilal Bari --- .../tools/eventlog_detector/__init__.py | 9 ++- .../tools/eventlog_detector/classifier.py | 19 +++---- .../tools/eventlog_detector/detector.py | 30 ++++------ .../tools/eventlog_detector/markers.py | 55 ++++++------------- .../tools/eventlog_detector/resolver.py | 44 +++++++-------- .../tools/eventlog_detector/scanner.py | 19 +++---- .../tools/eventlog_detector/stream.py | 52 +++++++----------- .../tools/eventlog_detector/types.py | 19 ++++--- 8 files changed, 98 insertions(+), 149 deletions(-) diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/__init__.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/__init__.py index 619f03bd3..c8a3ab1d0 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/__init__.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/__init__.py @@ -14,12 +14,11 @@ """Lightweight event log runtime detector. -Public entry point: ``detect_spark_runtime``. Returns a ``DetectionResult`` -carrying a routing decision (``QUALIFICATION`` / ``PROFILING`` / ``UNKNOWN``) -and best-effort metadata, by scanning a bounded prefix of a Spark event log. +Scans a bounded prefix of a Spark event log and returns a routing +decision (``QUALIFICATION`` / ``PROFILING`` / ``UNKNOWN``) plus +best-effort runtime metadata, without invoking the full tools pipeline. -See docs/superpowers/specs/2026-04-22-eventlog-runtime-detector-design.md -for the full contract and the Scala sources this mirrors. +Public entry point: :func:`detect_spark_runtime`. """ from .detector import detect_spark_runtime diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py index 1bb0b9e48..c0745650b 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py @@ -12,13 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Pure-function runtime classifier. +"""Runtime classifier. -``_classify_runtime`` maps a merged Spark properties dict to a -``SparkRuntime``. 
Priority order when multiple markers are present: -``PHOTON > AURON > SPARK_RAPIDS > SPARK``. This is a deterministic Python -choice; Scala's plugin iteration order is undefined when multiple -plugins claim a runtime, but in practice markers do not overlap. +Maps a merged Spark properties dict to a :class:`SparkRuntime`. When +multiple markers are present the priority is +``PHOTON > AURON > SPARK_RAPIDS > SPARK``. """ import re @@ -29,12 +27,11 @@ def _parse_bool(raw: str, default: bool) -> bool: - """Mirror Scala's ``Try { s.toBoolean }.getOrElse(default)``. + """Parse ``"true"``/``"false"`` case-insensitively; anything else returns ``default``. - Scala's ``String.toBoolean`` accepts only ``"true"``/``"false"`` - case-insensitively. Everything else (including ``"yes"``, ``"no"``, - ``"1"``, ``"0"``) falls back to ``default`` because the Scala call - would throw ``IllegalArgumentException``. + Matches Scala's ``Try { s.toBoolean }.getOrElse(default)`` — Scala's + ``String.toBoolean`` only accepts the two literals and throws on + everything else. """ stripped = raw.strip().lower() if stripped == "true": diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/detector.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/detector.py index faba18cb2..51cd4294e 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/detector.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/detector.py @@ -12,13 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Top-level event log runtime detector. - -``detect_spark_runtime(path)`` returns a ``DetectionResult`` carrying the -routing decision and best-effort metadata. On inconclusive input, returns -``Route.UNKNOWN`` rather than raising; callers fall back to the full -Scala pipeline in that case. -""" +"""Top-level event log runtime detector.""" from typing import Optional, Union @@ -44,20 +38,18 @@ def detect_spark_runtime( ) -> DetectionResult: """Classify a single-app event log into a routing decision. - Returns ``DetectionResult`` with: - - * ``route`` = ``PROFILING`` for any decisive non-SPARK classification, - * ``QUALIFICATION`` only after the scanner walked the full log with - no GPU-family signal, - * ``UNKNOWN`` when the event budget was hit first or - ``SparkListenerEnvironmentUpdate`` was never seen. + Returns a :class:`DetectionResult` whose ``route`` is ``PROFILING`` on + a decisive non-SPARK classification, ``QUALIFICATION`` only after the + scanner walked the full log without seeing a GPU-family signal, and + ``UNKNOWN`` otherwise (e.g., the budget was hit first or the log never + emitted ``SparkListenerEnvironmentUpdate``). - ``max_events_scanned`` caps CPU/IO cost; large CPU logs routinely end - as ``UNKNOWN`` at the cap. Raise the cap at the call site to trade - cost for decisiveness. + ``max_events_scanned`` caps CPU/IO cost. Large CPU logs routinely end + as ``UNKNOWN`` at the cap; raise it at the call site to trade cost + for decisiveness. """ - # Preserve the original user-supplied string in the result's source_path - # so callers see their input back unchanged (including cloud URI schemes). + # Keep the caller's input verbatim in source_path (cloud URI schemes + # would otherwise be stripped by CspPath normalisation). 
source_path = event_log if isinstance(event_log, str) else str(event_log) path = event_log if isinstance(event_log, CspPath) else CspPath(str(event_log)) _, files = _resolve_event_log_files(path) diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/markers.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/markers.py index bae35423f..80e0f8a58 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/markers.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/markers.py @@ -12,51 +12,39 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Single source of truth for runtime-detection markers. +"""Property keys, regexes, and constants used by the runtime classifier. -Each constant below is pinned to a specific Scala source location. If -the Scala rule changes, update the constant here and the parity tests -under ``tests/spark_rapids_tools_ut/tools/eventlog_detector/`` will -catch any drift on a re-run. +Each block carries a Scala source reference so the two implementations +can be kept in sync when the Scala detection rules change. """ from typing import Mapping, Tuple -# --------------------------------------------------------------------------- -# SPARK_RAPIDS (GPU) markers -# Scala source: core/src/main/scala/org/apache/spark/sql/rapids/tool/ToolUtils.scala:114-121 -# --------------------------------------------------------------------------- +# GPU (SPARK_RAPIDS) markers. +# Scala: org/apache/spark/sql/rapids/tool/ToolUtils.scala :: isPluginEnabled GPU_PLUGIN_KEY: str = "spark.plugins" GPU_PLUGIN_CLASS_SUBSTRING: str = "com.nvidia.spark.SQLPlugin" GPU_ENABLED_KEY: str = "spark.rapids.sql.enabled" -# Default when GPU_ENABLED_KEY is missing or unparseable as bool. Matches -# Scala ``Try { ... }.getOrElse(true)`` in isPluginEnabled. +# Defaults to true when missing or unparseable. GPU_ENABLED_DEFAULT: bool = True -# --------------------------------------------------------------------------- -# AURON markers -# Scala source: core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/auron/AuronParseHelper.scala:149-172 -# --------------------------------------------------------------------------- +# AURON markers. +# Scala: com/nvidia/spark/rapids/tool/planparser/auron/AuronParseHelper.scala AURON_SPARK_EXTENSIONS_KEY: str = "spark.sql.extensions" AURON_EXTENSION_REGEX: str = r".*AuronSparkSessionExtension.*" AURON_ENABLED_KEY: str = "spark.auron.enabled" AURON_ENABLED_DEFAULT: str = "true" -# --------------------------------------------------------------------------- -# Databricks precondition (all three keys must be non-empty) -# Scala source: core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/db/DBPlugin.scala:45-58 -# and DatabricksParseHelper.scala:188-190 -# --------------------------------------------------------------------------- +# Databricks precondition — all three keys must be non-empty. 
+# Scala: com/nvidia/spark/rapids/tool/planparser/db/DBPlugin.scala :: DBConditionImpl DB_PRECONDITION_KEYS: Tuple[str, str, str] = ( "spark.databricks.clusterUsageTags.clusterAllTags", "spark.databricks.clusterUsageTags.clusterId", "spark.databricks.clusterUsageTags.clusterName", ) -# --------------------------------------------------------------------------- -# PHOTON markers (any one fullmatches once Databricks precondition holds) -# Scala source: core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/db/DatabricksParseHelper.scala:146-151 -# --------------------------------------------------------------------------- +# Photon markers — any one fullmatches once the Databricks precondition holds. +# Scala: com/nvidia/spark/rapids/tool/planparser/db/DatabricksParseHelper.scala :: PhotonParseHelper PHOTON_MARKER_REGEX: Mapping[str, str] = { "spark.databricks.clusterUsageTags.sparkVersion": r".*-photon-.*", "spark.databricks.clusterUsageTags.effectiveSparkVersion": r".*-photon-.*", @@ -64,26 +52,19 @@ "spark.databricks.clusterUsageTags.runtimeEngine": r"PHOTON", } -# --------------------------------------------------------------------------- -# Databricks rolling event-log file layout -# Scala source: core/src/main/scala/com/nvidia/spark/rapids/tool/EventLogPathProcessor.scala:57 -# and :458-478 (date parse in getDBEventLogFileDate) -# --------------------------------------------------------------------------- +# Databricks rolling event-log file layout. +# Scala: com/nvidia/spark/rapids/tool/EventLogPathProcessor.scala :: getDBEventLogFileDate DB_EVENT_LOG_FILE_PREFIX: str = "eventlog" -# Matches the dated form ``eventlog-YYYY-MM-DD--HH-MM[.codec]`` used by -# ``DatabricksRollingEventLogFilesFileReader``. Bare ``eventlog`` has no -# match and is treated as "latest" (sorted last) by the resolver. +# ``eventlog-YYYY-MM-DD--HH-MM[.codec]``. Bare ``eventlog`` does not match +# and is treated as the latest chunk (sorted last) by the resolver. DB_EVENT_LOG_DATE_REGEX: str = ( r"^eventlog-(\d{4})-(\d{2})-(\d{2})--(\d{2})-(\d{2})(?:\.[A-Za-z0-9]+)?$" ) -# --------------------------------------------------------------------------- -# Supported Spark listener event names -# --------------------------------------------------------------------------- +# Spark listener event names consumed by the scanner. EVENT_LOG_START: str = "SparkListenerLogStart" EVENT_APPLICATION_START: str = "SparkListenerApplicationStart" EVENT_ENVIRONMENT_UPDATE: str = "SparkListenerEnvironmentUpdate" EVENT_SQL_EXECUTION_START: str = "org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart" -# Spark's actual SQLExecutionStart class name carries a package prefix in -# event logs. The unqualified shortname is sometimes used in test fixtures. +# Unqualified shortname; sometimes used in test fixtures. EVENT_SQL_EXECUTION_START_SHORTNAME: str = "SparkListenerSQLExecutionStart" diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/resolver.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/resolver.py index f5f448f46..2d72f8ff8 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/resolver.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/resolver.py @@ -12,13 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Input-path resolution for the event log detector. +"""Input-path resolver. -Resolves one user-supplied path into a list of one or more concrete files -to read, in scan order. 
Supports a single file or a Databricks rolling -directory. Spark-native rolling, generic multi-app directories, wildcards, -and comma lists raise ``UnsupportedInputError`` and are expected to fall -back to the full Scala pipeline. +Resolves a user-supplied path into an ordered list of concrete files to +scan. Supports a single file or a Databricks rolling directory; any +other shape raises :class:`UnsupportedInputError` so the caller can +fall back to the full tools pipeline. """ import re @@ -34,11 +33,12 @@ def _parse_databricks_file_datetime(name: str) -> Optional[datetime]: - """Parse ``eventlog-YYYY-MM-DD--HH-MM[.codec]`` to a datetime. + """Parse a Databricks rolled filename to its embedded datetime. - Returns ``None`` for bare ``eventlog`` and any name that does not match - the dated pattern. The caller sorts ``None`` as "latest/current" to - mirror Scala's ``getDBEventLogFileDate`` which defaults to ``now()``. + Returns ``None`` for bare ``eventlog`` (the current/latest chunk) and + for any name that does not match the dated pattern; callers sort + ``None`` last to mirror Scala's ``getDBEventLogFileDate`` which + defaults the bare file to ``now()``. """ if not name.startswith(m.DB_EVENT_LOG_FILE_PREFIX): return None @@ -56,9 +56,8 @@ def _is_databricks_event_log_filename(name: str) -> bool: def _resolve_event_log_files(path: CspPath) -> Tuple[str, List[CspPath]]: """Resolve ``path`` to an ordered list of files to scan. - Returns ``(source, files)`` where ``source`` is the original input - rendered as a string (preserved for the ``DetectionResult``) and - ``files`` is the scan order. + Returns ``(source, files)`` where ``source`` is the stripped string + form of the input and ``files`` is the scan order. """ source = path.no_scheme @@ -70,23 +69,20 @@ def _resolve_event_log_files(path: CspPath) -> Tuple[str, List[CspPath]]: f"Path is neither a file nor a directory: {source}" ) - # Directory: must be a Databricks rolling dir. Spark-native rolling - # (eventlog_v2_*) and generic multi-app directories are out of scope. + # Only Databricks-style rolling directories are supported here; + # Spark-native (eventlog_v2_*) and multi-app directories are not. children = CspFs.list_all_files(path) db_files = [c for c in children if _is_databricks_event_log_filename(c.base_name())] if not db_files: raise UnsupportedInputError( - f"Directory {source} is not a supported input shape. The detector " - "handles single files or Databricks rolling directories only; fall " - "back to the full pipeline for Spark-native rolling, multi-app " - "directories, wildcards, or comma-separated inputs." + f"Directory {source} is not a supported input shape. Only single " + "files and Databricks rolling directories are handled here; use " + "the full pipeline for other shapes." ) - # Sort mirroring DatabricksRollingEventLogFilesFileReader: dated files - # ascending by parsed datetime, bare `eventlog` last (treated as - # "latest/current"). Stable sort on filename first to keep ordering - # deterministic among equal-date files (extremely unlikely in practice - # but cheap insurance for tests). + # Dated files ascend by embedded timestamp; bare `eventlog` sorts + # last, matching DatabricksRollingEventLogFilesFileReader. The first + # sort keeps equal-date files in a deterministic order. 
db_files.sort(key=lambda f: f.base_name()) db_files.sort( key=lambda f: ( diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/scanner.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/scanner.py index 80c07a2d7..e14102e81 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/scanner.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/scanner.py @@ -14,10 +14,9 @@ """Bounded streaming event scanner. -Walks JSON-per-line event logs under a shared event budget, merges -properties from startup and per-SQL events into a single mutable dict, -and reports early-stop/exhausted/cap-hit termination. Strictly streaming: -no slurping, no accumulating raw events. +Walks JSON-per-line event logs under a shared event budget, folding the +relevant startup and per-SQL properties into a single mutable dict so +the classifier can decide as soon as a decisive signal is seen. """ import json @@ -51,9 +50,9 @@ def _scan_events( ) -> _ScanResult: """Scan one stream of lines, optionally continuing from a prior state. - Returns the updated ``_ScanResult``. Terminates as soon as classification - turns non-SPARK (``DECISIVE``), or when the budget is exhausted - (``CAP_HIT``), or when ``lines`` is fully consumed (``EXHAUSTED``). + Terminates as ``DECISIVE`` on the first non-SPARK classification, + ``CAP_HIT`` when ``budget`` is exhausted, or ``EXHAUSTED`` when the + iterator runs out. """ result = state if state is not None else _ScanResult() @@ -68,8 +67,8 @@ def _scan_events( try: event = json.loads(raw) except (json.JSONDecodeError, ValueError): - # Malformed JSON line (Spark tolerates trailing partials). Count - # it against the budget so a pathological log can't stall us. + # Tolerate trailing partial lines in live logs; count them so + # a pathological log can't keep us scanning forever. result.events_scanned += 1 continue @@ -108,7 +107,6 @@ def _scan_events( result.termination = Termination.DECISIVE return result - # Fully consumed without early-stop or budget exhaustion. result.termination = Termination.EXHAUSTED return result @@ -125,6 +123,5 @@ def _scan_events_across(files: List[CspPath], *, budget: int) -> _ScanResult: state = _scan_events(lines, budget=budget, state=state) if state.termination in (Termination.DECISIVE, Termination.CAP_HIT): return state - # All files consumed. state.termination = Termination.EXHAUSTED return state diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py index f400a058c..9f4db706e 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py @@ -14,17 +14,14 @@ """Codec-aware context-managed line streamer for Spark event logs. -Opens the file via ``CspPath.open_input_stream()``, applies the right -decompression layer based on extension, wraps in a text decoder, and -yields an ``Iterator[str]``. On exit the context manager closes every -layer in reverse order. Streaming only — no buffering of the full file. - -PyArrow coupling: ``CspPath.open_input_stream()`` delegates to PyArrow's -filesystem API, which auto-detects and decompresses ``.gz`` and ``.zst`` -files transparently. ``.zstd`` is not recognised by PyArrow, so this -module decompresses it manually via ``zstandard``. If a future PyArrow -release changes its codec detection, this suffix mapping must be -re-verified. 
+Opens the file through ``CspPath.open_input_stream()``, wraps it with +the right decompression and text layers, and yields an +``Iterator[str]``. Streaming only — the full file is never buffered. + +``CspPath.open_input_stream()`` delegates to PyArrow, which auto-detects +and decompresses ``.gz`` and ``.zst`` transparently. ``.zstd`` is not +recognised by PyArrow, so we decompress it manually via ``zstandard``. +Revisit this mapping if the upstream codec detection changes. """ import contextlib @@ -40,19 +37,14 @@ ) -# PyArrow's ``open_input_stream()`` transparently decompresses files whose -# extension matches a codec it recognises (including ``.gz`` and ``.zst``). -# For these we can read the already-decompressed byte stream directly. +# Suffixes PyArrow already decompresses for us. _PYARROW_AUTO_DECOMP_SUFFIXES = {".gz", ".zst"} -# PyArrow does NOT recognise ``.zstd`` as a codec suffix, so the byte stream -# is raw compressed data that we must decompress ourselves. +# Suffixes we must decompress manually via zstandard. _ZSTD_MANUAL_SUFFIXES = {".zstd"} -# Suffixes treated as plain text (no decompression needed, no scheme check). +# Suffixes treated as plain text. _PLAIN_SUFFIXES = {"", ".inprogress"} -# Full whitelist of suffixes the detector accepts. Anything else raises -# ``UnsupportedCompressionError`` — including explicitly-bad codecs like -# ``.lz4`` / ``.lzf`` / ``.snappy`` and any unknown suffix we might -# otherwise fall through as plain text. +# Whitelist of accepted suffixes; anything else raises +# ``UnsupportedCompressionError``. _SUPPORTED_SUFFIXES = ( _PYARROW_AUTO_DECOMP_SUFFIXES | _ZSTD_MANUAL_SUFFIXES | _PLAIN_SUFFIXES ) @@ -71,10 +63,8 @@ def _open_event_log_stream(path: CspPath) -> Iterator[Iterator[str]]: suffix = _classify_suffix(path) if suffix not in _SUPPORTED_SUFFIXES: raise UnsupportedCompressionError( - f"File suffix '{suffix}' is not supported by the lightweight " - "event log detector. Supported: plain, .inprogress, .gz, " - ".zstd, .zst. Fall back to the full qualification/profiling " - "pipeline for this log." + f"File suffix '{suffix}' is not supported. " + "Supported: plain, .inprogress, .gz, .zstd, .zst." ) try: @@ -86,30 +76,26 @@ def _open_event_log_stream(path: CspPath) -> Iterator[Iterator[str]]: close_stack.callback(byte_stream.close) try: if suffix in _ZSTD_MANUAL_SUFFIXES: - # PyArrow does not recognise ``.zstd``, so the byte stream holds - # raw compressed frames — decompress them with the zstandard library. + # Decompress ``.zstd`` ourselves; PyArrow does not handle it. dctx = zstd.ZstdDecompressor() decompressed: io.RawIOBase = dctx.stream_reader(byte_stream) close_stack.callback(decompressed.close) else: - # Plain or PyArrow auto-decompressed (.gz/.zst/.inprogress/no - # extension). Pass the byte stream straight through — PyArrow - # has already handled decompression where needed. + # Plain text, or already decompressed by PyArrow. decompressed = byte_stream text = io.TextIOWrapper(decompressed, encoding="utf-8", errors="replace", newline="") close_stack.callback(text.close) def line_iter() -> Iterator[str]: + # One event per line; strip the trailing newline and leave + # empty lines for the caller to skip. for raw in text: - # Strip the trailing newline to match the "one event per line" - # contract. Empty lines are legal and skipped by the caller. yield raw.rstrip("\r\n") try: yield line_iter() except Exception as exc: - # Convert any read-time I/O error into a typed domain error. 
raise EventLogReadError(f"Error reading event log {path}: {exc}") from exc finally: close_stack.close() diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/types.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/types.py index e6befad17..818249fa5 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/types.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/types.py @@ -30,8 +30,9 @@ class Route(str, Enum): class SparkRuntime(str, Enum): """Runtime taxonomy. - Values match ``org.apache.spark.sql.rapids.tool.util.SparkRuntime`` in the - Scala core so aether's ``JobRun.spark_runtime`` column stays compatible. + Values mirror ``org.apache.spark.sql.rapids.tool.util.SparkRuntime`` + in the Scala core so string comparisons against existing pipelines + keep working. """ SPARK = "SPARK" @@ -41,7 +42,7 @@ class SparkRuntime(str, Enum): class Termination(Enum): - """How the scanner stopped. Used internally by the detector flow.""" + """How the scanner stopped.""" DECISIVE = "DECISIVE" # classification returned non-SPARK EXHAUSTED = "EXHAUSTED" # walked every file to EOF under the budget @@ -50,10 +51,10 @@ class Termination(Enum): @dataclass(frozen=True) class DetectionResult: - """Result returned by ``detect_spark_runtime``. + """Result returned by :func:`detect_spark_runtime`. - ``spark_runtime`` is best-effort metadata. ``None`` is valid (e.g., when - ``route`` is ``UNKNOWN`` because env-update was never seen). + ``spark_runtime`` is best-effort metadata and may be ``None`` when + ``route`` is ``UNKNOWN``. """ route: Route @@ -70,12 +71,12 @@ class EventLogDetectionError(Exception): class UnsupportedInputError(EventLogDetectionError): - """Input shape is outside V1 scope (multi-app dir, wildcard, comma list, ...).""" + """Input shape is not handled (multi-app dir, wildcard, comma list, ...).""" class UnsupportedCompressionError(EventLogDetectionError): - """File uses a compression codec the V1 detector does not handle.""" + """File uses a compression codec the detector does not handle.""" class EventLogReadError(EventLogDetectionError): - """Wraps an underlying I/O failure when reading the event log.""" + """Wraps an I/O failure while reading the event log.""" From be0ac3d02582b29f9dc222d0e5d934b7576db46e Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Wed, 22 Apr 2026 19:43:47 -0700 Subject: [PATCH 26/29] fix(eventlog_detector): address review findings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - resolver: invalid date components (e.g. month=13) in a Databricks rolled filename now return None instead of letting ValueError escape as a non-domain exception. The bad name sorts last alongside bare `eventlog`. - stream: narrow the context-manager's yield-time exception handler to OSError so caller-side logic errors propagate untouched instead of being reclassified as EventLogReadError. - classifier: compile the Auron extension regex with re.DOTALL so a newline-separated spark.sql.extensions value still matches. The fourth Greptile finding (Auron enabled-flag should use _parse_bool) is intentionally not applied: Scala's AuronParseHelper.isAuronTurnedOn uses equalsIgnoreCase("true") — strict string equality — not Try { toBoolean }.getOrElse(true). Keeping the Python check strict preserves Scala parity. 
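For reference, a minimal sketch of the divergence this choice preserves (the
spark.auron.enabled value "yes" below is a hypothetical config value, not one
taken from a fixture; _parse_bool is the helper defined in classifier.py):

    # Illustrative comparison only, not part of the change:
    _parse_bool("yes", default=True)   # True:  Try { toBoolean }.getOrElse(true) semantics
    "yes".strip().lower() == "true"    # False: equalsIgnoreCase("true") semantics, as shipped

Switching the Auron flag to _parse_bool would therefore report AURON as enabled
for values the Scala tools treat as disabled.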
Signed-off-by: Sayed Bilal Bari --- .../tools/eventlog_detector/classifier.py | 6 +++++- .../tools/eventlog_detector/resolver.py | 7 ++++++- .../tools/eventlog_detector/stream.py | 4 +++- .../tools/eventlog_detector/test_classifier.py | 11 +++++++++++ .../tools/eventlog_detector/test_resolver.py | 6 ++++++ .../tools/eventlog_detector/test_stream.py | 10 ++++++++++ 6 files changed, 41 insertions(+), 3 deletions(-) diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py index c0745650b..b1fe23963 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py @@ -53,7 +53,11 @@ def _is_spark_rapids(props: Mapping[str, str]) -> bool: def _is_auron(props: Mapping[str, str]) -> bool: extensions = props.get(m.AURON_SPARK_EXTENSIONS_KEY) - if extensions is None or not re.fullmatch(m.AURON_EXTENSION_REGEX, extensions): + # ``re.DOTALL`` so the ``.*`` anchors also match newlines, in case a + # multi-entry ``spark.sql.extensions`` is newline-separated. + if extensions is None or not re.fullmatch( + m.AURON_EXTENSION_REGEX, extensions, re.DOTALL + ): return False enabled_raw = props.get(m.AURON_ENABLED_KEY, m.AURON_ENABLED_DEFAULT) return enabled_raw.strip().lower() == m.AURON_ENABLED_DEFAULT diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/resolver.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/resolver.py index 2d72f8ff8..8231a9bfb 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/resolver.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/resolver.py @@ -46,7 +46,12 @@ def _parse_databricks_file_datetime(name: str) -> Optional[datetime]: if match is None: return None year, month, day, hour, minute = (int(g) for g in match.groups()) - return datetime(year, month, day, hour, minute) + try: + return datetime(year, month, day, hour, minute) + except ValueError: + # Components are syntactically 2 digits but out of range + # (e.g. month=13). Treat as unparseable — sorts last. + return None def _is_databricks_event_log_filename(name: str) -> bool: diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py index 9f4db706e..eaac96d3c 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py @@ -95,7 +95,9 @@ def line_iter() -> Iterator[str]: try: yield line_iter() - except Exception as exc: + except OSError as exc: + # Only reclassify real I/O failures; let caller-side logic + # errors bubble up untouched. 
raise EventLogReadError(f"Error reading event log {path}: {exc}") from exc finally: close_stack.close() diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py index ac4e7ab05..d2f826f2d 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py @@ -78,6 +78,17 @@ def test_extension_and_default_enabled(self): props = {"spark.sql.extensions": "com.bytedance.auron.AuronSparkSessionExtension"} assert _classify_runtime(props) is SparkRuntime.AURON + def test_extension_with_embedded_newlines(self): + # Some pipelines concatenate spark.sql.extensions with newlines; + # the regex must still match. + props = { + "spark.sql.extensions": ( + "com.example.SomeOtherExtension\n" + "com.bytedance.auron.AuronSparkSessionExtension" + ), + } + assert _classify_runtime(props) is SparkRuntime.AURON + def test_extension_and_enabled_false_demotes_to_spark(self): props = { "spark.sql.extensions": "com.bytedance.auron.AuronSparkSessionExtension", diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py index 41d665fe4..8479adda6 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py @@ -46,6 +46,12 @@ def test_dated_file_without_codec_parses(self): def test_non_eventlog_prefix_returns_none(self): assert _parse_databricks_file_datetime("application_1234.log") is None + def test_out_of_range_components_return_none(self): + # month=13 is syntactically 2 digits but not a valid month; + # datetime() would raise ValueError, we want None instead. + assert _parse_databricks_file_datetime("eventlog-2021-13-01--00-00") is None + assert _parse_databricks_file_datetime("eventlog-2021-02-30--25-99") is None + class TestResolveSingleFile: """Test resolving a single event log file.""" diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py index dbcaf94b2..098c611e3 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py @@ -146,3 +146,13 @@ def test_missing_file_raises_read_error(self, tmp_path): with pytest.raises(EventLogReadError): with _open_event_log_stream(CspPath(str(p))) as lines: next(iter(lines)) + + def test_caller_side_exception_is_not_reclassified(self, plain_file): # pylint: disable=redefined-outer-name + # Caller-raised exceptions must propagate untouched, not be + # reclassified as EventLogReadError. 
+ class _MarkerError(RuntimeError): + pass + + with pytest.raises(_MarkerError): + with _open_event_log_stream(plain_file): + raise _MarkerError("not an I/O failure") From 3b140f1c7fb76db3994a4ae5162ac88fb2141eb2 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Fri, 24 Apr 2026 16:35:32 -0700 Subject: [PATCH 27/29] feat: streamline event log runtime detection Signed-off-by: Sayed Bilal Bari --- .../tools/eventlog_detector/README.md | 57 ++++++ .../tools/eventlog_detector/__init__.py | 10 +- .../tools/eventlog_detector/classifier.py | 58 +++---- .../tools/eventlog_detector/detector.py | 51 +++--- .../tools/eventlog_detector/markers.py | 45 ++--- .../tools/eventlog_detector/resolver.py | 72 +++----- .../tools/eventlog_detector/scanner.py | 45 ++++- .../tools/eventlog_detector/stream.py | 3 +- .../tools/eventlog_detector/types.py | 11 +- .../eventlog_detector/test_classifier.py | 111 +++--------- .../tools/eventlog_detector/test_detector.py | 162 ++++++++++++------ .../test_detector_fixtures.py | 18 +- .../tools/eventlog_detector/test_markers.py | 73 +------- .../tools/eventlog_detector/test_resolver.py | 81 ++++----- .../tools/eventlog_detector/test_scanner.py | 101 +++++++++-- .../tools/eventlog_detector/test_types.py | 45 ++--- 16 files changed, 489 insertions(+), 454 deletions(-) create mode 100644 user_tools/src/spark_rapids_tools/tools/eventlog_detector/README.md diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/README.md b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/README.md new file mode 100644 index 000000000..3a17b61c3 --- /dev/null +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/README.md @@ -0,0 +1,57 @@ +# Event Log Runtime Detector + +This package provides a lightweight Python detector for deciding which full +tools flow should handle a single Spark application event log. + +The detector is an early routing check. It scans a bounded prefix of an event +log, stops as soon as it has enough information, and returns one of: + +- `PROFILING`: a RAPIDS runtime signal was found. +- `QUALIFICATION`: startup properties indicate standard OSS Spark with no + RAPIDS markers. +- `UNKNOWN`: the scan did not reach enough information within the event budget. + +## Detection Flow + +1. `resolver.py` resolves the input into ordered event-log files. + Supported inputs are a single event-log file or an Apache Spark rolling + event-log directory using the `eventlog_v2_*` / `events_*` layout. +2. `stream.py` opens each file through `CspPath.open_input_stream()` and yields + one decoded event-log line at a time. The full log is not loaded into memory. +3. `scanner.py` parses events until a decision is available or the + `max_events_scanned` budget is reached. +4. `classifier.py` classifies the accumulated Spark properties as `SPARK` or + `SPARK_RAPIDS`. +5. `detector.py` maps the scan result to `ToolExecution`. + +## RAPIDS Detection + +RAPIDS logs are detected from either of these signals: + +- `SparkRapidsBuildInfoEvent`, emitted by RAPIDS plugin event logs. +- Spark properties showing `spark.plugins` contains + `com.nvidia.spark.SQLPlugin` and `spark.rapids.sql.enabled` is not `false`. + +The `spark.rapids.sql.enabled` parse matches the Scala tools behavior: +missing or unparseable values default to `true`. + +## CPU Fast Path + +When `SparkListenerEnvironmentUpdate` is reached and startup Spark properties +contain no RAPIDS-related configuration, the detector can return +`QUALIFICATION` immediately. 
This applies to both single-file and OSS rolling +event logs. + +If RAPIDS-related configuration is present but not decisive, the scanner keeps +reading within the configured event budget. This avoids treating a log as CPU +when later `modifiedConfigs` may make the RAPIDS configuration active. + +## Streaming And Memory + +The detector streams one line at a time. Memory is bounded to: + +- a small set of runtime metadata fields, +- the accumulated Spark properties map, +- the current decoded event record. + +It does not retain raw events or read entire event-log files into memory. diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/__init__.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/__init__.py index c8a3ab1d0..014a8ed64 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/__init__.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/__init__.py @@ -14,9 +14,9 @@ """Lightweight event log runtime detector. -Scans a bounded prefix of a Spark event log and returns a routing -decision (``QUALIFICATION`` / ``PROFILING`` / ``UNKNOWN``) plus -best-effort runtime metadata, without invoking the full tools pipeline. +Scans a bounded prefix of a Spark event log and returns a tool execution +decision (``QUALIFICATION`` / ``PROFILING`` / ``UNKNOWN``) plus best-effort +runtime metadata, without invoking the full tools pipeline. Public entry point: :func:`detect_spark_runtime`. """ @@ -26,8 +26,8 @@ DetectionResult, EventLogDetectionError, EventLogReadError, - Route, SparkRuntime, + ToolExecution, UnsupportedCompressionError, UnsupportedInputError, ) @@ -36,8 +36,8 @@ "DetectionResult", "EventLogDetectionError", "EventLogReadError", - "Route", "SparkRuntime", + "ToolExecution", "UnsupportedCompressionError", "UnsupportedInputError", "detect_spark_runtime", diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py index b1fe23963..4751dbb1c 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py @@ -12,14 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Runtime classifier. +"""Classify Spark runtime from accumulated Spark properties. -Maps a merged Spark properties dict to a :class:`SparkRuntime`. When -multiple markers are present the priority is -``PHOTON > AURON > SPARK_RAPIDS > SPARK``. +The scanner extracts Spark properties from event-log records and passes the +merged map to this module. This module only answers whether those properties +represent standard Spark or a RAPIDS-enabled application. """ -import re from typing import Mapping from spark_rapids_tools.tools.eventlog_detector import markers as m @@ -27,11 +26,12 @@ def _parse_bool(raw: str, default: bool) -> bool: - """Parse ``"true"``/``"false"`` case-insensitively; anything else returns ``default``. + """Parse Spark boolean strings with Scala-compatible fallback behavior. - Matches Scala's ``Try { s.toBoolean }.getOrElse(default)`` — Scala's - ``String.toBoolean`` only accepts the two literals and throws on - everything else. + Scala's ``String.toBoolean`` accepts only ``true`` and ``false``. The + Scala tools wrap that parse in ``Try(...).getOrElse(default)``, so values + such as ``yes``, ``1``, or an empty string must return ``default`` rather + than using Python truthiness. 
""" stripped = raw.strip().lower() if stripped == "true": @@ -42,6 +42,7 @@ def _parse_bool(raw: str, default: bool) -> bool: def _is_spark_rapids(props: Mapping[str, str]) -> bool: + """Return true when Spark properties show the RAPIDS SQL plugin is active.""" plugins = props.get(m.GPU_PLUGIN_KEY, "") if m.GPU_PLUGIN_CLASS_SUBSTRING not in plugins: return False @@ -51,38 +52,21 @@ def _is_spark_rapids(props: Mapping[str, str]) -> bool: return _parse_bool(raw, default=m.GPU_ENABLED_DEFAULT) -def _is_auron(props: Mapping[str, str]) -> bool: - extensions = props.get(m.AURON_SPARK_EXTENSIONS_KEY) - # ``re.DOTALL`` so the ``.*`` anchors also match newlines, in case a - # multi-entry ``spark.sql.extensions`` is newline-separated. - if extensions is None or not re.fullmatch( - m.AURON_EXTENSION_REGEX, extensions, re.DOTALL - ): - return False - enabled_raw = props.get(m.AURON_ENABLED_KEY, m.AURON_ENABLED_DEFAULT) - return enabled_raw.strip().lower() == m.AURON_ENABLED_DEFAULT - - -def _is_databricks(props: Mapping[str, str]) -> bool: - return all(props.get(k, "").strip() for k in m.DB_PRECONDITION_KEYS) - +def _has_rapids_conf_markers(props: Mapping[str, str]) -> bool: + """Return true when properties contain any RAPIDS-related configuration. -def _is_photon(props: Mapping[str, str]) -> bool: - if not _is_databricks(props): - return False - for key, pattern in m.PHOTON_MARKER_REGEX.items(): - value = props.get(key) - if value is not None and re.fullmatch(pattern, value): - return True - return False + This is intentionally broader than ``_is_spark_rapids``. A disabled or + incomplete RAPIDS configuration is not classified as RAPIDS, but its + presence should prevent early CPU routing because later events may update + the effective configuration. + """ + if m.GPU_PLUGIN_CLASS_SUBSTRING in props.get(m.GPU_PLUGIN_KEY, ""): + return True + return m.GPU_ENABLED_KEY in props def _classify_runtime(props: Mapping[str, str]) -> SparkRuntime: - # Priority: PHOTON > AURON > SPARK_RAPIDS > SPARK. - if _is_photon(props): - return SparkRuntime.PHOTON - if _is_auron(props): - return SparkRuntime.AURON + """Classify accumulated Spark properties into the supported runtime enum.""" if _is_spark_rapids(props): return SparkRuntime.SPARK_RAPIDS return SparkRuntime.SPARK diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/detector.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/detector.py index 51cd4294e..aff1fd4f0 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/detector.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/detector.py @@ -22,31 +22,31 @@ from spark_rapids_tools.tools.eventlog_detector.scanner import _scan_events_across from spark_rapids_tools.tools.eventlog_detector.types import ( DetectionResult, - Route, SparkRuntime, Termination, + ToolExecution, ) -_GPU_FAMILY = frozenset({SparkRuntime.SPARK_RAPIDS, SparkRuntime.PHOTON, SparkRuntime.AURON}) - - def detect_spark_runtime( event_log: Union[str, CspPath], *, max_events_scanned: int = 500, + allow_cpu_fast_path: bool = True, ) -> DetectionResult: - """Classify a single-app event log into a routing decision. + """Classify a single-app event log into a tool execution decision. 
- Returns a :class:`DetectionResult` whose ``route`` is ``PROFILING`` on - a decisive non-SPARK classification, ``QUALIFICATION`` only after the - scanner walked the full log without seeing a GPU-family signal, and - ``UNKNOWN`` otherwise (e.g., the budget was hit first or the log never - emitted ``SparkListenerEnvironmentUpdate``). + Returns ``PROFILING`` when a RAPIDS marker is found, ``QUALIFICATION`` when + the log appears to be OSS Spark/CPU, and ``UNKNOWN`` when the bounded scan + cannot make a decision. - ``max_events_scanned`` caps CPU/IO cost. Large CPU logs routinely end - as ``UNKNOWN`` at the cap; raise it at the call site to trade cost - for decisiveness. + ``max_events_scanned`` caps CPU/IO cost. Logs that do not expose a RAPIDS + marker or ``SparkListenerEnvironmentUpdate`` within the cap remain + ``UNKNOWN``. + + ``allow_cpu_fast_path`` enables early CPU routing when startup properties + contain no RAPIDS markers. Disable it to require EOF before returning + ``QUALIFICATION``. """ # Keep the caller's input verbatim in source_path (cloud URI schemes # would otherwise be stripped by CspPath normalisation). @@ -54,22 +54,31 @@ def detect_spark_runtime( path = event_log if isinstance(event_log, CspPath) else CspPath(str(event_log)) _, files = _resolve_event_log_files(path) - scan = _scan_events_across(files, budget=max_events_scanned) + scan = _scan_events_across( + files, + budget=max_events_scanned, + allow_cpu_fast_path=allow_cpu_fast_path, + ) runtime: Optional[SparkRuntime] - if scan.env_update_seen: + if scan.rapids_build_info_seen: + runtime = SparkRuntime.SPARK_RAPIDS + elif scan.env_update_seen: runtime = _classify_runtime(scan.spark_properties) else: runtime = None - if runtime in _GPU_FAMILY: - route = Route.PROFILING + if runtime is SparkRuntime.SPARK_RAPIDS: + tool_execution = ToolExecution.PROFILING reason = f"decisive: classified as {runtime.value}" + elif scan.termination is Termination.CPU_FAST_PATH and runtime is SparkRuntime.SPARK: + tool_execution = ToolExecution.QUALIFICATION + reason = "startup properties classify as SPARK with no RAPIDS markers" elif scan.termination is Termination.EXHAUSTED and scan.env_update_seen: - route = Route.QUALIFICATION - reason = "walked full log, no GPU-family signal" + tool_execution = ToolExecution.QUALIFICATION + reason = "walked full log, no RAPIDS signal" else: - route = Route.UNKNOWN + tool_execution = ToolExecution.UNKNOWN reason = ( "no decisive signal within bounded scan" if scan.env_update_seen @@ -78,7 +87,7 @@ def detect_spark_runtime( resolved_path = scan.last_scanned_path or (str(files[0]) if files else source_path) return DetectionResult( - route=route, + tool_execution=tool_execution, spark_runtime=runtime, app_id=scan.app_id, spark_version=scan.spark_version, diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/markers.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/markers.py index 80e0f8a58..176e73a68 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/markers.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/markers.py @@ -12,14 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Property keys, regexes, and constants used by the runtime classifier. +"""Property keys and constants used by the runtime detector. Each block carries a Scala source reference so the two implementations can be kept in sync when the Scala detection rules change. 
""" -from typing import Mapping, Tuple - # GPU (SPARK_RAPIDS) markers. # Scala: org/apache/spark/sql/rapids/tool/ToolUtils.scala :: isPluginEnabled GPU_PLUGIN_KEY: str = "spark.plugins" @@ -28,43 +26,20 @@ # Defaults to true when missing or unparseable. GPU_ENABLED_DEFAULT: bool = True -# AURON markers. -# Scala: com/nvidia/spark/rapids/tool/planparser/auron/AuronParseHelper.scala -AURON_SPARK_EXTENSIONS_KEY: str = "spark.sql.extensions" -AURON_EXTENSION_REGEX: str = r".*AuronSparkSessionExtension.*" -AURON_ENABLED_KEY: str = "spark.auron.enabled" -AURON_ENABLED_DEFAULT: str = "true" - -# Databricks precondition — all three keys must be non-empty. -# Scala: com/nvidia/spark/rapids/tool/planparser/db/DBPlugin.scala :: DBConditionImpl -DB_PRECONDITION_KEYS: Tuple[str, str, str] = ( - "spark.databricks.clusterUsageTags.clusterAllTags", - "spark.databricks.clusterUsageTags.clusterId", - "spark.databricks.clusterUsageTags.clusterName", -) - -# Photon markers — any one fullmatches once the Databricks precondition holds. -# Scala: com/nvidia/spark/rapids/tool/planparser/db/DatabricksParseHelper.scala :: PhotonParseHelper -PHOTON_MARKER_REGEX: Mapping[str, str] = { - "spark.databricks.clusterUsageTags.sparkVersion": r".*-photon-.*", - "spark.databricks.clusterUsageTags.effectiveSparkVersion": r".*-photon-.*", - "spark.databricks.clusterUsageTags.sparkImageLabel": r".*-photon-.*", - "spark.databricks.clusterUsageTags.runtimeEngine": r"PHOTON", -} +# RAPIDS 24.06+ plugin marker. +# Scala: com/nvidia/spark/rapids/SparkRapidsBuildInfoEvent.scala +EVENT_SPARK_RAPIDS_BUILD_INFO: str = "com.nvidia.spark.rapids.SparkRapidsBuildInfoEvent" +EVENT_SPARK_RAPIDS_BUILD_INFO_SHORTNAME: str = "SparkRapidsBuildInfoEvent" -# Databricks rolling event-log file layout. -# Scala: com/nvidia/spark/rapids/tool/EventLogPathProcessor.scala :: getDBEventLogFileDate -DB_EVENT_LOG_FILE_PREFIX: str = "eventlog" -# ``eventlog-YYYY-MM-DD--HH-MM[.codec]``. Bare ``eventlog`` does not match -# and is treated as the latest chunk (sorted last) by the resolver. -DB_EVENT_LOG_DATE_REGEX: str = ( - r"^eventlog-(\d{4})-(\d{2})-(\d{2})--(\d{2})-(\d{2})(?:\.[A-Za-z0-9]+)?$" -) +# Apache Spark rolling event-log directory layout. +# Scala: com/nvidia/spark/rapids/tool/EventLogPathProcessor.scala :: isEventLogDir +OSS_EVENT_LOG_DIR_PREFIX: str = "eventlog_v2_" +OSS_EVENT_LOG_FILE_PREFIX: str = "events_" # Spark listener event names consumed by the scanner. EVENT_LOG_START: str = "SparkListenerLogStart" EVENT_APPLICATION_START: str = "SparkListenerApplicationStart" EVENT_ENVIRONMENT_UPDATE: str = "SparkListenerEnvironmentUpdate" EVENT_SQL_EXECUTION_START: str = "org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart" -# Unqualified shortname; sometimes used in test fixtures. +# Unqualified event name accepted by the scanner for compatibility. EVENT_SQL_EXECUTION_START_SHORTNAME: str = "SparkListenerSQLExecutionStart" diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/resolver.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/resolver.py index 8231a9bfb..1ea632c56 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/resolver.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/resolver.py @@ -12,16 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Input-path resolver. - -Resolves a user-supplied path into an ordered list of concrete files to -scan. 
Supports a single file or a Databricks rolling directory; any -other shape raises :class:`UnsupportedInputError` so the caller can -fall back to the full tools pipeline. -""" +"""Input-path resolver for single files and Apache Spark rolling event logs.""" import re -from datetime import datetime from typing import List, Optional, Tuple from spark_rapids_tools.storagelib import CspFs, CspPath @@ -29,40 +22,32 @@ from spark_rapids_tools.tools.eventlog_detector.types import UnsupportedInputError -_DB_DATE_PATTERN = re.compile(m.DB_EVENT_LOG_DATE_REGEX) - +_OSS_EVENT_FILE_PATTERN = re.compile(r"^events_(\d+)_.*") -def _parse_databricks_file_datetime(name: str) -> Optional[datetime]: - """Parse a Databricks rolled filename to its embedded datetime. - Returns ``None`` for bare ``eventlog`` (the current/latest chunk) and - for any name that does not match the dated pattern; callers sort - ``None`` last to mirror Scala's ``getDBEventLogFileDate`` which - defaults the bare file to ``now()``. - """ - if not name.startswith(m.DB_EVENT_LOG_FILE_PREFIX): - return None - match = _DB_DATE_PATTERN.match(name) +def _parse_oss_event_file_index(name: str) -> Optional[int]: + """Return the numeric chunk index from ``events__...`` files.""" + match = _OSS_EVENT_FILE_PATTERN.match(name) if match is None: return None - year, month, day, hour, minute = (int(g) for g in match.groups()) - try: - return datetime(year, month, day, hour, minute) - except ValueError: - # Components are syntactically 2 digits but out of range - # (e.g. month=13). Treat as unparseable — sorts last. - return None + return int(match.group(1)) + +def _is_oss_event_log_file(path: CspPath) -> bool: + return _parse_oss_event_file_index(path.base_name()) is not None -def _is_databricks_event_log_filename(name: str) -> bool: - return name.startswith(m.DB_EVENT_LOG_FILE_PREFIX) + +def _base_name_from_source(source: str) -> str: + """Return the final path component, ignoring trailing separators.""" + return source.rstrip("/").rsplit("/", 1)[-1] def _resolve_event_log_files(path: CspPath) -> Tuple[str, List[CspPath]]: """Resolve ``path`` to an ordered list of files to scan. - Returns ``(source, files)`` where ``source`` is the stripped string - form of the input and ``files`` is the scan order. + Supported inputs are a single concrete file or an Apache Spark rolling + event-log directory named ``eventlog_v2_*``. Other directory layouts are + rejected so callers can use the full tools pipeline. """ source = path.no_scheme @@ -74,24 +59,21 @@ def _resolve_event_log_files(path: CspPath) -> Tuple[str, List[CspPath]]: f"Path is neither a file nor a directory: {source}" ) - # Only Databricks-style rolling directories are supported here; - # Spark-native (eventlog_v2_*) and multi-app directories are not. - children = CspFs.list_all_files(path) - db_files = [c for c in children if _is_databricks_event_log_filename(c.base_name())] - if not db_files: + if not _base_name_from_source(source).startswith(m.OSS_EVENT_LOG_DIR_PREFIX): raise UnsupportedInputError( f"Directory {source} is not a supported input shape. Only single " - "files and Databricks rolling directories are handled here; use " - "the full pipeline for other shapes." + "files and Apache Spark rolling event-log directories are handled " + "here; use the full pipeline for other shapes." ) - # Dated files ascend by embedded timestamp; bare `eventlog` sorts - # last, matching DatabricksRollingEventLogFilesFileReader. The first - # sort keeps equal-date files in a deterministic order. 
- db_files.sort(key=lambda f: f.base_name()) - db_files.sort( + event_files = [c for c in CspFs.list_all_files(path) if _is_oss_event_log_file(c)] + if not event_files: + raise UnsupportedInputError(f"Directory {source} does not contain Spark event chunks") + + event_files.sort( key=lambda f: ( - _parse_databricks_file_datetime(f.base_name()) or datetime.max, + _parse_oss_event_file_index(f.base_name()) or 0, + f.base_name(), ) ) - return source, db_files + return source, event_files diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/scanner.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/scanner.py index e14102e81..4d15e3431 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/scanner.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/scanner.py @@ -14,9 +14,8 @@ """Bounded streaming event scanner. -Walks JSON-per-line event logs under a shared event budget, folding the -relevant startup and per-SQL properties into a single mutable dict so -the classifier can decide as soon as a decisive signal is seen. +Walks JSON-per-line event logs under a shared event budget and accumulates the +startup and per-SQL properties required for runtime classification. """ import json @@ -25,7 +24,10 @@ from spark_rapids_tools.storagelib import CspPath from spark_rapids_tools.tools.eventlog_detector import markers as m -from spark_rapids_tools.tools.eventlog_detector.classifier import _classify_runtime +from spark_rapids_tools.tools.eventlog_detector.classifier import ( + _classify_runtime, + _has_rapids_conf_markers, +) from spark_rapids_tools.tools.eventlog_detector.stream import _open_event_log_stream from spark_rapids_tools.tools.eventlog_detector.types import SparkRuntime, Termination @@ -37,6 +39,7 @@ class _ScanResult: app_name: Optional[str] = None spark_version: Optional[str] = None env_update_seen: bool = False + rapids_build_info_seen: bool = False events_scanned: int = 0 termination: Termination = Termination.EXHAUSTED last_scanned_path: Optional[str] = None @@ -46,6 +49,7 @@ def _scan_events( lines: Iterable[str], *, budget: int, + allow_cpu_fast_path: bool = True, state: Optional[_ScanResult] = None, ) -> _ScanResult: """Scan one stream of lines, optionally continuing from a prior state. 
@@ -74,6 +78,13 @@ def _scan_events( result.events_scanned += 1 name = event.get("Event") + if name in ( + m.EVENT_SPARK_RAPIDS_BUILD_INFO, + m.EVENT_SPARK_RAPIDS_BUILD_INFO_SHORTNAME, + ): + result.rapids_build_info_seen = True + result.termination = Termination.DECISIVE + return result if name == m.EVENT_LOG_START: version = event.get("Spark Version") if isinstance(version, str): @@ -92,9 +103,13 @@ def _scan_events( if isinstance(k, str) and isinstance(v, str): result.spark_properties[k] = v result.env_update_seen = True - if _classify_runtime(result.spark_properties) is not SparkRuntime.SPARK: + runtime = _classify_runtime(result.spark_properties) + if runtime is not SparkRuntime.SPARK: result.termination = Termination.DECISIVE return result + if allow_cpu_fast_path and not _has_rapids_conf_markers(result.spark_properties): + result.termination = Termination.CPU_FAST_PATH + return result elif name in (m.EVENT_SQL_EXECUTION_START, m.EVENT_SQL_EXECUTION_START_SHORTNAME): modified = event.get("modifiedConfigs") or {} if isinstance(modified, dict) and modified: @@ -111,7 +126,12 @@ def _scan_events( return result -def _scan_events_across(files: List[CspPath], *, budget: int) -> _ScanResult: +def _scan_events_across( + files: List[CspPath], + *, + budget: int, + allow_cpu_fast_path: bool = True, +) -> _ScanResult: """Walk ``files`` in order under a single shared ``budget``.""" state = _ScanResult() for path in files: @@ -120,8 +140,17 @@ def _scan_events_across(files: List[CspPath], *, budget: int) -> _ScanResult: return state state.last_scanned_path = str(path) with _open_event_log_stream(path) as lines: - state = _scan_events(lines, budget=budget, state=state) - if state.termination in (Termination.DECISIVE, Termination.CAP_HIT): + state = _scan_events( + lines, + budget=budget, + allow_cpu_fast_path=allow_cpu_fast_path, + state=state, + ) + if state.termination in ( + Termination.DECISIVE, + Termination.CPU_FAST_PATH, + Termination.CAP_HIT, + ): return state state.termination = Termination.EXHAUSTED return state diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py index eaac96d3c..900f97150 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py @@ -20,8 +20,7 @@ ``CspPath.open_input_stream()`` delegates to PyArrow, which auto-detects and decompresses ``.gz`` and ``.zst`` transparently. ``.zstd`` is not -recognised by PyArrow, so we decompress it manually via ``zstandard``. -Revisit this mapping if the upstream codec detection changes. +recognised by PyArrow, so this module decompresses it via ``zstandard``. 
""" import contextlib diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/types.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/types.py index 818249fa5..01d72eaf5 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/types.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/types.py @@ -19,8 +19,8 @@ from typing import Optional -class Route(str, Enum): - """Routing decision returned to the caller.""" +class ToolExecution(str, Enum): + """Tool execution decision returned to the caller.""" QUALIFICATION = "QUALIFICATION" PROFILING = "PROFILING" @@ -37,14 +37,13 @@ class SparkRuntime(str, Enum): SPARK = "SPARK" SPARK_RAPIDS = "SPARK_RAPIDS" - PHOTON = "PHOTON" - AURON = "AURON" class Termination(Enum): """How the scanner stopped.""" DECISIVE = "DECISIVE" # classification returned non-SPARK + CPU_FAST_PATH = "CPU_FAST_PATH" # stopped after plain-SPARK startup props EXHAUSTED = "EXHAUSTED" # walked every file to EOF under the budget CAP_HIT = "CAP_HIT" # hit max_events_scanned before exhausting files @@ -54,10 +53,10 @@ class DetectionResult: """Result returned by :func:`detect_spark_runtime`. ``spark_runtime`` is best-effort metadata and may be ``None`` when - ``route`` is ``UNKNOWN``. + ``tool_execution`` is ``UNKNOWN``. """ - route: Route + tool_execution: ToolExecution spark_runtime: Optional[SparkRuntime] app_id: Optional[str] spark_version: Optional[str] diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py index d2f826f2d..8735c0985 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py @@ -17,7 +17,10 @@ import pytest -from spark_rapids_tools.tools.eventlog_detector.classifier import _classify_runtime +from spark_rapids_tools.tools.eventlog_detector.classifier import ( + _classify_runtime, + _has_rapids_conf_markers, +) from spark_rapids_tools.tools.eventlog_detector.types import SparkRuntime @@ -28,6 +31,24 @@ def test_empty_props_is_spark(self): assert _classify_runtime({}) is SparkRuntime.SPARK +class TestRapidsConfigMarkers: + """Test marker presence checks used by the CPU fast path.""" + + def test_plain_spark_props_have_no_rapids_markers(self): + assert _has_rapids_conf_markers({"spark.master": "local"}) is False + + def test_disabled_rapids_plugin_still_counts_as_marker(self): + props = { + "spark.plugins": "com.nvidia.spark.SQLPlugin", + "spark.rapids.sql.enabled": "false", + } + assert _classify_runtime(props) is SparkRuntime.SPARK + assert _has_rapids_conf_markers(props) is True + + def test_rapids_enabled_key_alone_counts_as_marker(self): + assert _has_rapids_conf_markers({"spark.rapids.sql.enabled": "true"}) is True + + class TestSparkRapids: """Test SPARK_RAPIDS classification logic.""" @@ -69,91 +90,3 @@ def test_non_toboolean_values_default_to_true_matching_scala(self, bogus_value): "spark.rapids.sql.enabled": bogus_value, } assert _classify_runtime(props) is SparkRuntime.SPARK_RAPIDS - - -class TestAuron: - """Test AURON classification logic.""" - - def test_extension_and_default_enabled(self): - props = {"spark.sql.extensions": "com.bytedance.auron.AuronSparkSessionExtension"} - assert _classify_runtime(props) is SparkRuntime.AURON - - def test_extension_with_embedded_newlines(self): - # Some pipelines concatenate spark.sql.extensions with newlines; - # 
the regex must still match. - props = { - "spark.sql.extensions": ( - "com.example.SomeOtherExtension\n" - "com.bytedance.auron.AuronSparkSessionExtension" - ), - } - assert _classify_runtime(props) is SparkRuntime.AURON - - def test_extension_and_enabled_false_demotes_to_spark(self): - props = { - "spark.sql.extensions": "com.bytedance.auron.AuronSparkSessionExtension", - "spark.auron.enabled": "FALSE", - } - assert _classify_runtime(props) is SparkRuntime.SPARK - - def test_auron_enabled_case_insensitive(self): - props = { - "spark.sql.extensions": "AuronSparkSessionExtension", - "spark.auron.enabled": " TrUe ", - } - assert _classify_runtime(props) is SparkRuntime.AURON - - -class TestDatabricksPhoton: - """Test Databricks PHOTON classification logic.""" - - @pytest.fixture - def db_precond_props(self): - return { - "spark.databricks.clusterUsageTags.clusterAllTags": "[{...}]", - "spark.databricks.clusterUsageTags.clusterId": "1234", - "spark.databricks.clusterUsageTags.clusterName": "dev-cluster", - } - - def test_precondition_only_is_spark(self, db_precond_props): - assert _classify_runtime(db_precond_props) is SparkRuntime.SPARK - - def test_precondition_plus_photon_version(self, db_precond_props): - props = { - **db_precond_props, - "spark.databricks.clusterUsageTags.sparkVersion": "11.3.x-photon-scala2.12", - } - assert _classify_runtime(props) is SparkRuntime.PHOTON - - def test_precondition_plus_photon_engine(self, db_precond_props): - props = {**db_precond_props, "spark.databricks.clusterUsageTags.runtimeEngine": "PHOTON"} - assert _classify_runtime(props) is SparkRuntime.PHOTON - - def test_photon_marker_without_precondition_is_spark(self): - props = {"spark.databricks.clusterUsageTags.runtimeEngine": "PHOTON"} - assert _classify_runtime(props) is SparkRuntime.SPARK - - def test_photon_engine_other_value_is_spark(self, db_precond_props): - props = {**db_precond_props, "spark.databricks.clusterUsageTags.runtimeEngine": "STANDARD"} - assert _classify_runtime(props) is SparkRuntime.SPARK - - -class TestPriority: - """PHOTON > AURON > SPARK_RAPIDS > SPARK when markers coexist.""" - - def test_photon_beats_spark_rapids(self): - props = { - "spark.plugins": "com.nvidia.spark.SQLPlugin", - "spark.databricks.clusterUsageTags.clusterAllTags": "[{...}]", - "spark.databricks.clusterUsageTags.clusterId": "1", - "spark.databricks.clusterUsageTags.clusterName": "c", - "spark.databricks.clusterUsageTags.runtimeEngine": "PHOTON", - } - assert _classify_runtime(props) is SparkRuntime.PHOTON - - def test_auron_beats_spark_rapids(self): - props = { - "spark.plugins": "com.nvidia.spark.SQLPlugin", - "spark.sql.extensions": "AuronSparkSessionExtension", - } - assert _classify_runtime(props) is SparkRuntime.AURON diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py index bafd07742..9867d9f22 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py @@ -23,8 +23,8 @@ from spark_rapids_tools.storagelib import CspPath from spark_rapids_tools.tools.eventlog_detector import detect_spark_runtime from spark_rapids_tools.tools.eventlog_detector.types import ( - Route, SparkRuntime, + ToolExecution, UnsupportedInputError, ) @@ -39,6 +39,29 @@ def env_update(props: dict) -> dict: } +def build_info() -> dict: + return { + "Event": 
"com.nvidia.spark.rapids.SparkRapidsBuildInfoEvent", + "sparkRapidsBuildInfo": {"version": "24.06.0"}, + "sparkRapidsJniBuildInfo": {}, + "cudfBuildInfo": {}, + "sparkRapidsPrivateBuildInfo": {}, + } + + +def sql_exec_start(modified_configs: dict) -> dict: + return { + "Event": "org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart", + "executionId": 0, + "description": "", + "details": "", + "physicalPlanDescription": "", + "sparkPlanInfo": {}, + "time": 0, + "modifiedConfigs": modified_configs, + } + + def _write_plain_log(path: Path, events: list) -> None: path.write_text( "\n".join(json.dumps(e) for e in events) + "\n", encoding="utf-8" @@ -59,12 +82,26 @@ def test_str_input_resolves(self, tmp_path): ], ) result = detect_spark_runtime(str(log)) - assert result.route is Route.QUALIFICATION + assert result.tool_execution is ToolExecution.QUALIFICATION assert result.spark_runtime is SparkRuntime.SPARK -class TestGpuLog: - """Test detection on GPU event logs.""" +class TestRapidsLog: + """Test detection on RAPIDS event logs.""" + + def test_build_info_event_classifies_as_profiling(self, tmp_path): + log = tmp_path / "eventlog" + _write_plain_log( + log, + [ + {"Event": "SparkListenerLogStart", "Spark Version": "3.5.1"}, + build_info(), + ], + ) + result = detect_spark_runtime(CspPath(str(log))) + assert result.tool_execution is ToolExecution.PROFILING + assert result.spark_runtime is SparkRuntime.SPARK_RAPIDS + assert result.spark_version == "3.5.1" def test_env_update_with_plugin_classifies_as_profiling(self, tmp_path): log = tmp_path / "eventlog" @@ -77,10 +114,52 @@ def test_env_update_with_plugin_classifies_as_profiling(self, tmp_path): ], ) result = detect_spark_runtime(CspPath(str(log))) - assert result.route is Route.PROFILING + assert result.tool_execution is ToolExecution.PROFILING assert result.spark_runtime is SparkRuntime.SPARK_RAPIDS assert result.app_id == "g" - assert result.spark_version == "3.5.1" + + +class TestCpuFastPath: + """Test the default fast path for startup properties that look like plain Spark.""" + + def test_env_update_without_rapids_markers_returns_qualification(self, tmp_path): + log = tmp_path / "eventlog" + _write_plain_log( + log, + [ + {"Event": "SparkListenerLogStart", "Spark Version": "3.5.1"}, + {"Event": "SparkListenerApplicationStart", "App ID": "c", "App Name": "C"}, + env_update({"spark.master": "local"}), + sql_exec_start({"spark.plugins": "com.nvidia.spark.SQLPlugin"}), + ], + ) + result = detect_spark_runtime(str(log)) + assert result.tool_execution is ToolExecution.QUALIFICATION + assert result.spark_runtime is SparkRuntime.SPARK + assert "startup properties" in result.reason.lower() + + def test_fast_path_can_be_disabled(self, tmp_path): + log = tmp_path / "eventlog" + _write_plain_log(log, [env_update({"spark.master": "local"})]) + result = detect_spark_runtime(str(log), allow_cpu_fast_path=False) + assert result.tool_execution is ToolExecution.QUALIFICATION + assert "walked full log" in result.reason.lower() + + def test_fast_path_does_not_fire_when_rapids_marker_present(self, tmp_path): + log = tmp_path / "eventlog" + _write_plain_log( + log, + [ + env_update({ + "spark.plugins": "com.nvidia.spark.SQLPlugin", + "spark.rapids.sql.enabled": "false", + }), + sql_exec_start({"spark.rapids.sql.enabled": "true"}), + ], + ) + result = detect_spark_runtime(str(log)) + assert result.tool_execution is ToolExecution.PROFILING + assert result.spark_runtime is SparkRuntime.SPARK_RAPIDS class TestCapHit: @@ -88,63 +167,55 @@ class 
TestCapHit: def test_no_env_update_before_cap_is_unknown(self, tmp_path): log = tmp_path / "eventlog" - # Many LogStart events, no env-update. Cap hits first. _write_plain_log( log, [{"Event": "SparkListenerLogStart", "Spark Version": "3.5.1"}] * 10, ) result = detect_spark_runtime(str(log), max_events_scanned=5) - assert result.route is Route.UNKNOWN + assert result.tool_execution is ToolExecution.UNKNOWN assert result.spark_runtime is None - reason = result.reason.lower() - assert "no decisive signal" in reason or "no sparklistenerenvironmentupdate" in reason -class TestDatabricksRolling: - """Test detection on Databricks rolling event log directories.""" +class TestOssRolling: + """Test detection on Apache Spark rolling event-log directories.""" - def test_gpu_in_later_rolled_file(self, tmp_path): - d = tmp_path / "dbrolling" + def test_rapids_signal_in_later_rolled_file(self, tmp_path): + d = tmp_path / "eventlog_v2_app-1" d.mkdir() _write_plain_log( - d / "eventlog-2021-06-14--18-00", + d / "events_1_app-1", [ {"Event": "SparkListenerLogStart", "Spark Version": "3.5.1"}, {"Event": "SparkListenerApplicationStart", "App ID": "d", "App Name": "D"}, - env_update({"spark.master": "local"}), - ], - ) - _write_plain_log( - d / "eventlog", - [ - { - "Event": "org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart", - "executionId": 0, - "description": "", - "details": "", - "physicalPlanDescription": "", - "sparkPlanInfo": {}, - "time": 0, - "modifiedConfigs": {"spark.plugins": "com.nvidia.spark.SQLPlugin"}, - } + env_update({"spark.rapids.sql.enabled": "false"}), ], ) + _write_plain_log(d / "events_2_app-1", [build_info()]) result = detect_spark_runtime(CspPath(str(d))) - assert result.route is Route.PROFILING + assert result.tool_execution is ToolExecution.PROFILING assert result.spark_runtime is SparkRuntime.SPARK_RAPIDS - # The decisive signal came from the bare `eventlog` file (the latest, - # sorted last after the dated chunk). event_log_path should reflect - # that, not the first-opened file. 
- assert result.event_log_path.endswith("/eventlog") + assert result.event_log_path.endswith("/events_2_app-1") + + def test_cpu_fast_path_applies_to_rolling_dir(self, tmp_path): + d = tmp_path / "eventlog_v2_app-1" + d.mkdir() + _write_plain_log(d / "events_1_app-1", [env_update({"spark.master": "local"})]) + _write_plain_log(d / "events_2_app-1", [sql_exec_start({"spark.plugins": "com.nvidia.spark.SQLPlugin"})]) + result = detect_spark_runtime(CspPath(str(d))) + assert result.tool_execution is ToolExecution.QUALIFICATION + assert result.spark_runtime is SparkRuntime.SPARK + assert result.event_log_path.endswith("/events_1_app-1") + assert "startup properties" in result.reason.lower() class TestUnsupportedInput: """Test that unsupported input shapes raise the expected error.""" - def test_spark_native_rolling_dir_raises(self, tmp_path): - d = tmp_path / "eventlog_v2_local-1623876083964" + def test_non_oss_rolling_dir_raises(self, tmp_path): + d = tmp_path / "non_oss_rolling" d.mkdir() - (d / "events_1_local-1623876083964").write_bytes(b"") + (d / "eventlog").write_bytes(b"") + (d / "eventlog-2021-06-14--18-00.gz").write_bytes(b"") with pytest.raises(UnsupportedInputError): detect_spark_runtime(CspPath(str(d))) @@ -154,20 +225,15 @@ class TestReasonStrings: def test_reason_mentions_runtime_on_profiling(self, tmp_path): log = tmp_path / "eventlog" - _write_plain_log( - log, - [ - env_update({"spark.plugins": "com.nvidia.spark.SQLPlugin"}), - ], - ) + _write_plain_log(log, [build_info()]) result = detect_spark_runtime(str(log)) assert "SPARK_RAPIDS" in result.reason - def test_reason_mentions_full_log_on_qualification(self, tmp_path): + def test_reason_mentions_full_log_on_strict_qualification(self, tmp_path): log = tmp_path / "eventlog" _write_plain_log(log, [env_update({"spark.master": "local"})]) - result = detect_spark_runtime(str(log)) - assert result.route is Route.QUALIFICATION + result = detect_spark_runtime(str(log), allow_cpu_fast_path=False) + assert result.tool_execution is ToolExecution.QUALIFICATION assert "walked full log" in result.reason.lower() diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector_fixtures.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector_fixtures.py index c598bb54b..c36814b0f 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector_fixtures.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector_fixtures.py @@ -15,7 +15,7 @@ """Anchor tests against fixtures already shipped in the Scala core. These are not a full parity sweep. They catch regressions on a small -curated set covering each decisive route. +curated set covering each decisive execution decision. 
""" from pathlib import Path @@ -23,7 +23,7 @@ import pytest from spark_rapids_tools.tools.eventlog_detector import detect_spark_runtime -from spark_rapids_tools.tools.eventlog_detector.types import Route, SparkRuntime +from spark_rapids_tools.tools.eventlog_detector.types import SparkRuntime, ToolExecution REPO_ROOT = Path(__file__).resolve().parents[5] @@ -31,32 +31,32 @@ @pytest.mark.parametrize( - "relative_path,expected_route,expected_runtime", + "relative_path,expected_execution,expected_runtime", [ ( "spark-events-profiling/eventlog-gpu-dsv2.zstd", - Route.PROFILING, + ToolExecution.PROFILING, SparkRuntime.SPARK_RAPIDS, ), ( "spark-events-profiling/eventlog_dsv2.zstd", - Route.QUALIFICATION, + ToolExecution.QUALIFICATION, SparkRuntime.SPARK, ), ( "spark-events-qualification/eventlog_same_app_id_1.zstd", - Route.QUALIFICATION, + ToolExecution.QUALIFICATION, SparkRuntime.SPARK, ), ], ) -def test_detector_matches_expected_route_on_scala_fixture( - relative_path: str, expected_route: Route, expected_runtime: SparkRuntime +def test_detector_matches_expected_execution_on_scala_fixture( + relative_path: str, expected_execution: ToolExecution, expected_runtime: SparkRuntime ) -> None: fixture = CORE_FIXTURES / relative_path if not fixture.exists(): pytest.skip(f"fixture not available: {fixture}") # Fixtures are ~small; a generous budget keeps this test decisive. result = detect_spark_runtime(str(fixture), max_events_scanned=5000) - assert result.route is expected_route, result.reason + assert result.tool_execution is expected_execution, result.reason assert result.spark_runtime is expected_runtime, result.reason diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_markers.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_markers.py index 4234b66f4..cb4823f2e 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_markers.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_markers.py @@ -15,13 +15,11 @@ """Unit tests for ``eventlog_detector.markers``.""" # pylint: disable=too-few-public-methods # test classes naturally have few methods -import re - from spark_rapids_tools.tools.eventlog_detector import markers as m class TestGpuMarkers: - """Test GPU plugin marker constants.""" + """Test RAPIDS plugin marker constants.""" def test_plugin_substring_exact(self): assert m.GPU_PLUGIN_CLASS_SUBSTRING == "com.nvidia.spark.SQLPlugin" @@ -29,68 +27,15 @@ def test_plugin_substring_exact(self): def test_gpu_toggle_key_exact(self): assert m.GPU_ENABLED_KEY == "spark.rapids.sql.enabled" - -class TestAuronMarkers: - """Test Auron extension and enabled marker constants.""" - - def test_extension_regex_fullmatches_expected_value(self): - # Mirrors AuronParseHelper.extensionRegxMap. 
- pat = re.compile(m.AURON_EXTENSION_REGEX) - assert pat.fullmatch("org.apache.spark.sql.AuronSparkSessionExtension") - assert pat.fullmatch("whatever.AuronSparkSessionExtension.more") - assert not pat.fullmatch("org.apache.spark.sql.SomeOtherExtension") - - def test_auron_enabled_defaults_to_true(self): - assert m.AURON_ENABLED_DEFAULT == "true" - assert m.AURON_ENABLED_KEY == "spark.auron.enabled" - assert m.AURON_SPARK_EXTENSIONS_KEY == "spark.sql.extensions" - - -class TestDatabricksPrecondition: - """Test Databricks precondition key constants.""" - - def test_all_three_tag_keys_present(self): - assert m.DB_PRECONDITION_KEYS == ( - "spark.databricks.clusterUsageTags.clusterAllTags", - "spark.databricks.clusterUsageTags.clusterId", - "spark.databricks.clusterUsageTags.clusterName", - ) - - -class TestPhotonMarkers: - """Test Photon marker regex constants.""" - - def test_marker_map_fullmatches_expected(self): - pats = {k: re.compile(v) for k, v in m.PHOTON_MARKER_REGEX.items()} - assert pats[ - "spark.databricks.clusterUsageTags.sparkVersion" - ].fullmatch("11.3.x-photon-scala2.12") - assert pats[ - "spark.databricks.clusterUsageTags.runtimeEngine" - ].fullmatch("PHOTON") - assert not pats[ - "spark.databricks.clusterUsageTags.runtimeEngine" - ].fullmatch("STANDARD") - - def test_all_four_photon_keys(self): - assert set(m.PHOTON_MARKER_REGEX) == { - "spark.databricks.clusterUsageTags.sparkVersion", - "spark.databricks.clusterUsageTags.effectiveSparkVersion", - "spark.databricks.clusterUsageTags.sparkImageLabel", - "spark.databricks.clusterUsageTags.runtimeEngine", - } + def test_build_info_event_exact(self): + assert m.EVENT_SPARK_RAPIDS_BUILD_INFO == "com.nvidia.spark.rapids.SparkRapidsBuildInfoEvent" -class TestDatabricksRollingFileName: - """Test Databricks rolling log file name pattern constants.""" +class TestOssRollingMarkers: + """Test Apache Spark rolling event-log layout constants.""" - def test_prefix_is_eventlog(self): - assert m.DB_EVENT_LOG_FILE_PREFIX == "eventlog" + def test_eventlog_v2_prefix_exact(self): + assert m.OSS_EVENT_LOG_DIR_PREFIX == "eventlog_v2_" - def test_date_pattern_parses_scala_format(self): - pat = re.compile(m.DB_EVENT_LOG_DATE_REGEX) - # Scala's getDBEventLogFileDate splits on '--' and parses - # 'eventlog-YYYY-MM-DD--HH-MM[.codec]'. 
- assert pat.search("eventlog-2021-06-14--20-00.gz") - assert pat.search("eventlog-2021-06-14--20-00") - assert not pat.search("eventlog") # bare eventlog has no date + def test_events_file_prefix_exact(self): + assert m.OSS_EVENT_LOG_FILE_PREFIX == "events_" diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py index 8479adda6..f321a7251 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py @@ -15,42 +15,28 @@ """Unit tests for ``eventlog_detector.resolver``.""" # pylint: disable=too-few-public-methods # test classes naturally have few methods -from datetime import datetime from pathlib import Path import pytest from spark_rapids_tools.storagelib import CspPath from spark_rapids_tools.tools.eventlog_detector.resolver import ( - _parse_databricks_file_datetime, + _parse_oss_event_file_index, _resolve_event_log_files, ) from spark_rapids_tools.tools.eventlog_detector.types import UnsupportedInputError -class TestDatabricksDateParse: - """Test Databricks rolling log file date parsing.""" +class TestOssEventFileIndex: + """Test Apache Spark rolling event file index parsing.""" - def test_bare_eventlog_is_latest_sentinel(self): - # Returns None; caller treats None as "sort last". - assert _parse_databricks_file_datetime("eventlog") is None + def test_events_file_index_parses(self): + assert _parse_oss_event_file_index("events_1_app-1.zstd") == 1 + assert _parse_oss_event_file_index("events_10_app-1") == 10 - def test_dated_file_parses(self): - dt = _parse_databricks_file_datetime("eventlog-2021-06-14--20-00.gz") - assert dt == datetime(2021, 6, 14, 20, 0) - - def test_dated_file_without_codec_parses(self): - dt = _parse_databricks_file_datetime("eventlog-2022-01-02--03-04") - assert dt == datetime(2022, 1, 2, 3, 4) - - def test_non_eventlog_prefix_returns_none(self): - assert _parse_databricks_file_datetime("application_1234.log") is None - - def test_out_of_range_components_return_none(self): - # month=13 is syntactically 2 digits but not a valid month; - # datetime() would raise ValueError, we want None instead. 
- assert _parse_databricks_file_datetime("eventlog-2021-13-01--00-00") is None - assert _parse_databricks_file_datetime("eventlog-2021-02-30--25-99") is None + def test_non_events_file_returns_none(self): + assert _parse_oss_event_file_index("appstatus_app-1.inprogress") is None + assert _parse_oss_event_file_index("eventlog") is None class TestResolveSingleFile: @@ -64,36 +50,36 @@ def test_single_file_returns_single_element_list(self, tmp_path: Path): assert [p.base_name() for p in files] == ["eventlog.zstd"] -class TestResolveDatabricksRollingDir: - """Test resolving a Databricks rolling event log directory.""" +class TestResolveOssRollingDir: + """Test resolving an Apache Spark rolling event-log directory.""" - def test_orders_earliest_first_and_bare_eventlog_last(self, tmp_path: Path): - d = tmp_path / "dbrolling" + def test_orders_event_chunks_by_numeric_index(self, tmp_path: Path): + d = tmp_path / "eventlog_v2_app-1" d.mkdir() - (d / "eventlog").write_bytes(b"") - (d / "eventlog-2021-06-14--20-00.gz").write_bytes(b"") - (d / "eventlog-2021-06-14--18-00.gz").write_bytes(b"") + (d / "events_10_app-1.zstd").write_bytes(b"") + (d / "events_2_app-1.zstd").write_bytes(b"") + (d / "events_1_app-1.zstd").write_bytes(b"") + (d / "appstatus_app-1.inprogress").write_bytes(b"") source, files = _resolve_event_log_files(CspPath(str(d))) assert source == str(d) - names = [p.base_name() for p in files] - # Earliest dated file first; bare `eventlog` sorts last (treated as - # "current/latest" per Scala). - assert names == [ - "eventlog-2021-06-14--18-00.gz", - "eventlog-2021-06-14--20-00.gz", - "eventlog", + assert [p.base_name() for p in files] == [ + "events_1_app-1.zstd", + "events_2_app-1.zstd", + "events_10_app-1.zstd", ] - def test_dir_with_no_eventlog_prefix_raises(self, tmp_path: Path): - d = tmp_path / "empty" + def test_accepts_trailing_slash_on_rolling_dir(self, tmp_path: Path): + d = tmp_path / "eventlog_v2_app-1" d.mkdir() - (d / "application_1.log").write_bytes(b"") - with pytest.raises(UnsupportedInputError): - _resolve_event_log_files(CspPath(str(d))) + (d / "events_1_app-1.zstd").write_bytes(b"") + source, files = _resolve_event_log_files(CspPath(f"{d}/")) + assert source.rstrip("/") == str(d) + assert [p.base_name() for p in files] == ["events_1_app-1.zstd"] - def test_empty_dir_raises(self, tmp_path: Path): - d = tmp_path / "blank" + def test_empty_oss_rolling_dir_raises(self, tmp_path: Path): + d = tmp_path / "eventlog_v2_app-1" d.mkdir() + (d / "appstatus_app-1.inprogress").write_bytes(b"") with pytest.raises(UnsupportedInputError): _resolve_event_log_files(CspPath(str(d))) @@ -101,10 +87,11 @@ def test_empty_dir_raises(self, tmp_path: Path): class TestResolveUnsupportedShapes: """Test that unsupported directory shapes raise UnsupportedInputError.""" - def test_spark_native_rolling_dir_raises(self, tmp_path: Path): - d = tmp_path / "eventlog_v2_local-1623876083964" + def test_non_oss_rolling_dir_raises(self, tmp_path: Path): + d = tmp_path / "non_oss_rolling" d.mkdir() - (d / "events_1_local-1623876083964").write_bytes(b"") + (d / "eventlog-2021-06-14--18-00.gz").write_bytes(b"") + (d / "eventlog").write_bytes(b"") with pytest.raises(UnsupportedInputError): _resolve_event_log_files(CspPath(str(d))) diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_scanner.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_scanner.py index 90c243c71..53c053514 100644 --- 
a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_scanner.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_scanner.py @@ -26,8 +26,6 @@ from spark_rapids_tools.tools.eventlog_detector.types import Termination -# ---------- Line builders ---------- - def env_update(props: dict) -> str: return json.dumps( { @@ -54,6 +52,18 @@ def app_start(app_id: str = "app-1", app_name: str = "App") -> str: ) +def rapids_build_info() -> str: + return json.dumps( + { + "Event": "com.nvidia.spark.rapids.SparkRapidsBuildInfoEvent", + "sparkRapidsBuildInfo": {"version": "24.06.0"}, + "sparkRapidsJniBuildInfo": {}, + "cudfBuildInfo": {}, + "sparkRapidsPrivateBuildInfo": {}, + } + ) + + def sql_exec_start(modified_configs: dict) -> str: return json.dumps( { @@ -69,11 +79,16 @@ def sql_exec_start(modified_configs: dict) -> str: ) -# ---------- Tests for _scan_events (single stream) ---------- - class TestScanEvents: """Tests for _scan_events scanning a single event stream.""" + def test_build_info_event_is_decisive_before_env_update(self): + lines = iter([log_start(), rapids_build_info(), app_start()]) + result = _scan_events(lines, budget=100) + assert result.rapids_build_info_seen is True + assert result.termination is Termination.DECISIVE + assert result.events_scanned == 2 + def test_env_update_with_gpu_is_decisive(self): lines = iter( [ @@ -88,7 +103,7 @@ def test_env_update_with_gpu_is_decisive(self): assert result.spark_version == "3.5.1" assert result.termination is Termination.DECISIVE - def test_env_update_cpu_then_sql_start_flips_to_gpu(self): + def test_cpu_fast_path_stops_at_env_update_by_default(self): lines = iter( [ log_start(), @@ -98,14 +113,29 @@ def test_env_update_cpu_then_sql_start_flips_to_gpu(self): ] ) result = _scan_events(lines, budget=100) - assert result.termination is Termination.DECISIVE + assert result.env_update_seen is True + assert result.termination is Termination.CPU_FAST_PATH + assert result.events_scanned == 3 - def test_cpu_only_to_eof_is_exhausted(self): + def test_cpu_fast_path_can_be_disabled(self): lines = iter([log_start(), app_start(), env_update({"spark.master": "local"})]) - result = _scan_events(lines, budget=100) + result = _scan_events(lines, budget=100, allow_cpu_fast_path=False) assert result.env_update_seen is True assert result.termination is Termination.EXHAUSTED + def test_fast_path_ignored_when_rapids_marker_present(self): + lines = iter( + [ + env_update({ + "spark.plugins": "com.nvidia.spark.SQLPlugin", + "spark.rapids.sql.enabled": "false", + }), + sql_exec_start({"spark.rapids.sql.enabled": "true"}), + ] + ) + result = _scan_events(lines, budget=100) + assert result.termination is Termination.DECISIVE + def test_no_env_update_within_budget_is_cap_hit(self): # Budget less than the number of events, none of them env-update. lines = iter([log_start()] * 5) @@ -143,11 +173,23 @@ def test_later_sql_start_overwrites_earlier_property(self): ) result = _scan_events(lines, budget=100) assert result.termination is Termination.DECISIVE - # Final accumulated props reflect the merge. 
assert result.spark_properties["spark.rapids.sql.enabled"] == "true" + def test_sql_start_classifies_after_full_modified_config_merge(self): + lines = iter( + [ + env_update({"spark.rapids.sql.enabled": "false"}), + sql_exec_start({ + "spark.plugins": "com.nvidia.spark.SQLPlugin", + "spark.rapids.sql.enabled": "false", + }), + ] + ) + result = _scan_events(lines, budget=100) + assert result.termination is Termination.EXHAUSTED + assert result.spark_properties["spark.plugins"] == "com.nvidia.spark.SQLPlugin" + assert result.spark_properties["spark.rapids.sql.enabled"] == "false" -# ---------- Tests for _scan_events_across (multi-file) ---------- def _write(path: Path, lines: List[str]) -> CspPath: path.write_text("\n".join(lines) + "\n", encoding="utf-8") @@ -159,22 +201,45 @@ class TestScanEventsAcross: def test_gpu_signal_in_second_file_is_decisive(self, tmp_path): f1 = _write( - tmp_path / "eventlog-2021-06-14--18-00", - [log_start(), app_start(), env_update({"spark.master": "local"})], + tmp_path / "events_1_app-1", + [log_start(), app_start(), env_update({"spark.rapids.sql.enabled": "false"})], + ) + f2 = _write(tmp_path / "events_2_app-1", [rapids_build_info()]) + result = _scan_events_across([f1, f2], budget=100) + assert result.termination is Termination.DECISIVE + assert result.last_scanned_path == str(f2) + + def test_cpu_fast_path_applies_across_files_when_no_rapids_markers(self, tmp_path): + f1 = _write(tmp_path / "events_1_app-1", [env_update({"spark.master": "local"})]) + f2 = _write(tmp_path / "events_2_app-1", [sql_exec_start({"spark.plugins": "com.nvidia.spark.SQLPlugin"})]) + result = _scan_events_across([f1, f2], budget=100) + assert result.termination is Termination.CPU_FAST_PATH + assert result.last_scanned_path == str(f1) + + def test_cpu_fast_path_skips_when_rapids_marker_present_across_files(self, tmp_path): + f1 = _write( + tmp_path / "events_1_app-1", + [env_update({"spark.rapids.sql.enabled": "false"})], ) f2 = _write( - tmp_path / "eventlog-2021-06-14--20-00", - [sql_exec_start({"spark.plugins": "com.nvidia.spark.SQLPlugin"})], + tmp_path / "events_2_app-1", + [sql_exec_start({ + "spark.plugins": "com.nvidia.spark.SQLPlugin", + "spark.rapids.sql.enabled": "true", + })], ) result = _scan_events_across([f1, f2], budget=100) assert result.termination is Termination.DECISIVE + assert result.last_scanned_path == str(f2) def test_shared_budget_applied_across_files(self, tmp_path): # 3 events in first file, 3 in second. Budget = 4. Second file stops # after one event, before any GPU signal. 
- f1 = _write(tmp_path / "a", [log_start(), app_start(), env_update({"spark.master": "local"})]) + f1 = _write(tmp_path / "events_1_app-1", [log_start(), app_start(), env_update({ + "spark.rapids.sql.enabled": "false", + })]) f2 = _write( - tmp_path / "b", + tmp_path / "events_2_app-1", [ sql_exec_start({"spark.master": "still-cpu"}), sql_exec_start({"spark.plugins": "com.nvidia.spark.SQLPlugin"}), @@ -185,6 +250,6 @@ def test_shared_budget_applied_across_files(self, tmp_path): assert result.termination is Termination.CAP_HIT def test_all_files_exhausted_returns_exhausted(self, tmp_path): - f1 = _write(tmp_path / "a", [env_update({"spark.master": "local"})]) - result = _scan_events_across([f1], budget=100) + f1 = _write(tmp_path / "events_1_app-1", [env_update({"spark.master": "local"})]) + result = _scan_events_across([f1], budget=100, allow_cpu_fast_path=False) assert result.termination is Termination.EXHAUSTED diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py index 7e050d6f9..4039de7f0 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py @@ -21,36 +21,36 @@ DetectionResult, EventLogDetectionError, EventLogReadError, - Route, SparkRuntime, Termination, + ToolExecution, UnsupportedCompressionError, UnsupportedInputError, ) -class TestRoute: - """Test the Route string enum.""" +class TestToolExecution: + """Test the ToolExecution string enum.""" - def test_has_three_values(self): - assert {r.value for r in Route} == {"QUALIFICATION", "PROFILING", "UNKNOWN"} + def test_has_expected_values(self): + assert {r.value for r in ToolExecution} == { + "QUALIFICATION", + "PROFILING", + "UNKNOWN", + } def test_is_string_enum(self): # str subclass means aether can compare against plain strings. - assert Route.PROFILING == "PROFILING" + assert ToolExecution.PROFILING == "PROFILING" class TestSparkRuntime: - """Test the SparkRuntime string enum.""" + """Test the reduced SparkRuntime string enum.""" - def test_values_match_scala_enum_exactly(self): - # These strings match org.apache.spark.sql.rapids.tool.util.SparkRuntime - # which aether already persists in JobRun.spark_runtime. + def test_values_cover_spark_and_rapids_only(self): assert {r.value for r in SparkRuntime} == { "SPARK", "SPARK_RAPIDS", - "PHOTON", - "AURON", } def test_is_string_enum(self): @@ -60,8 +60,13 @@ def test_is_string_enum(self): class TestTermination: """Test the Termination enum modes.""" - def test_has_three_modes(self): - assert {t.name for t in Termination} == {"DECISIVE", "EXHAUSTED", "CAP_HIT"} + def test_has_expected_modes(self): + assert {t.name for t in Termination} == { + "DECISIVE", + "CPU_FAST_PATH", + "EXHAUSTED", + "CAP_HIT", + } class TestDetectionResult: @@ -69,7 +74,7 @@ class TestDetectionResult: def test_frozen_dataclass(self): result = DetectionResult( - route=Route.PROFILING, + tool_execution=ToolExecution.PROFILING, spark_runtime=SparkRuntime.SPARK_RAPIDS, app_id="app-1", spark_version="3.5.1", @@ -80,17 +85,17 @@ def test_frozen_dataclass(self): # Python raises FrozenInstanceError (a subclass of AttributeError) # when you try to assign to a field on a frozen dataclass. 
with pytest.raises(AttributeError): - result.route = Route.UNKNOWN # type: ignore[misc] + result.tool_execution = ToolExecution.UNKNOWN # type: ignore[misc] def test_structural_equality(self): kwargs = { - "route": Route.QUALIFICATION, + "tool_execution": ToolExecution.QUALIFICATION, "spark_runtime": SparkRuntime.SPARK, "app_id": "a", "spark_version": "3.5.1", "event_log_path": "/tmp/a", "source_path": "/tmp/a", - "reason": "walked full log, no GPU-family signal", + "reason": "walked full log, no RAPIDS signal", } assert DetectionResult(**kwargs) == DetectionResult(**kwargs) assert hash(DetectionResult(**kwargs)) == hash(DetectionResult(**kwargs)) @@ -100,7 +105,7 @@ def test_structural_equality(self): def test_accepts_optional_fields_as_none(self): result = DetectionResult( - route=Route.UNKNOWN, + tool_execution=ToolExecution.UNKNOWN, spark_runtime=None, app_id=None, spark_version=None, @@ -108,7 +113,7 @@ def test_accepts_optional_fields_as_none(self): source_path="/tmp/x", reason="no decisive signal within bounded scan", ) - assert result.route is Route.UNKNOWN + assert result.tool_execution is ToolExecution.UNKNOWN assert result.spark_runtime is None From e635db82522fcae71af22c90c673b7e026f4c678 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Mon, 27 Apr 2026 20:25:02 -0700 Subject: [PATCH 28/29] test: address event log detector review feedback Signed-off-by: Sayed Bilal Bari --- .../tools/eventlog_detector/classifier.py | 4 +- .../tools/eventlog_detector/detector.py | 12 ++-- .../tools/eventlog_detector/resolver.py | 11 ++-- .../tools/eventlog_detector/scanner.py | 30 ++++----- .../tools/eventlog_detector/stream.py | 2 +- .../eventlog_detector/test_classifier.py | 46 +++++++------ .../tools/eventlog_detector/test_detector.py | 12 +++- .../test_detector_fixtures.py | 2 + .../tools/eventlog_detector/test_markers.py | 41 ------------ .../tools/eventlog_detector/test_resolver.py | 26 ++++---- .../tools/eventlog_detector/test_scanner.py | 63 +++++++++--------- .../tools/eventlog_detector/test_stream.py | 22 +++---- .../tools/eventlog_detector/test_types.py | 64 ------------------- 13 files changed, 121 insertions(+), 214 deletions(-) delete mode 100644 user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_markers.py diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py index 4751dbb1c..d27a831ab 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/classifier.py @@ -52,7 +52,7 @@ def _is_spark_rapids(props: Mapping[str, str]) -> bool: return _parse_bool(raw, default=m.GPU_ENABLED_DEFAULT) -def _has_rapids_conf_markers(props: Mapping[str, str]) -> bool: +def has_rapids_conf_markers(props: Mapping[str, str]) -> bool: """Return true when properties contain any RAPIDS-related configuration. This is intentionally broader than ``_is_spark_rapids``. 
A disabled or @@ -65,7 +65,7 @@ def _has_rapids_conf_markers(props: Mapping[str, str]) -> bool: return m.GPU_ENABLED_KEY in props -def _classify_runtime(props: Mapping[str, str]) -> SparkRuntime: +def classify_runtime(props: Mapping[str, str]) -> SparkRuntime: """Classify accumulated Spark properties into the supported runtime enum.""" if _is_spark_rapids(props): return SparkRuntime.SPARK_RAPIDS diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/detector.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/detector.py index aff1fd4f0..bcf4de1ae 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/detector.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/detector.py @@ -17,9 +17,9 @@ from typing import Optional, Union from spark_rapids_tools.storagelib import CspPath -from spark_rapids_tools.tools.eventlog_detector.classifier import _classify_runtime -from spark_rapids_tools.tools.eventlog_detector.resolver import _resolve_event_log_files -from spark_rapids_tools.tools.eventlog_detector.scanner import _scan_events_across +from spark_rapids_tools.tools.eventlog_detector.classifier import classify_runtime +from spark_rapids_tools.tools.eventlog_detector.resolver import resolve_event_log_files +from spark_rapids_tools.tools.eventlog_detector.scanner import scan_events_across from spark_rapids_tools.tools.eventlog_detector.types import ( DetectionResult, SparkRuntime, @@ -52,9 +52,9 @@ def detect_spark_runtime( # would otherwise be stripped by CspPath normalisation). source_path = event_log if isinstance(event_log, str) else str(event_log) path = event_log if isinstance(event_log, CspPath) else CspPath(str(event_log)) - _, files = _resolve_event_log_files(path) + _, files = resolve_event_log_files(path) - scan = _scan_events_across( + scan = scan_events_across( files, budget=max_events_scanned, allow_cpu_fast_path=allow_cpu_fast_path, @@ -64,7 +64,7 @@ def detect_spark_runtime( if scan.rapids_build_info_seen: runtime = SparkRuntime.SPARK_RAPIDS elif scan.env_update_seen: - runtime = _classify_runtime(scan.spark_properties) + runtime = classify_runtime(scan.spark_properties) else: runtime = None diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/resolver.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/resolver.py index 1ea632c56..6b45de710 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/resolver.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/resolver.py @@ -25,7 +25,7 @@ _OSS_EVENT_FILE_PATTERN = re.compile(r"^events_(\d+)_.*") -def _parse_oss_event_file_index(name: str) -> Optional[int]: +def parse_oss_event_file_index(name: str) -> Optional[int]: """Return the numeric chunk index from ``events__...`` files.""" match = _OSS_EVENT_FILE_PATTERN.match(name) if match is None: @@ -34,7 +34,7 @@ def _parse_oss_event_file_index(name: str) -> Optional[int]: def _is_oss_event_log_file(path: CspPath) -> bool: - return _parse_oss_event_file_index(path.base_name()) is not None + return parse_oss_event_file_index(path.base_name()) is not None def _base_name_from_source(source: str) -> str: @@ -42,7 +42,7 @@ def _base_name_from_source(source: str) -> str: return source.rstrip("/").rsplit("/", 1)[-1] -def _resolve_event_log_files(path: CspPath) -> Tuple[str, List[CspPath]]: +def resolve_event_log_files(path: CspPath) -> Tuple[str, List[CspPath]]: """Resolve ``path`` to an ordered list of files to scan. 
Supported inputs are a single concrete file or an Apache Spark rolling @@ -62,7 +62,8 @@ def _resolve_event_log_files(path: CspPath) -> Tuple[str, List[CspPath]]: if not _base_name_from_source(source).startswith(m.OSS_EVENT_LOG_DIR_PREFIX): raise UnsupportedInputError( f"Directory {source} is not a supported input shape. Only single " - "files and Apache Spark rolling event-log directories are handled " + f"files and Apache Spark rolling event-log directories named " + f"{m.OSS_EVENT_LOG_DIR_PREFIX}* are handled " "here; use the full pipeline for other shapes." ) @@ -72,7 +73,7 @@ def _resolve_event_log_files(path: CspPath) -> Tuple[str, List[CspPath]]: event_files.sort( key=lambda f: ( - _parse_oss_event_file_index(f.base_name()) or 0, + parse_oss_event_file_index(f.base_name()) or 0, f.base_name(), ) ) diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/scanner.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/scanner.py index 4d15e3431..113e681a4 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/scanner.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/scanner.py @@ -25,10 +25,10 @@ from spark_rapids_tools.storagelib import CspPath from spark_rapids_tools.tools.eventlog_detector import markers as m from spark_rapids_tools.tools.eventlog_detector.classifier import ( - _classify_runtime, - _has_rapids_conf_markers, + classify_runtime, + has_rapids_conf_markers, ) -from spark_rapids_tools.tools.eventlog_detector.stream import _open_event_log_stream +from spark_rapids_tools.tools.eventlog_detector.stream import open_event_log_stream from spark_rapids_tools.tools.eventlog_detector.types import SparkRuntime, Termination @@ -36,7 +36,6 @@ class _ScanResult: spark_properties: Dict[str, str] = field(default_factory=dict) app_id: Optional[str] = None - app_name: Optional[str] = None spark_version: Optional[str] = None env_update_seen: bool = False rapids_build_info_seen: bool = False @@ -45,7 +44,7 @@ class _ScanResult: last_scanned_path: Optional[str] = None -def _scan_events( +def scan_events( lines: Iterable[str], *, budget: int, @@ -55,8 +54,8 @@ def _scan_events( """Scan one stream of lines, optionally continuing from a prior state. Terminates as ``DECISIVE`` on the first non-SPARK classification, - ``CAP_HIT`` when ``budget`` is exhausted, or ``EXHAUSTED`` when the - iterator runs out. + ``CPU_FAST_PATH`` after plain Spark startup properties, ``CAP_HIT`` when + ``budget`` is exhausted, or ``EXHAUSTED`` when the iterator runs out. 
""" result = state if state is not None else _ScanResult() @@ -91,11 +90,8 @@ def _scan_events( result.spark_version = version elif name == m.EVENT_APPLICATION_START: app_id = event.get("App ID") - app_name = event.get("App Name") if isinstance(app_id, str): result.app_id = app_id - if isinstance(app_name, str): - result.app_name = app_name elif name == m.EVENT_ENVIRONMENT_UPDATE: props = event.get("Spark Properties") or {} if isinstance(props, dict): @@ -103,11 +99,11 @@ def _scan_events( if isinstance(k, str) and isinstance(v, str): result.spark_properties[k] = v result.env_update_seen = True - runtime = _classify_runtime(result.spark_properties) + runtime = classify_runtime(result.spark_properties) if runtime is not SparkRuntime.SPARK: result.termination = Termination.DECISIVE return result - if allow_cpu_fast_path and not _has_rapids_conf_markers(result.spark_properties): + if allow_cpu_fast_path and not has_rapids_conf_markers(result.spark_properties): result.termination = Termination.CPU_FAST_PATH return result elif name in (m.EVENT_SQL_EXECUTION_START, m.EVENT_SQL_EXECUTION_START_SHORTNAME): @@ -116,8 +112,10 @@ def _scan_events( for k, v in modified.items(): if isinstance(k, str) and isinstance(v, str): result.spark_properties[k] = v + # Per-query configs refine startup properties; without env-update + # context they are not enough to classify the whole event log. if result.env_update_seen and ( - _classify_runtime(result.spark_properties) is not SparkRuntime.SPARK + classify_runtime(result.spark_properties) is not SparkRuntime.SPARK ): result.termination = Termination.DECISIVE return result @@ -126,7 +124,7 @@ def _scan_events( return result -def _scan_events_across( +def scan_events_across( files: List[CspPath], *, budget: int, @@ -139,8 +137,8 @@ def _scan_events_across( state.termination = Termination.CAP_HIT return state state.last_scanned_path = str(path) - with _open_event_log_stream(path) as lines: - state = _scan_events( + with open_event_log_stream(path) as lines: + state = scan_events( lines, budget=budget, allow_cpu_fast_path=allow_cpu_fast_path, diff --git a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py index 900f97150..719e994dc 100644 --- a/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py +++ b/user_tools/src/spark_rapids_tools/tools/eventlog_detector/stream.py @@ -58,7 +58,7 @@ def _classify_suffix(path: CspPath) -> str: @contextlib.contextmanager -def _open_event_log_stream(path: CspPath) -> Iterator[Iterator[str]]: +def open_event_log_stream(path: CspPath) -> Iterator[Iterator[str]]: suffix = _classify_suffix(path) if suffix not in _SUPPORTED_SUFFIXES: raise UnsupportedCompressionError( diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py index 8735c0985..57d99094b 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py @@ -18,8 +18,8 @@ import pytest from spark_rapids_tools.tools.eventlog_detector.classifier import ( - _classify_runtime, - _has_rapids_conf_markers, + classify_runtime, + has_rapids_conf_markers, ) from spark_rapids_tools.tools.eventlog_detector.types import SparkRuntime @@ -28,25 +28,38 @@ class TestEmptyProperties: """Test classification with an empty properties dict.""" def 
test_empty_props_is_spark(self): - assert _classify_runtime({}) is SparkRuntime.SPARK + assert classify_runtime({}) is SparkRuntime.SPARK class TestRapidsConfigMarkers: """Test marker presence checks used by the CPU fast path.""" def test_plain_spark_props_have_no_rapids_markers(self): - assert _has_rapids_conf_markers({"spark.master": "local"}) is False + assert has_rapids_conf_markers({"spark.master": "local"}) is False def test_disabled_rapids_plugin_still_counts_as_marker(self): props = { "spark.plugins": "com.nvidia.spark.SQLPlugin", "spark.rapids.sql.enabled": "false", } - assert _classify_runtime(props) is SparkRuntime.SPARK - assert _has_rapids_conf_markers(props) is True + assert has_rapids_conf_markers(props) is True def test_rapids_enabled_key_alone_counts_as_marker(self): - assert _has_rapids_conf_markers({"spark.rapids.sql.enabled": "true"}) is True + assert has_rapids_conf_markers({"spark.rapids.sql.enabled": "true"}) is True + + @pytest.mark.parametrize("props", [ + {"spark.plugins": "com.nvidia.spark.SQLPlugin"}, + {"spark.plugins": "foo,com.nvidia.spark.SQLPlugin"}, + { + "spark.plugins": "com.nvidia.spark.SQLPlugin", + "spark.rapids.sql.enabled": "true", + }, + ]) + def test_rapids_plugin_configs_count_as_markers(self, props): + assert has_rapids_conf_markers(props) is True + + def test_lookalike_plugin_class_does_not_count_as_marker(self): + assert has_rapids_conf_markers({"spark.plugins": "foo.nvidia.spark.SQLPlugin"}) is False class TestSparkRapids: @@ -54,34 +67,27 @@ class TestSparkRapids: def test_plugin_and_default_enabled(self): props = {"spark.plugins": "foo,com.nvidia.spark.SQLPlugin,bar"} - assert _classify_runtime(props) is SparkRuntime.SPARK_RAPIDS + assert classify_runtime(props) is SparkRuntime.SPARK_RAPIDS def test_plugin_with_enabled_true(self): props = { "spark.plugins": "com.nvidia.spark.SQLPlugin", "spark.rapids.sql.enabled": "true", } - assert _classify_runtime(props) is SparkRuntime.SPARK_RAPIDS + assert classify_runtime(props) is SparkRuntime.SPARK_RAPIDS def test_plugin_with_enabled_false_demotes_to_spark(self): props = { "spark.plugins": "com.nvidia.spark.SQLPlugin", "spark.rapids.sql.enabled": "false", } - assert _classify_runtime(props) is SparkRuntime.SPARK + assert classify_runtime(props) is SparkRuntime.SPARK def test_enabled_true_without_plugin_is_still_spark(self): props = {"spark.rapids.sql.enabled": "true"} - assert _classify_runtime(props) is SparkRuntime.SPARK - - def test_unparseable_enabled_defaults_to_true(self): - props = { - "spark.plugins": "com.nvidia.spark.SQLPlugin", - "spark.rapids.sql.enabled": "not-a-bool", - } - assert _classify_runtime(props) is SparkRuntime.SPARK_RAPIDS + assert classify_runtime(props) is SparkRuntime.SPARK - @pytest.mark.parametrize("bogus_value", ["no", "0", "yes", "1", "", "maybe"]) + @pytest.mark.parametrize("bogus_value", ["no", "0", "yes", "1", "", "maybe", "not-a-bool"]) def test_non_toboolean_values_default_to_true_matching_scala(self, bogus_value): # Scala: Try { "no".toBoolean }.getOrElse(true) == true because # "no" is not parseable. The Python classifier must do the same. 
@@ -89,4 +95,4 @@ def test_non_toboolean_values_default_to_true_matching_scala(self, bogus_value): "spark.plugins": "com.nvidia.spark.SQLPlugin", "spark.rapids.sql.enabled": bogus_value, } - assert _classify_runtime(props) is SparkRuntime.SPARK_RAPIDS + assert classify_runtime(props) is SparkRuntime.SPARK_RAPIDS diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py index 9867d9f22..27998a882 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py @@ -140,10 +140,16 @@ def test_env_update_without_rapids_markers_returns_qualification(self, tmp_path) def test_fast_path_can_be_disabled(self, tmp_path): log = tmp_path / "eventlog" - _write_plain_log(log, [env_update({"spark.master": "local"})]) + _write_plain_log( + log, + [ + env_update({"spark.master": "local"}), + sql_exec_start({"spark.plugins": "com.nvidia.spark.SQLPlugin"}), + ], + ) result = detect_spark_runtime(str(log), allow_cpu_fast_path=False) - assert result.tool_execution is ToolExecution.QUALIFICATION - assert "walked full log" in result.reason.lower() + assert result.tool_execution is ToolExecution.PROFILING + assert result.spark_runtime is SparkRuntime.SPARK_RAPIDS def test_fast_path_does_not_fire_when_rapids_marker_present(self, tmp_path): log = tmp_path / "eventlog" diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector_fixtures.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector_fixtures.py index c36814b0f..97e90afd6 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector_fixtures.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector_fixtures.py @@ -39,6 +39,8 @@ SparkRuntime.SPARK_RAPIDS, ), ( + # The Profiling tool can process CPU logs; this fixture lives under + # profiling resources but has no RAPIDS runtime markers. "spark-events-profiling/eventlog_dsv2.zstd", ToolExecution.QUALIFICATION, SparkRuntime.SPARK, diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_markers.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_markers.py deleted file mode 100644 index cb4823f2e..000000000 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_markers.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2026, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Unit tests for ``eventlog_detector.markers``.""" -# pylint: disable=too-few-public-methods # test classes naturally have few methods - -from spark_rapids_tools.tools.eventlog_detector import markers as m - - -class TestGpuMarkers: - """Test RAPIDS plugin marker constants.""" - - def test_plugin_substring_exact(self): - assert m.GPU_PLUGIN_CLASS_SUBSTRING == "com.nvidia.spark.SQLPlugin" - - def test_gpu_toggle_key_exact(self): - assert m.GPU_ENABLED_KEY == "spark.rapids.sql.enabled" - - def test_build_info_event_exact(self): - assert m.EVENT_SPARK_RAPIDS_BUILD_INFO == "com.nvidia.spark.rapids.SparkRapidsBuildInfoEvent" - - -class TestOssRollingMarkers: - """Test Apache Spark rolling event-log layout constants.""" - - def test_eventlog_v2_prefix_exact(self): - assert m.OSS_EVENT_LOG_DIR_PREFIX == "eventlog_v2_" - - def test_events_file_prefix_exact(self): - assert m.OSS_EVENT_LOG_FILE_PREFIX == "events_" diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py index f321a7251..d15af7155 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py @@ -21,8 +21,8 @@ from spark_rapids_tools.storagelib import CspPath from spark_rapids_tools.tools.eventlog_detector.resolver import ( - _parse_oss_event_file_index, - _resolve_event_log_files, + parse_oss_event_file_index, + resolve_event_log_files, ) from spark_rapids_tools.tools.eventlog_detector.types import UnsupportedInputError @@ -31,12 +31,12 @@ class TestOssEventFileIndex: """Test Apache Spark rolling event file index parsing.""" def test_events_file_index_parses(self): - assert _parse_oss_event_file_index("events_1_app-1.zstd") == 1 - assert _parse_oss_event_file_index("events_10_app-1") == 10 + assert parse_oss_event_file_index("events_1_app-1.zstd") == 1 + assert parse_oss_event_file_index("events_10_app-1") == 10 def test_non_events_file_returns_none(self): - assert _parse_oss_event_file_index("appstatus_app-1.inprogress") is None - assert _parse_oss_event_file_index("eventlog") is None + assert parse_oss_event_file_index("appstatus_app-1.inprogress") is None + assert parse_oss_event_file_index("eventlog") is None class TestResolveSingleFile: @@ -45,7 +45,7 @@ class TestResolveSingleFile: def test_single_file_returns_single_element_list(self, tmp_path: Path): f = tmp_path / "eventlog.zstd" f.write_bytes(b"x") - source, files = _resolve_event_log_files(CspPath(str(f))) + source, files = resolve_event_log_files(CspPath(str(f))) assert source == str(f) assert [p.base_name() for p in files] == ["eventlog.zstd"] @@ -60,7 +60,7 @@ def test_orders_event_chunks_by_numeric_index(self, tmp_path: Path): (d / "events_2_app-1.zstd").write_bytes(b"") (d / "events_1_app-1.zstd").write_bytes(b"") (d / "appstatus_app-1.inprogress").write_bytes(b"") - source, files = _resolve_event_log_files(CspPath(str(d))) + source, files = resolve_event_log_files(CspPath(str(d))) assert source == str(d) assert [p.base_name() for p in files] == [ "events_1_app-1.zstd", @@ -72,7 +72,7 @@ def test_accepts_trailing_slash_on_rolling_dir(self, tmp_path: Path): d = tmp_path / "eventlog_v2_app-1" d.mkdir() (d / "events_1_app-1.zstd").write_bytes(b"") - source, files = _resolve_event_log_files(CspPath(f"{d}/")) + source, files = resolve_event_log_files(CspPath(f"{d}/")) assert source.rstrip("/") == str(d) assert [p.base_name() for p in 
files] == ["events_1_app-1.zstd"] @@ -81,7 +81,7 @@ def test_empty_oss_rolling_dir_raises(self, tmp_path: Path): d.mkdir() (d / "appstatus_app-1.inprogress").write_bytes(b"") with pytest.raises(UnsupportedInputError): - _resolve_event_log_files(CspPath(str(d))) + resolve_event_log_files(CspPath(str(d))) class TestResolveUnsupportedShapes: @@ -92,8 +92,8 @@ def test_non_oss_rolling_dir_raises(self, tmp_path: Path): d.mkdir() (d / "eventlog-2021-06-14--18-00.gz").write_bytes(b"") (d / "eventlog").write_bytes(b"") - with pytest.raises(UnsupportedInputError): - _resolve_event_log_files(CspPath(str(d))) + with pytest.raises(UnsupportedInputError, match="eventlog_v2_\\*"): + resolve_event_log_files(CspPath(str(d))) def test_generic_multi_app_dir_raises(self, tmp_path: Path): d = tmp_path / "multi" @@ -101,4 +101,4 @@ def test_generic_multi_app_dir_raises(self, tmp_path: Path): (d / "app-1.zstd").write_bytes(b"") (d / "app-2.zstd").write_bytes(b"") with pytest.raises(UnsupportedInputError): - _resolve_event_log_files(CspPath(str(d))) + resolve_event_log_files(CspPath(str(d))) diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_scanner.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_scanner.py index 53c053514..a30e0d8f7 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_scanner.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_scanner.py @@ -20,8 +20,8 @@ from spark_rapids_tools.storagelib import CspPath from spark_rapids_tools.tools.eventlog_detector.scanner import ( - _scan_events, - _scan_events_across, + scan_events, + scan_events_across, ) from spark_rapids_tools.tools.eventlog_detector.types import Termination @@ -80,16 +80,18 @@ def sql_exec_start(modified_configs: dict) -> str: class TestScanEvents: - """Tests for _scan_events scanning a single event stream.""" + """Tests for scan_events scanning a single event stream.""" def test_build_info_event_is_decisive_before_env_update(self): lines = iter([log_start(), rapids_build_info(), app_start()]) - result = _scan_events(lines, budget=100) + result = scan_events(lines, budget=100) assert result.rapids_build_info_seen is True assert result.termination is Termination.DECISIVE assert result.events_scanned == 2 def test_env_update_with_gpu_is_decisive(self): + # Scala tools and the RAPIDS plugin both default spark.rapids.sql.enabled to true, + # so the plugin marker alone is enough to classify the runtime as RAPIDS. 
lines = iter( [ log_start(), @@ -97,7 +99,7 @@ def test_env_update_with_gpu_is_decisive(self): env_update({"spark.plugins": "com.nvidia.spark.SQLPlugin"}), ] ) - result = _scan_events(lines, budget=100) + result = scan_events(lines, budget=100) assert result.env_update_seen is True assert result.app_id == "app-1" assert result.spark_version == "3.5.1" @@ -112,16 +114,24 @@ def test_cpu_fast_path_stops_at_env_update_by_default(self): sql_exec_start({"spark.plugins": "com.nvidia.spark.SQLPlugin"}), ] ) - result = _scan_events(lines, budget=100) + result = scan_events(lines, budget=100) assert result.env_update_seen is True assert result.termination is Termination.CPU_FAST_PATH assert result.events_scanned == 3 def test_cpu_fast_path_can_be_disabled(self): - lines = iter([log_start(), app_start(), env_update({"spark.master": "local"})]) - result = _scan_events(lines, budget=100, allow_cpu_fast_path=False) + lines = iter( + [ + log_start(), + app_start(), + env_update({"spark.master": "local"}), + sql_exec_start({"spark.plugins": "com.nvidia.spark.SQLPlugin"}), + ] + ) + result = scan_events(lines, budget=100, allow_cpu_fast_path=False) assert result.env_update_seen is True - assert result.termination is Termination.EXHAUSTED + assert result.termination is Termination.DECISIVE + assert result.events_scanned == 4 def test_fast_path_ignored_when_rapids_marker_present(self): lines = iter( @@ -133,19 +143,20 @@ def test_fast_path_ignored_when_rapids_marker_present(self): sql_exec_start({"spark.rapids.sql.enabled": "true"}), ] ) - result = _scan_events(lines, budget=100) + result = scan_events(lines, budget=100) assert result.termination is Termination.DECISIVE + assert result.spark_properties["spark.rapids.sql.enabled"] == "true" def test_no_env_update_within_budget_is_cap_hit(self): # Budget less than the number of events, none of them env-update. lines = iter([log_start()] * 5) - result = _scan_events(lines, budget=2) + result = scan_events(lines, budget=2) assert result.env_update_seen is False assert result.termination is Termination.CAP_HIT def test_no_env_update_to_eof_is_exhausted_without_env(self): lines = iter([log_start(), app_start()]) - result = _scan_events(lines, budget=100) + result = scan_events(lines, budget=100) assert result.env_update_seen is False assert result.termination is Termination.EXHAUSTED @@ -159,22 +170,10 @@ def test_malformed_json_lines_are_skipped(self): env_update({"spark.master": "local"}), ] ) - result = _scan_events(lines, budget=100) + result = scan_events(lines, budget=100) assert result.env_update_seen is True assert result.app_id == "app-1" - def test_later_sql_start_overwrites_earlier_property(self): - # last-write-wins merge, matching CacheablePropsHandler.mergeModifiedConfigs. 
- lines = iter( - [ - env_update({"spark.rapids.sql.enabled": "false", "spark.plugins": "com.nvidia.spark.SQLPlugin"}), - sql_exec_start({"spark.rapids.sql.enabled": "true"}), - ] - ) - result = _scan_events(lines, budget=100) - assert result.termination is Termination.DECISIVE - assert result.spark_properties["spark.rapids.sql.enabled"] == "true" - def test_sql_start_classifies_after_full_modified_config_merge(self): lines = iter( [ @@ -185,7 +184,7 @@ def test_sql_start_classifies_after_full_modified_config_merge(self): }), ] ) - result = _scan_events(lines, budget=100) + result = scan_events(lines, budget=100) assert result.termination is Termination.EXHAUSTED assert result.spark_properties["spark.plugins"] == "com.nvidia.spark.SQLPlugin" assert result.spark_properties["spark.rapids.sql.enabled"] == "false" @@ -197,7 +196,7 @@ def _write(path: Path, lines: List[str]) -> CspPath: class TestScanEventsAcross: - """Tests for _scan_events_across scanning across multiple files.""" + """Tests for scan_events_across scanning across multiple files.""" def test_gpu_signal_in_second_file_is_decisive(self, tmp_path): f1 = _write( @@ -205,14 +204,14 @@ def test_gpu_signal_in_second_file_is_decisive(self, tmp_path): [log_start(), app_start(), env_update({"spark.rapids.sql.enabled": "false"})], ) f2 = _write(tmp_path / "events_2_app-1", [rapids_build_info()]) - result = _scan_events_across([f1, f2], budget=100) + result = scan_events_across([f1, f2], budget=100) assert result.termination is Termination.DECISIVE assert result.last_scanned_path == str(f2) def test_cpu_fast_path_applies_across_files_when_no_rapids_markers(self, tmp_path): f1 = _write(tmp_path / "events_1_app-1", [env_update({"spark.master": "local"})]) f2 = _write(tmp_path / "events_2_app-1", [sql_exec_start({"spark.plugins": "com.nvidia.spark.SQLPlugin"})]) - result = _scan_events_across([f1, f2], budget=100) + result = scan_events_across([f1, f2], budget=100) assert result.termination is Termination.CPU_FAST_PATH assert result.last_scanned_path == str(f1) @@ -228,7 +227,7 @@ def test_cpu_fast_path_skips_when_rapids_marker_present_across_files(self, tmp_p "spark.rapids.sql.enabled": "true", })], ) - result = _scan_events_across([f1, f2], budget=100) + result = scan_events_across([f1, f2], budget=100) assert result.termination is Termination.DECISIVE assert result.last_scanned_path == str(f2) @@ -246,10 +245,10 @@ def test_shared_budget_applied_across_files(self, tmp_path): sql_exec_start({"x": "y"}), ], ) - result = _scan_events_across([f1, f2], budget=4) + result = scan_events_across([f1, f2], budget=4) assert result.termination is Termination.CAP_HIT def test_all_files_exhausted_returns_exhausted(self, tmp_path): f1 = _write(tmp_path / "events_1_app-1", [env_update({"spark.master": "local"})]) - result = _scan_events_across([f1], budget=100, allow_cpu_fast_path=False) + result = scan_events_across([f1], budget=100, allow_cpu_fast_path=False) assert result.termination is Termination.EXHAUSTED diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py index 098c611e3..ca8848a72 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py @@ -22,7 +22,7 @@ import zstandard as zstd from spark_rapids_tools.storagelib import CspPath -from spark_rapids_tools.tools.eventlog_detector.stream import _open_event_log_stream 
+from spark_rapids_tools.tools.eventlog_detector.stream import open_event_log_stream from spark_rapids_tools.tools.eventlog_detector.types import ( EventLogReadError, UnsupportedCompressionError, @@ -76,7 +76,7 @@ class TestPlainStream: """Test streaming plain-text event logs.""" def test_yields_all_lines(self, plain_file): # pylint: disable=redefined-outer-name - with _open_event_log_stream(plain_file) as lines: + with open_event_log_stream(plain_file) as lines: collected = list(lines) assert collected == SAMPLE_LINES @@ -85,7 +85,7 @@ class TestGzipStream: """Test streaming gzip-compressed event logs.""" def test_yields_all_lines(self, gz_file): # pylint: disable=redefined-outer-name - with _open_event_log_stream(gz_file) as lines: + with open_event_log_stream(gz_file) as lines: collected = list(lines) assert collected == SAMPLE_LINES @@ -94,14 +94,14 @@ class TestZstdStream: """Test streaming zstd-compressed event logs.""" def test_yields_all_lines(self, zstd_file): # pylint: disable=redefined-outer-name - with _open_event_log_stream(zstd_file) as lines: + with open_event_log_stream(zstd_file) as lines: collected = list(lines) assert collected == SAMPLE_LINES def test_zst_short_suffix_also_works(self, tmp_path): p = tmp_path / "eventlog.zst" _write_zstd(p) - with _open_event_log_stream(CspPath(str(p))) as lines: + with open_event_log_stream(CspPath(str(p))) as lines: collected = list(lines) assert collected == SAMPLE_LINES @@ -113,28 +113,28 @@ def test_lz4_raises(self, tmp_path): p = tmp_path / "eventlog.lz4" p.write_bytes(b"not-real-lz4") with pytest.raises(UnsupportedCompressionError): - with _open_event_log_stream(CspPath(str(p))) as _: + with open_event_log_stream(CspPath(str(p))) as _: pass def test_snappy_raises(self, tmp_path): p = tmp_path / "eventlog.snappy" p.write_bytes(b"not-real-snappy") with pytest.raises(UnsupportedCompressionError): - with _open_event_log_stream(CspPath(str(p))) as _: + with open_event_log_stream(CspPath(str(p))) as _: pass def test_lzf_raises(self, tmp_path): p = tmp_path / "eventlog.lzf" p.write_bytes(b"not-real-lzf") with pytest.raises(UnsupportedCompressionError): - with _open_event_log_stream(CspPath(str(p))) as _: + with open_event_log_stream(CspPath(str(p))) as _: pass def test_unknown_suffix_raises(self, tmp_path): p = tmp_path / "eventlog.weirdcodec" p.write_bytes(b"some-bytes") with pytest.raises(UnsupportedCompressionError): - with _open_event_log_stream(CspPath(str(p))) as _: + with open_event_log_stream(CspPath(str(p))) as _: pass @@ -144,7 +144,7 @@ class TestIoFailure: def test_missing_file_raises_read_error(self, tmp_path): p = tmp_path / "does-not-exist" with pytest.raises(EventLogReadError): - with _open_event_log_stream(CspPath(str(p))) as lines: + with open_event_log_stream(CspPath(str(p))) as lines: next(iter(lines)) def test_caller_side_exception_is_not_reclassified(self, plain_file): # pylint: disable=redefined-outer-name @@ -154,5 +154,5 @@ class _MarkerError(RuntimeError): pass with pytest.raises(_MarkerError): - with _open_event_log_stream(plain_file): + with open_event_log_stream(plain_file): raise _MarkerError("not an I/O failure") diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py index 4039de7f0..f470ac8af 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py @@ -15,14 +15,10 @@ """Unit 
tests for ``eventlog_detector.types``.""" # pylint: disable=too-few-public-methods # test classes naturally have few methods -import pytest - from spark_rapids_tools.tools.eventlog_detector.types import ( - DetectionResult, EventLogDetectionError, EventLogReadError, SparkRuntime, - Termination, ToolExecution, UnsupportedCompressionError, UnsupportedInputError, @@ -57,66 +53,6 @@ def test_is_string_enum(self): assert SparkRuntime.SPARK_RAPIDS == "SPARK_RAPIDS" -class TestTermination: - """Test the Termination enum modes.""" - - def test_has_expected_modes(self): - assert {t.name for t in Termination} == { - "DECISIVE", - "CPU_FAST_PATH", - "EXHAUSTED", - "CAP_HIT", - } - - -class TestDetectionResult: - """Test DetectionResult dataclass semantics.""" - - def test_frozen_dataclass(self): - result = DetectionResult( - tool_execution=ToolExecution.PROFILING, - spark_runtime=SparkRuntime.SPARK_RAPIDS, - app_id="app-1", - spark_version="3.5.1", - event_log_path="/tmp/x", - source_path="/tmp/x", - reason="decisive: classified as SPARK_RAPIDS", - ) - # Python raises FrozenInstanceError (a subclass of AttributeError) - # when you try to assign to a field on a frozen dataclass. - with pytest.raises(AttributeError): - result.tool_execution = ToolExecution.UNKNOWN # type: ignore[misc] - - def test_structural_equality(self): - kwargs = { - "tool_execution": ToolExecution.QUALIFICATION, - "spark_runtime": SparkRuntime.SPARK, - "app_id": "a", - "spark_version": "3.5.1", - "event_log_path": "/tmp/a", - "source_path": "/tmp/a", - "reason": "walked full log, no RAPIDS signal", - } - assert DetectionResult(**kwargs) == DetectionResult(**kwargs) - assert hash(DetectionResult(**kwargs)) == hash(DetectionResult(**kwargs)) - # Distinct payloads compare unequal. - other = DetectionResult(**{**kwargs, "app_id": "b"}) - assert DetectionResult(**kwargs) != other - - def test_accepts_optional_fields_as_none(self): - result = DetectionResult( - tool_execution=ToolExecution.UNKNOWN, - spark_runtime=None, - app_id=None, - spark_version=None, - event_log_path="/tmp/x", - source_path="/tmp/x", - reason="no decisive signal within bounded scan", - ) - assert result.tool_execution is ToolExecution.UNKNOWN - assert result.spark_runtime is None - - class TestExceptionHierarchy: """Test that all detector exceptions form a coherent hierarchy.""" From 11ae2ee38a3c80035a54db823ea9a0f5f8bb5684 Mon Sep 17 00:00:00 2001 From: Sayed Bilal Bari Date: Tue, 28 Apr 2026 14:03:27 -0700 Subject: [PATCH 29/29] test: trim event log detector coverage overlap Signed-off-by: Sayed Bilal Bari --- .../eventlog_detector/test_classifier.py | 42 +++++++------------ .../tools/eventlog_detector/test_detector.py | 33 --------------- .../tools/eventlog_detector/test_resolver.py | 8 ---- .../tools/eventlog_detector/test_stream.py | 26 ++---------- .../tools/eventlog_detector/test_types.py | 7 ---- 5 files changed, 19 insertions(+), 97 deletions(-) diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py index 57d99094b..61ca65891 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_classifier.py @@ -34,32 +34,22 @@ def test_empty_props_is_spark(self): class TestRapidsConfigMarkers: """Test marker presence checks used by the CPU fast path.""" - def test_plain_spark_props_have_no_rapids_markers(self): - assert 
has_rapids_conf_markers({"spark.master": "local"}) is False - - def test_disabled_rapids_plugin_still_counts_as_marker(self): - props = { - "spark.plugins": "com.nvidia.spark.SQLPlugin", - "spark.rapids.sql.enabled": "false", - } - assert has_rapids_conf_markers(props) is True - - def test_rapids_enabled_key_alone_counts_as_marker(self): - assert has_rapids_conf_markers({"spark.rapids.sql.enabled": "true"}) is True - - @pytest.mark.parametrize("props", [ - {"spark.plugins": "com.nvidia.spark.SQLPlugin"}, - {"spark.plugins": "foo,com.nvidia.spark.SQLPlugin"}, - { - "spark.plugins": "com.nvidia.spark.SQLPlugin", - "spark.rapids.sql.enabled": "true", - }, - ]) - def test_rapids_plugin_configs_count_as_markers(self, props): - assert has_rapids_conf_markers(props) is True - - def test_lookalike_plugin_class_does_not_count_as_marker(self): - assert has_rapids_conf_markers({"spark.plugins": "foo.nvidia.spark.SQLPlugin"}) is False + @pytest.mark.parametrize( + "props,expected", + [ + ({"spark.master": "local"}, False), + ({"spark.rapids.sql.enabled": "true"}, True), + ({"spark.plugins": "com.nvidia.spark.SQLPlugin"}, True), + ({"spark.plugins": "foo,com.nvidia.spark.SQLPlugin"}, True), + ({ + "spark.plugins": "com.nvidia.spark.SQLPlugin", + "spark.rapids.sql.enabled": "true", + }, True), + ({"spark.plugins": "foo.nvidia.spark.SQLPlugin"}, False), + ], + ) + def test_detects_rapids_marker_configs(self, props, expected): + assert has_rapids_conf_markers(props) is expected class TestSparkRapids: diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py index 27998a882..54400ee8a 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_detector.py @@ -151,22 +151,6 @@ def test_fast_path_can_be_disabled(self, tmp_path): assert result.tool_execution is ToolExecution.PROFILING assert result.spark_runtime is SparkRuntime.SPARK_RAPIDS - def test_fast_path_does_not_fire_when_rapids_marker_present(self, tmp_path): - log = tmp_path / "eventlog" - _write_plain_log( - log, - [ - env_update({ - "spark.plugins": "com.nvidia.spark.SQLPlugin", - "spark.rapids.sql.enabled": "false", - }), - sql_exec_start({"spark.rapids.sql.enabled": "true"}), - ], - ) - result = detect_spark_runtime(str(log)) - assert result.tool_execution is ToolExecution.PROFILING - assert result.spark_runtime is SparkRuntime.SPARK_RAPIDS - class TestCapHit: """Test detection when the event budget is exhausted before env-update.""" @@ -226,23 +210,6 @@ def test_non_oss_rolling_dir_raises(self, tmp_path): detect_spark_runtime(CspPath(str(d))) -class TestReasonStrings: - """Test the human-readable reason field on DetectionResult.""" - - def test_reason_mentions_runtime_on_profiling(self, tmp_path): - log = tmp_path / "eventlog" - _write_plain_log(log, [build_info()]) - result = detect_spark_runtime(str(log)) - assert "SPARK_RAPIDS" in result.reason - - def test_reason_mentions_full_log_on_strict_qualification(self, tmp_path): - log = tmp_path / "eventlog" - _write_plain_log(log, [env_update({"spark.master": "local"})]) - result = detect_spark_runtime(str(log), allow_cpu_fast_path=False) - assert result.tool_execution is ToolExecution.QUALIFICATION - assert "walked full log" in result.reason.lower() - - class TestSourcePathPreserved: """Test that source_path echoes the original input string.""" diff --git 
a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py index d15af7155..b09c70378 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_resolver.py @@ -94,11 +94,3 @@ def test_non_oss_rolling_dir_raises(self, tmp_path: Path): (d / "eventlog").write_bytes(b"") with pytest.raises(UnsupportedInputError, match="eventlog_v2_\\*"): resolve_event_log_files(CspPath(str(d))) - - def test_generic_multi_app_dir_raises(self, tmp_path: Path): - d = tmp_path / "multi" - d.mkdir() - (d / "app-1.zstd").write_bytes(b"") - (d / "app-2.zstd").write_bytes(b"") - with pytest.raises(UnsupportedInputError): - resolve_event_log_files(CspPath(str(d))) diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py index ca8848a72..e43543ded 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_stream.py @@ -109,29 +109,9 @@ def test_zst_short_suffix_also_works(self, tmp_path): class TestUnsupportedCompression: """Test that unsupported compression formats raise UnsupportedCompressionError.""" - def test_lz4_raises(self, tmp_path): - p = tmp_path / "eventlog.lz4" - p.write_bytes(b"not-real-lz4") - with pytest.raises(UnsupportedCompressionError): - with open_event_log_stream(CspPath(str(p))) as _: - pass - - def test_snappy_raises(self, tmp_path): - p = tmp_path / "eventlog.snappy" - p.write_bytes(b"not-real-snappy") - with pytest.raises(UnsupportedCompressionError): - with open_event_log_stream(CspPath(str(p))) as _: - pass - - def test_lzf_raises(self, tmp_path): - p = tmp_path / "eventlog.lzf" - p.write_bytes(b"not-real-lzf") - with pytest.raises(UnsupportedCompressionError): - with open_event_log_stream(CspPath(str(p))) as _: - pass - - def test_unknown_suffix_raises(self, tmp_path): - p = tmp_path / "eventlog.weirdcodec" + @pytest.mark.parametrize("suffix", [".lz4", ".snappy", ".lzf", ".weirdcodec"]) + def test_unsupported_suffix_raises(self, tmp_path, suffix): + p = tmp_path / f"eventlog{suffix}" p.write_bytes(b"some-bytes") with pytest.raises(UnsupportedCompressionError): with open_event_log_stream(CspPath(str(p))) as _: diff --git a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py index f470ac8af..0df975e63 100644 --- a/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py +++ b/user_tools/tests/spark_rapids_tools_ut/tools/eventlog_detector/test_types.py @@ -63,10 +63,3 @@ def test_all_errors_subclass_base(self): EventLogReadError, ): assert issubclass(cls, EventLogDetectionError) - - def test_base_is_exception_not_value_error(self): - # We deliberately do NOT inherit ValueError: EventLogReadError wraps - # I/O failures (missing file, permissions), which are not bad-input - # errors. Keeping the base at Exception avoids that semantic mismatch. - assert issubclass(EventLogDetectionError, Exception) - assert not issubclass(EventLogDetectionError, ValueError)
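
For reviewers, a minimal pre-flight sketch of the call pattern these detector tests exercise (`detect_spark_runtime`, `ToolExecution`, `allow_cpu_fast_path`). This is illustrative only and not part of the patch series; the top-level import path and the `pick_tool` helper name are assumptions, not code introduced by these commits.

```python
# Hypothetical pre-flight helper -- a sketch only, not part of this patch series.
# Assumes detect_spark_runtime is importable from the eventlog_detector package
# and returns the DetectionResult fields exercised in test_detector.py.
from spark_rapids_tools.tools.eventlog_detector import detect_spark_runtime
from spark_rapids_tools.tools.eventlog_detector.types import (
    EventLogDetectionError,
    ToolExecution,
)


def pick_tool(event_log: str) -> str:
    """Return 'profiling' or 'qualification' for a single-app event log."""
    try:
        # allow_cpu_fast_path=False walks the full log before calling a run CPU-only,
        # matching the strict mode covered by test_fast_path_can_be_disabled.
        result = detect_spark_runtime(event_log, allow_cpu_fast_path=False)
    except EventLogDetectionError:
        # Unsupported input shape, unsupported codec, or unreadable log:
        # fall back to the qualification tool rather than guessing.
        return "qualification"
    if result.tool_execution is ToolExecution.PROFILING:
        # A RAPIDS (GPU) runtime was detected, so profiling is the right tool.
        return "profiling"
    return "qualification"
```
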