
Commit fb8c482

[SPARK-54305][PySpark][Streaming] Add admission control support for Python streaming data sources
1 parent: e09c999

9 files changed: +255 -101 lines changed

.gitignore

Lines changed: 5 additions & 0 deletions
@@ -129,3 +129,8 @@ sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/gen/
 tpcds-sf-1/
 tpcds-sf-1-text/
 tpcds-kit/
+
+# Cursor AI configuration files (local development only)
+.cursorrules*
+PR_SPARK_GUIDELINES.MD
+SPARK_WORKFLOW.md
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+.. See also ``pyspark.sql.sources.DataSource.streamReader``.
+
+The parameter `read_limit` in `latestOffset` provides the read limit for the current batch.
+The implementation can use this information to cap the number of rows returned in the batch.
+For example, if the `read_limit` is `{"maxRows": 1000}`, the data source should not return
+more than 1000 rows. The available read limit types are:
+
+* `maxRows`: the maximum number of rows to return in a batch.
+* `minRows`: the minimum number of rows to return in a batch.
+* `maxBytes`: the maximum size in bytes to return in a batch.
+* `minBytes`: the minimum size in bytes to return in a batch.
+* `allAvailable`: return all available data in a batch.
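
To make the read-limit contract above concrete, here is a brief editor-added sketch (not part of this commit) of how a reader might decide how far a batch may advance. The integer offsets, the assumption that `allAvailable` arrives as a key of the `read_limit` dict, and the `next_offset` helper itself are all illustrative, not APIs introduced by this change.

def next_offset(current: int, available_rows: int, read_limit: dict) -> int:
    # Decide how far a micro-batch may advance under the given read limit.
    if "allAvailable" in read_limit:
        return current + available_rows
    # Cap the batch so that no more than maxRows rows are planned; if no cap is
    # given, take everything the source currently has.
    capped = min(available_rows, read_limit.get("maxRows", available_rows))
    return current + capped

# Example: 5000 rows are available but the limit is {"maxRows": 1000}.
assert next_offset(100, available_rows=5000, read_limit={"maxRows": 1000}) == 1100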

python/pyspark/sql/datasource.py

Lines changed: 9 additions & 42 deletions
@@ -300,7 +300,7 @@ class Filter(ABC):
     +---------------------+--------------------------------------------+
     | SQL filter          | Representation                             |
-    +---------------------+--------------------------------------------+
+    +---------------------+---------------------------------------------+
     | `a.b.c = 1`         | `EqualTo(("a", "b", "c"), 1)`              |
     | `a = 1`             | `EqualTo(("a",), 1)`                       |
     | `a = 'hi'`          | `EqualTo(("a",), "hi")`                    |
@@ -685,56 +685,23 @@ def read(self, partition: InputPartition) -> Union[Iterator[Tuple], Iterator["Re
 
 class DataSourceStreamReader(ABC):
     """
-    A base class for streaming data source readers. Data source stream readers are responsible
-    for outputting data from a streaming data source.
+    An interface for streaming data source.
 
     .. versionadded: 4.0.0
     """
 
-    def initialOffset(self) -> dict:
-        """
-        Return the initial offset of the streaming data source.
-        A new streaming query starts reading data from the initial offset.
-        If Spark is restarting an existing query, it will restart from the check-pointed offset
-        rather than the initial one.
-
-        Returns
-        -------
-        dict
-            A dict or recursive dict whose key and value are primitive types, which includes
-            Integer, String and Boolean.
-
-        Examples
-        --------
-        >>> def initialOffset(self):
-        ...     return {"parititon-1": {"index": 3, "closed": True}, "partition-2": {"index": 5}}
-        """
-        raise PySparkNotImplementedError(
-            errorClass="NOT_IMPLEMENTED",
-            messageParameters={"feature": "initialOffset"},
-        )
+    def initialOffset(self) -> str:
+        pass
 
-    def latestOffset(self) -> dict:
+    @abstractmethod
+    def latestOffset(self) -> str:
         """
         Returns the most recent offset available.
-
-        Returns
-        -------
-        dict
-            A dict or recursive dict whose key and value are primitive types, which includes
-            Integer, String and Boolean.
-
-        Examples
-        --------
-        >>> def latestOffset(self):
-        ...     return {"parititon-1": {"index": 3, "closed": True}, "partition-2": {"index": 5}}
         """
-        raise PySparkNotImplementedError(
-            errorClass="NOT_IMPLEMENTED",
-            messageParameters={"feature": "latestOffset"},
-        )
+        pass
 
-    def partitions(self, start: dict, end: dict) -> Sequence[InputPartition]:
+    @abstractmethod
+    def partitions(self, start: str, end: str) -> List[bytes]:
        """
        Returns a list of InputPartition given the start and end offsets. Each InputPartition
        represents a data split that can be processed by one Spark task. This may be called with

python/pyspark/sql/datasource_internal.py

Lines changed: 15 additions & 18 deletions
@@ -19,7 +19,7 @@
 import json
 import copy
 from itertools import chain
-from typing import Iterator, List, Optional, Sequence, Tuple
+from typing import Iterator, List, Optional, Sequence, Tuple, TYPE_CHECKING
 
 from pyspark.sql.datasource import (
     DataSource,
@@ -77,25 +77,22 @@ class _SimpleStreamReaderWrapper(DataSourceStreamReader):
     replayed by reading data between start and end offset through readBetweenOffsets(start, end).
     """
 
-    def __init__(self, simple_reader: SimpleDataSourceStreamReader):
-        self.simple_reader = simple_reader
-        self.initial_offset: Optional[dict] = None
-        self.current_offset: Optional[dict] = None
-        self.cache: List[PrefetchedCacheEntry] = []
+    def __init__(self, reader: "DataSourceStreamReader"):
+        self.reader = reader
 
     def initialOffset(self) -> dict:
-        if self.initial_offset is None:
-            self.initial_offset = self.simple_reader.initialOffset()
-        return self.initial_offset
-
-    def latestOffset(self) -> dict:
-        # when query start for the first time, use initial offset as the start offset.
-        if self.current_offset is None:
-            self.current_offset = self.initialOffset()
-        (iter, end) = self.simple_reader.read(self.current_offset)
-        self.cache.append(PrefetchedCacheEntry(self.current_offset, end, iter))
-        self.current_offset = end
-        return end
+        return self.reader.initialOffset()
+
+    def latestOffset(self, start: Optional[dict], read_limit: Dict) -> dict:
+        # For backward compatibility, `latestOffset` with two arguments is not an abstract method.
+        # If the user-defined stream reader does not implement that, it will fall back to
+        # the `latestOffset` with no argument.
+        if hasattr(self.reader, "latestOffset") and not isinstance(
+            self.reader, SimpleDataSourceStreamReader
+        ):
+            return self.reader.latestOffset(start, read_limit)
+        else:
+            return self.reader.latestOffset()
 
     def commit(self, end: dict) -> None:
         if self.current_offset is None:
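
As an editor-added illustration of the fallback described in the comment above (not code from this commit), a user-defined reader can tolerate both calling conventions by giving the extra parameters defaults. The counter-based state, dict-shaped offsets, and the omission of partitions/read are illustrative assumptions.

from pyspark.sql.datasource import DataSourceStreamReader

class DualModeStreamReader(DataSourceStreamReader):
    """Editor sketch: works whether the engine passes (start, read_limit) or nothing."""

    def __init__(self, rows_available=10_000):
        self._rows_available = rows_available  # pretend source size, for illustration only

    def initialOffset(self) -> dict:
        return {"offset": 0}

    def latestOffset(self, start=None, read_limit=None) -> dict:
        # Newer engines call latestOffset(start, read_limit); older engines call
        # latestOffset() with no arguments, leaving both parameters as None.
        latest = self._rows_available
        if start is not None and read_limit and "maxRows" in read_limit:
            latest = min(latest, start["offset"] + read_limit["maxRows"])
        return {"offset": latest}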

python/pyspark/sql/streaming/python_streaming_source_runner.py

Lines changed: 9 additions & 1 deletion
@@ -27,6 +27,8 @@
     write_int,
     write_with_length,
     SpecialLengths,
+    read_bool,
+    read_with_length,
 )
 from pyspark.sql.datasource import DataSource, DataSourceStreamReader
 from pyspark.sql.datasource_internal import _SimpleStreamReaderWrapper, _streamReader
@@ -169,7 +171,13 @@ def main(infile: IO, outfile: IO) -> None:
             if func_id == INITIAL_OFFSET_FUNC_ID:
                 initial_offset_func(reader, outfile)
             elif func_id == LATEST_OFFSET_FUNC_ID:
-                latest_offset_func(reader, outfile)
+                has_start = read_bool(infile)
+                if has_start:
+                    start = read_with_length(infile)
+                else:
+                    start = None
+                read_limit = json.loads(read_with_length(infile))
+                write_with_length(reader.latestOffset(start, read_limit))
             elif func_id == PARTITIONS_FUNC_ID:
                 partitions_func(
                     reader, data_source, schema, max_arrow_batch_size, infile, outfile
python/pyspark/sql/tests/streaming/test_streaming_datasource_admission_control.py

Lines changed: 120 additions & 0 deletions
@@ -0,0 +1,120 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import os
+import time
+import unittest
+
+from pyspark.sql.datasource import DataSource, DataSourceStreamReader
+from pyspark.sql.functions import F
+from pyspark.sql.streaming import StreamTest
+from pyspark.sql.types import StructType, StructField, StringType, IntegerType
+
+
+class RateLimitStreamReader(DataSourceStreamReader):
+    def __init__(self, start, max_rows_per_batch):
+        self._start = start
+        self._max_rows_per_batch = max_rows_per_batch
+        self._next_offset = start
+
+    def initialOffset(self):
+        return str(self._start)
+
+    def latestOffset(self, start, read_limit):
+        max_rows = read_limit.get("maxRows", self._max_rows_per_batch)
+        self._next_offset += max_rows
+        return str(self._next_offset)
+
+    def partitions(self, start, end):
+        return [str(i).encode("utf-8") for i in range(int(start), int(end))]
+
+
+class RateLimitDataSource(DataSource):
+    def __init__(self, options):
+        self._max_rows_per_batch = int(options.get("maxRowsPerBatch", "100"))
+
+    def streamReader(self, schema):
+        return RateLimitStreamReader(0, self._max_rows_per_batch)
+
+
+class BackwardCompatibilityStreamReader(DataSourceStreamReader):
+    def __init__(self, start):
+        self._start = start
+        self._next_offset = start
+
+    def initialOffset(self):
+        return str(self._start)
+
+    def latestOffset(self):
+        self._next_offset += 1
+        return str(self._next_offset)
+
+    def partitions(self, start, end):
+        return [str(i).encode("utf-8") for i in range(int(start), int(end))]
+
+
+class BackwardCompatibilityDataSource(DataSource):
+    def streamReader(self, schema):
+        return BackwardCompatibilityStreamReader(0)
+
+
+class StreamingDataSourceAdmissionControlTests(StreamTest):
+    def test_backward_compatibility(self):
+        df = (
+            self.spark.readStream.format(
+                "org.apache.spark.sql.streaming.test.BackwardCompatibilityDataSource"
+            )
+            .option("includeTimestamp", "true")
+            .load()
+        )
+        self.assertTrue(df.isStreaming)
+
+        q = df.writeStream.queryName("test").format("memory").start()
+        try:
+            time.sleep(5)
+            self.assertTrue(self.spark.table("test").count() > 0)
+        finally:
+            q.stop()
+
+    def test_rate_limit(self):
+        df = (
+            self.spark.readStream.format("org.apache.spark.sql.streaming.test.RateLimitDataSource")
+            .option("maxRowsPerBatch", "5")
+            .load()
+        )
+        self.assertTrue(df.isStreaming)
+
+        q = df.writeStream.queryName("test_rate_limit").format("memory").start()
+        try:
+            time.sleep(5)
+            # The exact count can vary, but it should be a multiple of 5.
+            count = self.spark.table("test_rate_limit").count()
+            self.assertTrue(count > 0)
+            self.assertEqual(count % 5, 0)
+        finally:
+            q.stop()
+
+
+if __name__ == "__main__":
+    from pyspark.sql.tests.streaming.test_streaming_datasource_admission_control import *
+
+    try:
+        import xmlrunner
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)
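
For context, and purely as an editor-added usage sketch (not from this commit): a Python data source such as RateLimitDataSource above is normally registered with the session and consumed via its short name, which defaults to the class name. A complete source would typically also implement DataSource.schema() or be given a user-provided schema; the snippet assumes that has been done and that `spark` is an active SparkSession.

spark.dataSource.register(RateLimitDataSource)  # short name defaults to the class name

stream_df = (
    spark.readStream.format("RateLimitDataSource")
    .option("maxRowsPerBatch", "5")
    .load()
)
# The reader above caps each offset advance using read_limit.get("maxRows", ...),
# so every planned micro-batch respects the admission-control limit.
query = stream_df.writeStream.format("console").start()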

sql/core/src/main/scala/org/apache/spark/sql/execution/python/streaming/PythonStreamingSourceRunner.scala

Whitespace-only changes.

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/python/PythonMicroBatchStream.scala

Lines changed: 16 additions & 10 deletions
@@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources.v2.python
 import org.apache.spark.SparkEnv
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.connector.read.{InputPartition, PartitionReaderFactory}
-import org.apache.spark.sql.connector.read.streaming.{AcceptsLatestSeenOffset, MicroBatchStream, Offset}
+import org.apache.spark.sql.connector.read.streaming.{AcceptsLatestSeenOffset, MicroBatchStream, Offset, ReadLimit, SupportsAdmissionControl}
 import org.apache.spark.sql.execution.datasources.v2.python.PythonMicroBatchStream.nextStreamId
 import org.apache.spark.sql.execution.python.streaming.PythonStreamingSourceRunner
 import org.apache.spark.sql.types.StructType
@@ -32,11 +32,11 @@ class PythonMicroBatchStream(
     ds: PythonDataSourceV2,
     shortName: String,
     outputSchema: StructType,
-    options: CaseInsensitiveStringMap
-  )
-  extends MicroBatchStream
-  with Logging
-  with AcceptsLatestSeenOffset {
+    options: CaseInsensitiveStringMap)
+  extends MicroBatchStream
+  with Logging
+  with AcceptsLatestSeenOffset
+  with SupportsAdmissionControl {
   private def createDataSourceFunc =
     ds.source.createPythonFunction(
       ds.getOrCreateDataSourceInPython(shortName, options, Some(outputSchema)).dataSource)
@@ -55,7 +55,11 @@ class PythonMicroBatchStream(
 
   override def initialOffset(): Offset = PythonStreamingSourceOffset(runner.initialOffset())
 
-  override def latestOffset(): Offset = PythonStreamingSourceOffset(runner.latestOffset())
+  override def latestOffset(): Offset = PythonStreamingSourceOffset(runner.latestOffset(None))
+
+  override def latestOffset(start: Offset, limit: ReadLimit): Offset = {
+    PythonStreamingSourceOffset(runner.latestOffset(Some(start), Some(limit)))
+  }
 
   override def planInputPartitions(start: Offset, end: Offset): Array[InputPartition] = {
     val startOffsetJson = start.asInstanceOf[PythonStreamingSourceOffset].json
@@ -72,7 +76,10 @@ class PythonMicroBatchStream(
       nextBlockId = nextBlockId + 1
       val blockId = PythonStreamBlockId(streamId, nextBlockId)
       SparkEnv.get.blockManager.putIterator(
-        blockId, rows.get, StorageLevel.MEMORY_AND_DISK_SER, true)
+        blockId,
+        rows.get,
+        StorageLevel.MEMORY_AND_DISK_SER,
+        true)
       val partition = PythonStreamingInputPartition(0, partitions.head, Some(blockId))
       cachedInputPartition.foreach(_._3.dropCache())
       cachedInputPartition = Some((startOffsetJson, endOffsetJson, partition))
@@ -94,8 +101,7 @@ class PythonMicroBatchStream(
   }
 
   override def createReaderFactory(): PartitionReaderFactory = {
-    new PythonStreamingPartitionReaderFactory(
-      ds.source, readInfo.func, outputSchema, None, None)
+    new PythonStreamingPartitionReaderFactory(ds.source, readInfo.func, outputSchema, None, None)
   }
 
   override def commit(end: Offset): Unit = {