apache · jiteshsoni · Nov 16, 2025
diff --git a/python/docs/source/sql_streaming.rst b/python/docs/source/sql_streaming.rst
@@ -0,0 +1,12 @@
+.. See also ``pyspark.sql.datasource.DataSource.streamReader``.
+
+The parameter ``read_limit`` in ``latestOffset`` provides the read limit for the current batch.
+The implementation can use this information to cap the number of rows returned in the batch.
+For example, if ``read_limit`` is ``{"maxRows": 1000}``, the data source should not return
+more than 1000 rows. The available read limit types are:
+
+* ``maxRows``: the maximum number of rows to return in a batch.
+* ``minRows``: the minimum number of rows to return in a batch.
+* ``maxBytes``: the maximum size in bytes to return in a batch.
+* ``minBytes``: the minimum size in bytes to return in a batch.
+* ``allAvailable``: return all available data in a batch.
diff --git a/python/docs/source/tutorial/sql/index.rst b/python/docs/source/tutorial/sql/index.rst
@@ -27,5 +27,6 @@ Spark SQL
    arrow_python_udtf
    python_udtf
    python_data_source
+   /sql_streaming
    type_conversions
 
diff --git a/python/pyspark/sql/datasource.py b/python/pyspark/sql/datasource.py
@@ -300,7 +300,7 @@ class Filter(ABC):
 
     +---------------------+--------------------------------------------+
     | SQL filter          | Representation                             |
-    +---------------------+--------------------------------------------+
+    +---------------------+--------------------------------------------+
     | `a.b.c = 1`         | `EqualTo(("a", "b", "c"), 1)`              |
     | `a = 1`             | `EqualTo(("a",), 1)`                       |
     | `a = 'hi'`          | `EqualTo(("a",), "hi")`                    |
@@ -685,56 +685,23 @@ def read(self, partition: InputPartition) -> Union[Iterator[Tuple], Iterator["Re
 
 class DataSourceStreamReader(ABC):
     """
-    A base class for streaming data source readers. Data source stream readers are responsible
-    for outputting data from a streaming data source.
+    An interface for streaming data source.
 
     .. versionadded: 4.0.0
     """
 
-    def initialOffset(self) -> dict:
-        """
-        Return the initial offset of the streaming data source.
-        A new streaming query starts reading data from the initial offset.
-        If Spark is restarting an existing query, it will restart from the check-pointed offset
-        rather than the initial one.
-
-        Returns
-        -------
-        dict
-            A dict or recursive dict whose key and value are primitive types, which includes
-            Integer, String and Boolean.
-
-        Examples
-        --------
-        >>> def initialOffset(self):
-        ...     return {"parititon-1": {"index": 3, "closed": True}, "partition-2": {"index": 5}}
-        """
-        raise PySparkNotImplementedError(
-            errorClass="NOT_IMPLEMENTED",
-            messageParameters={"feature": "initialOffset"},
-        )
+    def initialOffset(self) -> str:
+        pass
 
-    def latestOffset(self) -> dict:
+    @abstractmethod
+    def latestOffset(self) -> str:
         """
         Returns the most recent offset available.
-
-        Returns
-        -------
-        dict
-            A dict or recursive dict whose key and value are primitive types, which includes
-            Integer, String and Boolean.
-
-        Examples
-        --------
-        >>> def latestOffset(self):
-        ...     return {"parititon-1": {"index": 3, "closed": True}, "partition-2": {"index": 5}}
         """
-        raise PySparkNotImplementedError(
-            errorClass="NOT_IMPLEMENTED",
-            messageParameters={"feature": "latestOffset"},
-        )
+        pass
 
-    def partitions(self, start: dict, end: dict) -> Sequence[InputPartition]:
+    @abstractmethod
+    def partitions(self, start: str, end: str) -> List[bytes]:
         """
         Returns a list of InputPartition given the start and end offsets. Each InputPartition
         represents a data split that can be processed by one Spark task. This may be called with

diff --git a/python/pyspark/sql/datasource_internal.py b/python/pyspark/sql/datasource_internal.py
@@ -19,7 +19,7 @@
 import json
 import copy
 from itertools import chain
-from typing import Iterator, List, Optional, Sequence, Tuple
+from typing import Iterator, List, Optional, Sequence, Tuple, Dict, TYPE_CHECKING
 
 from pyspark.sql.datasource import (
     DataSource,
@@ -77,25 +77,22 @@ class _SimpleStreamReaderWrapper(DataSourceStreamReader):
     replayed by reading data between start and end offset through readBetweenOffsets(start, end).
     """
 
-    def __init__(self, simple_reader: SimpleDataSourceStreamReader):
-        self.simple_reader = simple_reader
-        self.initial_offset: Optional[dict] = None
-        self.current_offset: Optional[dict] = None
-        self.cache: List[PrefetchedCacheEntry] = []
+    def __init__(self, reader: "DataSourceStreamReader"):
+        self.reader = reader
 
     def initialOffset(self) -> dict:
-        if self.initial_offset is None:
-            self.initial_offset = self.simple_reader.initialOffset()
-        return self.initial_offset
-
-    def latestOffset(self) -> dict:
-        # when query start for the first time, use initial offset as the start offset.
-        if self.current_offset is None:
-            self.current_offset = self.initialOffset()
-        (iter, end) = self.simple_reader.read(self.current_offset)
-        self.cache.append(PrefetchedCacheEntry(self.current_offset, end, iter))
-        self.current_offset = end
-        return end
+        return self.reader.initialOffset()
+
+    def latestOffset(self, start: Optional[dict], read_limit: Dict) -> dict:
+        # Readers written before admission control only define `latestOffset()`. Every
+        # DataSourceStreamReader has the attribute, so `hasattr` cannot tell the two forms
+        # apart; check the arity and pass `start`/`read_limit` only to readers accepting them.
+        import inspect
+
+        if len(inspect.signature(self.reader.latestOffset).parameters) >= 2:
+            return self.reader.latestOffset(start, read_limit)
+        else:
+            return self.reader.latestOffset()
 
     def commit(self, end: dict) -> None:
         if self.current_offset is None:

diff --git a/python/pyspark/sql/streaming/python_streaming_source_runner.py b/python/pyspark/sql/streaming/python_streaming_source_runner.py
@@ -21,31 +21,61 @@
 from typing import IO, Iterator, Tuple
 
 from pyspark.accumulators import _accumulatorRegistry
-from pyspark.errors import IllegalArgumentException, PySparkAssertionError
+from pyspark.errors import PySparkRuntimeError
+from pyspark.rdd import PythonRDD, set_python_broadcast
 from pyspark.serializers import (
+    ArrowStreamPandasSerializer,
+    BatchedSerializer,
+    CPickleSerializer,
+    CloudPickleSerializer,
+    FramedSerializer,
+    UTF8Deserializer,
+    UTF8Serializer,
+)
+from pyspark.sql.types import StructType
+from pyspark.util import (
+    _exception_from_last_cause,
+    _get_daemon_build_info,
+    _print_missing_requirements,
+    _print_local_python_versions,
+    _print_current_working_directory,
+    _print_filepath_on_worker,
+    _print_python_path,
+    _print_java_version,
+    _print_py4j_version,
+    _print_python_version,
+    _print_system_path,
+    _print_spark_home,
+    _print_user_envs,
+    _check_python_version,
+    _check_py4j_version,
+    _check_java_version,
+    _check_spark_home,
+    _check_system_path,
+    _check_user_envs,
+    _check_current_working_directory,
+    _check_filepath_on_worker,
+    _check_python_path,
+    _check_python_versions,
+    _check_daemon_build_info,
+)
+from pyspark.worker import (
+    accum_from_bytes,
+    bytearray_to_bytes,
+    get_accumulator_manager,
+    pickleSer,
+    read_command,
     read_int,
-    write_int,
-    write_with_length,
+    read_long,
+    read_with_length,
     SpecialLengths,
+    write_with_length,
+    read_bool,
 )
 from pyspark.sql.datasource import DataSource, DataSourceStreamReader
 from pyspark.sql.datasource_internal import _SimpleStreamReaderWrapper, _streamReader
 from pyspark.sql.pandas.serializers import ArrowStreamSerializer
-from pyspark.sql.types import (
-    _parse_datatype_json_string,
-    StructType,
-)
 from pyspark.sql.worker.plan_data_source_read import records_to_arrow_batches
-from pyspark.util import handle_worker_exception, local_connect_and_auth
-from pyspark.worker_util import (
-    check_python_version,
-    read_command,
-    pickleSer,
-    send_accumulator_updates,
-    setup_memory_limits,
-    setup_spark_files,
-    utf8_deserializer,
-)
 
 INITIAL_OFFSET_FUNC_ID = 884
 LATEST_OFFSET_FUNC_ID = 885
@@ -169,7 +199,13 @@ def main(infile: IO, outfile: IO) -> None:
                 if func_id == INITIAL_OFFSET_FUNC_ID:
                     initial_offset_func(reader, outfile)
                 elif func_id == LATEST_OFFSET_FUNC_ID:
-                    latest_offset_func(reader, outfile)
+                    has_start = read_bool(infile)
+                    if has_start:
+                        start = read_with_length(infile)
+                    else:
+                        start = None
+                    read_limit = json.loads(read_with_length(infile))
+                    write_with_length(reader.latestOffset(start, read_limit))
                 elif func_id == PARTITIONS_FUNC_ID:
                     partitions_func(
                         reader, data_source, schema, max_arrow_batch_size, infile, outfile

diff --git a/python/pyspark/sql/tests/streaming/test_streaming_datasource_admission_control.py b/python/pyspark/sql/tests/streaming/test_streaming_datasource_admission_control.py
@@ -0,0 +1,120 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import os
+import time
+import unittest
+
+from pyspark.sql.datasource import DataSource, DataSourceStreamReader
+from pyspark.sql.functions import F
+from pyspark.sql.streaming import StreamTest
+from pyspark.sql.types import StructType, StructField, StringType, IntegerType
+
+
+class RateLimitStreamReader(DataSourceStreamReader):
+    """Test reader: each ``latestOffset`` call advances the offset by the row limit."""
+
+    def __init__(self, start, max_rows_per_batch):
+        self._start = self._next_offset = start
+        self._max_rows_per_batch = max_rows_per_batch
+
+    def initialOffset(self):
+        return str(self._start)
+
+    def latestOffset(self, start, read_limit):
+        self._next_offset += read_limit.get("maxRows", self._max_rows_per_batch)
+        return str(self._next_offset)
+
+    def partitions(self, start, end):
+        return [b"%d" % i for i in range(int(start), int(end))]
+
+
+class RateLimitDataSource(DataSource):
+    """Test source; ``maxRowsPerBatch`` (default 100) seeds its reader's batch size."""
+
+    def streamReader(self, schema):
+        max_rows_per_batch = int(self.options.get("maxRowsPerBatch", "100"))
+        return RateLimitStreamReader(0, max_rows_per_batch)
+
+
+class BackwardCompatibilityStreamReader(DataSourceStreamReader):
+    """Test reader that only defines the zero-argument form of ``latestOffset``."""
+    def __init__(self, start):
+        self._start = self._next_offset = start
+
+    def initialOffset(self):
+        return str(self._start)
+
+    def latestOffset(self):
+        self._next_offset = self._next_offset + 1
+        return str(self._next_offset)
+
+    def partitions(self, start, end):
+        return [b"%d" % i for i in range(int(start), int(end))]
+
+
+class BackwardCompatibilityDataSource(DataSource):
+    def streamReader(self, schema):
+        return BackwardCompatibilityStreamReader(0)  # reader offsets start at 0
+
+
+class StreamingDataSourceAdmissionControlTests(StreamTest):
+    def test_backward_compatibility(self):
+        """A reader that only defines zero-argument ``latestOffset`` still streams rows."""
+        # Python data sources are resolved by their registered short name (the class
+        # name by default), not by a JVM-style fully qualified class name, so register
+        # the source before asking the reader for it.
+        self.spark.dataSource.register(BackwardCompatibilityDataSource)
+        df = self.spark.readStream.format("BackwardCompatibilityDataSource").load()
+        self.assertTrue(df.isStreaming)
+
+        q = df.writeStream.queryName("test").format("memory").start()
+        try:
+            # Let a few micro-batches run before inspecting the memory sink.
+            time.sleep(5)
+            self.assertGreater(self.spark.table("test").count(), 0)
+        finally:
+            q.stop()
+
+    def test_rate_limit(self):
+        self.spark.dataSource.register(RateLimitDataSource)
+        df = (
+            self.spark.readStream.format("RateLimitDataSource")
+            .option("maxRowsPerBatch", "5")
+            .load()
+        )
+        self.assertTrue(df.isStreaming)
+
+        q = df.writeStream.queryName("test_rate_limit").format("memory").start()
+        try:
+            time.sleep(5)
+            count = self.spark.table("test_rate_limit").count()
+            self.assertGreater(count, 0)
+            self.assertEqual(count % 5, 0)  # exact count can vary; expected multiple of 5
+        finally:
+            q.stop()
+
+
+if __name__ == "__main__":
+    from pyspark.sql.tests.streaming.test_streaming_datasource_admission_control import *
+
+    try:
+        import xmlrunner  # optional; only used to write XML test reports
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:
+        testRunner = None  # unittest.main then uses its default runner
+    unittest.main(testRunner=testRunner, verbosity=2)