@@ -39,12 +39,16 @@ namespace sparrow_ipc
3939 * The serialization follows the Arrow IPC stream format where each record batch message
4040 * consists of a metadata section followed by a body section containing the actual data.
4141 *
42- * @param record_batch The sparrow record batch to be serialized
43- * @param stream The output stream where the serialized record batch will be written
44- * @param compression The compression type to use when serializing
42+ * @param record_batch The sparrow record batch to be serialized.
43+ * @param stream The output stream where the serialized record batch will be written.
44+ * @param compression Optional: The compression type to use when serializing.
45+ * @param cache Optional: A cache for compressed buffers to avoid recompression if compression is enabled.
46+ * If compression is given, cache should be set as well.
4547 */
4648 SPARROW_IPC_API void
47- serialize_record_batch (const sparrow::record_batch& record_batch, any_output_stream& stream, std::optional<CompressionType> compression);
49+ serialize_record_batch (const sparrow::record_batch& record_batch, any_output_stream& stream,
50+ std::optional<CompressionType> compression,
51+ std::optional<std::reference_wrapper<CompressionCache>> cache);
4852
4953 /**
5054 * @brief Calculates the total serialized size of a schema message.
@@ -73,28 +77,36 @@ namespace sparrow_ipc
7377 * - Padding to 8-byte alignment after metadata
7478 * - Body data with 8-byte alignment between buffers
7579 *
76- * @param record_batch The record batch to be measured
77- * @param compression The compression type to use when serializing
78- * @return The total size in bytes that the serialized record batch would occupy
80+ * @param record_batch The record batch to be measured.
81+ * @param compression Optional: The compression type to use when serializing.
82+ * @param cache Optional: A cache to store and retrieve compressed buffer sizes, avoiding recompression.
83+ * If compression is given, cache should be set as well.
84+ * @return The total size in bytes that the serialized record batch would occupy.
7985 */
8086 [[nodiscard]] SPARROW_IPC_API std::size_t
81- calculate_record_batch_message_size (const sparrow::record_batch& record_batch, std::optional<CompressionType> compression = std::nullopt );
87+ calculate_record_batch_message_size (const sparrow::record_batch& record_batch,
88+ std::optional<CompressionType> compression = std::nullopt ,
89+ std::optional<std::reference_wrapper<CompressionCache>> cache = std::nullopt );
8290
8391 /**
8492 * @brief Calculates the total serialized size for a collection of record batches.
8593 *
8694 * This function computes the complete size that would be produced by serializing
8795 * a schema message followed by all record batch messages in the collection.
8896 *
89- * @tparam R Range type containing sparrow::record_batch objects
90- * @param record_batches Collection of record batches to be measured
91- * @param compression The compression type to use when serializing
92- * @return The total size in bytes for the complete serialized output
93- * @throws std::invalid_argument if record batches have inconsistent schemas
97+ * @tparam R Range type containing sparrow::record_batch objects.
98+ * @param record_batches Collection of record batches to be measured.
99+ * @param compression Optional: The compression type to use when serializing.
100+ * @param cache Optional: A cache to store and retrieve compressed buffer sizes, avoiding recompression.
101+ * If compression is given, cache should be set as well.
102+ * @return The total size in bytes for the complete serialized output.
103+ * @throws std::invalid_argument if record batches have inconsistent schemas.
94104 */
95105 template <std::ranges::input_range R>
96106 requires std::same_as<std::ranges::range_value_t <R>, sparrow::record_batch>
97- [[nodiscard]] std::size_t calculate_total_serialized_size (const R& record_batches, std::optional<CompressionType> compression = std::nullopt )
107+ [[nodiscard]] std::size_t calculate_total_serialized_size (const R& record_batches,
108+ std::optional<CompressionType> compression = std::nullopt ,
109+ std::optional<std::reference_wrapper<CompressionCache>> cache = std::nullopt )
98110 {
99111 if (record_batches.empty ())
100112 {
@@ -113,7 +125,7 @@ namespace sparrow_ipc
113125 // Calculate record batch message sizes
114126 for (const auto & record_batch : record_batches)
115127 {
116- total_size += calculate_record_batch_message_size (record_batch, compression);
128+ total_size += calculate_record_batch_message_size (record_batch, compression, cache );
117129 }
118130
119131 return total_size;
@@ -131,11 +143,16 @@ namespace sparrow_ipc
131143 * 8-byte boundary, which is typically required for efficient memory access and Arrow
132144 * format compliance.
133145 *
134- * @param arrow_proxy The arrow proxy containing buffers and potential child proxies to serialize
135- * @param stream The output stream where the serialized body data will be written
136- * @param compression The compression type to use when serializing
146+ * @param arrow_proxy The arrow proxy containing buffers and potential child proxies to serialize.
147+ * @param stream The output stream where the serialized body data will be written.
148+ * @param compression Optional: The compression type to use when serializing.
149+ * @param cache Optional: A cache for compressed buffers to avoid recompression if compression is enabled.
150+ * If compression is given, cache must be provided as well.
151+ * @throws std::invalid_argument if compression is given but cache is not.
137152 */
138- SPARROW_IPC_API void fill_body (const sparrow::arrow_proxy& arrow_proxy, any_output_stream& stream, std::optional<CompressionType> compression = std::nullopt );
153+ SPARROW_IPC_API void fill_body (const sparrow::arrow_proxy& arrow_proxy, any_output_stream& stream,
154+ std::optional<CompressionType> compression = std::nullopt ,
155+ std::optional<std::reference_wrapper<CompressionCache>> cache = std::nullopt );
139156
140157 /**
141158 * @brief Generates a serialized body from a record batch.
@@ -144,11 +161,15 @@ namespace sparrow_ipc
144161 * extracts their Arrow proxy representations, and serializes them into the
145162 * given output stream, forming the body of the serialized data.
146163 *
147- * @param record_batch The record batch containing columns to be serialized
148- * @param stream The output stream where the serialized body will be written
149- * @param compression The compression type to use when serializing
164+ * @param record_batch The record batch containing columns to be serialized.
165+ * @param stream The output stream where the serialized body will be written.
166+ * @param compression Optional: The compression type to use when serializing.
167+ * @param cache Optional: A cache for compressed buffers to avoid recompression if compression is enabled.
168+ * If compression is given, cache should be set as well.
150169 */
151- SPARROW_IPC_API void generate_body (const sparrow::record_batch& record_batch, any_output_stream& stream, std::optional<CompressionType> compression = std::nullopt );
170+ SPARROW_IPC_API void generate_body (const sparrow::record_batch& record_batch, any_output_stream& stream,
171+ std::optional<CompressionType> compression = std::nullopt ,
172+ std::optional<std::reference_wrapper<CompressionCache>> cache = std::nullopt );
152173
153174 SPARROW_IPC_API std::vector<sparrow::data_type> get_column_dtypes (const sparrow::record_batch& rb);
154175}
0 commit comments