From 16e07a99b5b62108a5def09a3e6fa3a3268db817 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 18 Mar 2026 11:56:10 +0100 Subject: [PATCH 01/48] Combine create_normal_table and save_normal_table_metadata --- .../modelardb_storage/src/data_folder/mod.rs | 79 ++++++++++--------- 1 file changed, 43 insertions(+), 36 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index 2d4ec8cd..50ec0fa4 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -563,7 +563,7 @@ impl DataFolder { table_name: &str, schema: &Schema, ) -> Result { - self.create_table( + self.create_delta_lake_table( table_name, schema, &[], @@ -579,21 +579,40 @@ impl DataFolder { } /// Create a Delta Lake table for a normal table with `table_name` and `schema` if it does not - /// already exist. If the normal table could not be created, e.g., because it already exists, + /// already exist and save the table metadata to the `normal_table_metadata` table. If the + /// normal table could not be created, e.g., because it already exists, /// [`ModelarDbStorageError`] is returned. pub async fn create_normal_table( &self, table_name: &str, schema: &Schema, ) -> Result { - self.create_table( - table_name, - schema, - &[], - self.location_of_table(table_name), - SaveMode::ErrorIfExists, + let delta_table = self + .create_delta_lake_table( + table_name, + schema, + &[], + self.location_of_table(table_name), + SaveMode::ErrorIfExists, + ) + .await?; + + self.save_normal_table_metadata(table_name).await?; + + Ok(delta_table) + } + + /// Save the created normal table to the Delta Lake. This consists of adding a row to the + /// `normal_table_metadata` table with the `name` of the table. If the normal table metadata was + /// saved, return [`Ok`], otherwise return [`ModelarDbStorageError`]. + async fn save_normal_table_metadata(&self, name: &str) -> Result<()> { + self.write_columns_to_metadata_table( + "normal_table_metadata", + vec![Arc::new(StringArray::from(vec![name]))], ) - .await + .await?; + + Ok(()) } /// Create a Delta Lake table for a time series table with `time_series_table_metadata` if it @@ -603,7 +622,7 @@ impl DataFolder { &self, time_series_table_metadata: &TimeSeriesTableMetadata, ) -> Result { - self.create_table( + self.create_delta_lake_table( &time_series_table_metadata.name, &time_series_table_metadata.compressed_schema, &[FIELD_COLUMN.to_owned()], @@ -621,7 +640,7 @@ impl DataFolder { /// Create a Delta Lake table with `table_name`, `schema`, and `partition_columns` if it does /// not already exist. Returns [`DeltaTable`] if the table could be created and /// [`ModelarDbStorageError`] if it could not. - async fn create_table( + async fn create_delta_lake_table( &self, table_name: &str, schema: &Schema, @@ -744,19 +763,6 @@ impl DataFolder { Ok(()) } - /// Save the created normal table to the Delta Lake. This consists of adding a row to the - /// `normal_table_metadata` table with the `name` of the table. If the normal table metadata was - /// saved, return [`Ok`], otherwise return [`ModelarDbStorageError`]. - pub async fn save_normal_table_metadata(&self, name: &str) -> Result<()> { - self.write_columns_to_metadata_table( - "normal_table_metadata", - vec![Arc::new(StringArray::from(vec![name]))], - ) - .await?; - - Ok(()) - } - /// Save the created time series table to the Delta Lake. This includes adding a row to the /// `time_series_table_metadata` table and adding a row to the `time_series_table_field_columns` /// table for each field column. @@ -1320,7 +1326,7 @@ mod tests { #[tokio::test] async fn test_normal_table_is_normal_table() { - let (_temp_dir, data_folder) = create_data_folder_and_save_normal_tables().await; + let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; assert!(data_folder.is_normal_table("normal_table_1").await.unwrap()); } @@ -1348,7 +1354,7 @@ mod tests { #[tokio::test] async fn test_normal_table_is_not_time_series_table() { - let (_temp_dir, data_folder) = create_data_folder_and_save_normal_tables().await; + let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; assert!( !data_folder .is_time_series_table("normal_table_1") @@ -1359,7 +1365,7 @@ mod tests { #[tokio::test] async fn test_table_names() { - let (_temp_dir, data_folder) = create_data_folder_and_save_normal_tables().await; + let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; let time_series_table_metadata = test::time_series_table_metadata(); data_folder @@ -1380,7 +1386,7 @@ mod tests { #[tokio::test] async fn test_normal_table_names() { - let (_temp_dir, data_folder) = create_data_folder_and_save_normal_tables().await; + let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; let normal_table_names = data_folder.normal_table_names().await.unwrap(); assert_eq!(normal_table_names, vec!["normal_table_2", "normal_table_1"]); @@ -1395,10 +1401,10 @@ mod tests { } #[tokio::test] - async fn test_save_normal_table_metadata() { - let (_temp_dir, data_folder) = create_data_folder_and_save_normal_tables().await; + async fn test_create_normal_table() { + let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; - // Retrieve the normal table from the Delta Lake. + // Retrieve the normal table metadata from the Delta Lake. let sql = "SELECT table_name FROM metadata.normal_table_metadata ORDER BY table_name"; let batch = sql_and_concat(&data_folder.session_context, sql) .await @@ -1461,7 +1467,7 @@ mod tests { #[tokio::test] async fn test_drop_normal_table_metadata() { - let (_temp_dir, data_folder) = create_data_folder_and_save_normal_tables().await; + let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; data_folder .drop_table_metadata("normal_table_2") @@ -1505,7 +1511,7 @@ mod tests { #[tokio::test] async fn test_drop_table_metadata_for_missing_table() { - let (_temp_dir, data_folder) = create_data_folder_and_save_normal_tables().await; + let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; assert!( data_folder @@ -1515,17 +1521,18 @@ mod tests { ); } - async fn create_data_folder_and_save_normal_tables() -> (TempDir, DataFolder) { + async fn create_data_folder_and_create_normal_tables() -> (TempDir, DataFolder) { let temp_dir = tempfile::tempdir().unwrap(); let data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let normal_table_schema = test::normal_table_schema(); data_folder - .save_normal_table_metadata("normal_table_1") + .create_normal_table("normal_table_1", &normal_table_schema) .await .unwrap(); data_folder - .save_normal_table_metadata("normal_table_2") + .create_normal_table("normal_table_2", &normal_table_schema) .await .unwrap(); From 6346568fdcc1e0e5039634a328158daceb942c9f Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 18 Mar 2026 11:56:43 +0100 Subject: [PATCH 02/48] Remove previous calls to save_normal_table_metadata --- crates/modelardb_embedded/src/operations/data_folder.rs | 3 --- crates/modelardb_server/src/cluster.rs | 9 --------- crates/modelardb_server/src/context.rs | 8 +------- crates/modelardb_server/src/storage/data_transfer.rs | 5 ----- 4 files changed, 1 insertion(+), 24 deletions(-) diff --git a/crates/modelardb_embedded/src/operations/data_folder.rs b/crates/modelardb_embedded/src/operations/data_folder.rs index cf056765..ad62e61c 100644 --- a/crates/modelardb_embedded/src/operations/data_folder.rs +++ b/crates/modelardb_embedded/src/operations/data_folder.rs @@ -125,9 +125,6 @@ impl Operations for DataFolder { match table_type { TableType::NormalTable(schema) => { let delta_table = self.create_normal_table(table_name, &schema).await?; - - self.save_normal_table_metadata(table_name).await?; - let data_sink = Arc::new(DataFolderDataSink::new()); modelardb_storage::register_normal_table( diff --git a/crates/modelardb_server/src/cluster.rs b/crates/modelardb_server/src/cluster.rs index 8589437d..20a52083 100644 --- a/crates/modelardb_server/src/cluster.rs +++ b/crates/modelardb_server/src/cluster.rs @@ -147,10 +147,6 @@ impl Cluster { .create_normal_table(table_name, schema) .await?; - self.remote_data_folder - .save_normal_table_metadata(table_name) - .await?; - // Create the normal table in each peer node. let protobuf_bytes = modelardb_types::flight::encode_and_serialize_normal_table_metadata( table_name, schema, @@ -663,11 +659,6 @@ mod test { .create_normal_table(table_name, &schema) .await .unwrap(); - - data_folder - .save_normal_table_metadata(table_name) - .await - .unwrap(); } /// Create a time series table named `table_name` with a field column named `column_name` in diff --git a/crates/modelardb_server/src/context.rs b/crates/modelardb_server/src/context.rs index f21e7439..2ce1300a 100644 --- a/crates/modelardb_server/src/context.rs +++ b/crates/modelardb_server/src/context.rs @@ -78,7 +78,7 @@ impl Context { table_name: &str, schema: &Schema, ) -> Result<()> { - // Create an empty Delta Lake table. + // Create an empty Delta Lake table and save the normal table metadata to the Delta Lake. self.data_folders .local_data_folder .create_normal_table(table_name, schema) @@ -87,12 +87,6 @@ impl Context { // Register the normal table with Apache DataFusion. self.register_normal_table(table_name).await?; - // Persist the new normal table to the Delta Lake. - self.data_folders - .local_data_folder - .save_normal_table_metadata(table_name) - .await?; - info!("Created normal table '{}'.", table_name); Ok(()) diff --git a/crates/modelardb_server/src/storage/data_transfer.rs b/crates/modelardb_server/src/storage/data_transfer.rs index b61b1557..e380ff68 100644 --- a/crates/modelardb_server/src/storage/data_transfer.rs +++ b/crates/modelardb_server/src/storage/data_transfer.rs @@ -474,11 +474,6 @@ mod tests { .await .unwrap(); - local_data_folder - .save_normal_table_metadata(NORMAL_TABLE_NAME) - .await - .unwrap(); - // Create a time series table. let time_series_table_metadata = table::time_series_table_metadata(); local_data_folder From 25ac67bba0d6c4cafbe7c82ed61b961a9af5458f Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 18 Mar 2026 12:15:14 +0100 Subject: [PATCH 03/48] Combine create_time_series_table and save_time_series_table_metadata --- .../modelardb_storage/src/data_folder/mod.rs | 185 +++++++++--------- 1 file changed, 96 insertions(+), 89 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index 50ec0fa4..caf31919 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -616,20 +616,27 @@ impl DataFolder { } /// Create a Delta Lake table for a time series table with `time_series_table_metadata` if it - /// does not already exist. Returns [`DeltaTable`] if the table could be created and - /// [`ModelarDbStorageError`] if it could not. + /// does not already exist and save the table metadata to the `time_series_table_metadata` + /// table. Returns [`DeltaTable`] if the table could be created and [`ModelarDbStorageError`] + /// if it could not. pub async fn create_time_series_table( &self, time_series_table_metadata: &TimeSeriesTableMetadata, ) -> Result { - self.create_delta_lake_table( - &time_series_table_metadata.name, - &time_series_table_metadata.compressed_schema, - &[FIELD_COLUMN.to_owned()], - self.location_of_table(&time_series_table_metadata.name), - SaveMode::ErrorIfExists, - ) - .await + let delta_table = self + .create_delta_lake_table( + &time_series_table_metadata.name, + &time_series_table_metadata.compressed_schema, + &[FIELD_COLUMN.to_owned()], + self.location_of_table(&time_series_table_metadata.name), + SaveMode::ErrorIfExists, + ) + .await?; + + self.save_time_series_table_metadata(time_series_table_metadata) + .await?; + + Ok(delta_table) } /// Return the location of the table with `table_name`. @@ -637,6 +644,85 @@ impl DataFolder { format!("{}/{TABLE_FOLDER}/{table_name}", self.location) } + /// Save the created time series table to the Delta Lake. This includes adding a row to the + /// `time_series_table_metadata` table and adding a row to the `time_series_table_field_columns` + /// table for each field column. + async fn save_time_series_table_metadata( + &self, + time_series_table_metadata: &TimeSeriesTableMetadata, + ) -> Result<()> { + // Convert the query schema to bytes, so it can be saved in the Delta Lake. + let query_schema_bytes = + try_convert_schema_to_bytes(&time_series_table_metadata.query_schema)?; + + // Add a new row in the time_series_table_metadata table to persist the time series table. + self.write_columns_to_metadata_table( + "time_series_table_metadata", + vec![ + Arc::new(StringArray::from(vec![ + time_series_table_metadata.name.clone(), + ])), + Arc::new(BinaryArray::from_vec(vec![&query_schema_bytes])), + ], + ) + .await?; + + // Add a row for each field column to the time_series_table_field_columns table. + for (query_schema_index, field) in time_series_table_metadata + .query_schema + .fields() + .iter() + .enumerate() + { + if field.data_type() == &ArrowValue::DATA_TYPE { + // Convert the generated column expression to bytes, if it exists. + let maybe_generated_column_expr = match time_series_table_metadata + .generated_columns + .get(query_schema_index) + { + Some(Some(generated_column)) => { + Some(generated_column.expr.to_bytes()?.to_vec()) + } + _ => None, + }; + + // error_bounds matches schema and not query_schema to simplify looking up the error + // bound during ingestion as it occurs far more often than creation of time series tables. + let (error_bound_value, error_bound_is_relative) = if let Ok(schema_index) = + time_series_table_metadata.schema.index_of(field.name()) + { + match time_series_table_metadata.error_bounds[schema_index] { + ErrorBound::Absolute(value) => (value, false), + ErrorBound::Relative(value) => (value, true), + ErrorBound::Lossless => (0.0, false), + } + } else { + (0.0, false) + }; + + // query_schema_index is simply cast as a time series table contains at most 32767 columns. + self.write_columns_to_metadata_table( + "time_series_table_field_columns", + vec![ + Arc::new(StringArray::from(vec![ + time_series_table_metadata.name.clone(), + ])), + Arc::new(StringArray::from(vec![field.name().clone()])), + Arc::new(Int16Array::from(vec![query_schema_index as i16])), + Arc::new(Float32Array::from(vec![error_bound_value])), + Arc::new(BooleanArray::from(vec![error_bound_is_relative])), + Arc::new(BinaryArray::from_opt_vec(vec![ + maybe_generated_column_expr.as_deref(), + ])), + ], + ) + .await?; + } + } + + Ok(()) + } + /// Create a Delta Lake table with `table_name`, `schema`, and `partition_columns` if it does /// not already exist. Returns [`DeltaTable`] if the table could be created and /// [`ModelarDbStorageError`] if it could not. @@ -763,85 +849,6 @@ impl DataFolder { Ok(()) } - /// Save the created time series table to the Delta Lake. This includes adding a row to the - /// `time_series_table_metadata` table and adding a row to the `time_series_table_field_columns` - /// table for each field column. - pub async fn save_time_series_table_metadata( - &self, - time_series_table_metadata: &TimeSeriesTableMetadata, - ) -> Result<()> { - // Convert the query schema to bytes, so it can be saved in the Delta Lake. - let query_schema_bytes = - try_convert_schema_to_bytes(&time_series_table_metadata.query_schema)?; - - // Add a new row in the time_series_table_metadata table to persist the time series table. - self.write_columns_to_metadata_table( - "time_series_table_metadata", - vec![ - Arc::new(StringArray::from(vec![ - time_series_table_metadata.name.clone(), - ])), - Arc::new(BinaryArray::from_vec(vec![&query_schema_bytes])), - ], - ) - .await?; - - // Add a row for each field column to the time_series_table_field_columns table. - for (query_schema_index, field) in time_series_table_metadata - .query_schema - .fields() - .iter() - .enumerate() - { - if field.data_type() == &ArrowValue::DATA_TYPE { - // Convert the generated column expression to bytes, if it exists. - let maybe_generated_column_expr = match time_series_table_metadata - .generated_columns - .get(query_schema_index) - { - Some(Some(generated_column)) => { - Some(generated_column.expr.to_bytes()?.to_vec()) - } - _ => None, - }; - - // error_bounds matches schema and not query_schema to simplify looking up the error - // bound during ingestion as it occurs far more often than creation of time series tables. - let (error_bound_value, error_bound_is_relative) = if let Ok(schema_index) = - time_series_table_metadata.schema.index_of(field.name()) - { - match time_series_table_metadata.error_bounds[schema_index] { - ErrorBound::Absolute(value) => (value, false), - ErrorBound::Relative(value) => (value, true), - ErrorBound::Lossless => (0.0, false), - } - } else { - (0.0, false) - }; - - // query_schema_index is simply cast as a time series table contains at most 32767 columns. - self.write_columns_to_metadata_table( - "time_series_table_field_columns", - vec![ - Arc::new(StringArray::from(vec![ - time_series_table_metadata.name.clone(), - ])), - Arc::new(StringArray::from(vec![field.name().clone()])), - Arc::new(Int16Array::from(vec![query_schema_index as i16])), - Arc::new(Float32Array::from(vec![error_bound_value])), - Arc::new(BooleanArray::from(vec![error_bound_is_relative])), - Arc::new(BinaryArray::from_opt_vec(vec![ - maybe_generated_column_expr.as_deref(), - ])), - ], - ) - .await?; - } - } - - Ok(()) - } - /// Write `columns` to a Delta Lake table with `table_name`. Returns an updated [`DeltaTable`] /// version if the file was written successfully, otherwise returns [`ModelarDbStorageError`]. pub async fn write_columns_to_metadata_table( From 27c3e4302b142e1cbf9e0714661eb54d4c32336d Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 18 Mar 2026 12:16:47 +0100 Subject: [PATCH 04/48] Fix tests after combining time series table methods --- .../modelardb_storage/src/data_folder/mod.rs | 38 +++++++++++-------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index caf31919..938aba1c 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -1339,7 +1339,7 @@ mod tests { #[tokio::test] async fn test_time_series_table_is_not_normal_table() { - let (_temp_dir, data_folder) = create_data_folder_and_save_time_series_table().await; + let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; assert!( !data_folder .is_normal_table(test::TIME_SERIES_TABLE_NAME) @@ -1350,7 +1350,7 @@ mod tests { #[tokio::test] async fn test_time_series_table_is_time_series_table() { - let (_temp_dir, data_folder) = create_data_folder_and_save_time_series_table().await; + let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; assert!( data_folder .is_time_series_table(test::TIME_SERIES_TABLE_NAME) @@ -1376,7 +1376,7 @@ mod tests { let time_series_table_metadata = test::time_series_table_metadata(); data_folder - .save_time_series_table_metadata(&time_series_table_metadata) + .create_time_series_table(&time_series_table_metadata) .await .unwrap(); @@ -1401,7 +1401,7 @@ mod tests { #[tokio::test] async fn test_time_series_table_names() { - let (_temp_dir, data_folder) = create_data_folder_and_save_time_series_table().await; + let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; let time_series_table_names = data_folder.time_series_table_names().await.unwrap(); assert_eq!(time_series_table_names, vec![test::TIME_SERIES_TABLE_NAME]); @@ -1411,6 +1411,8 @@ mod tests { async fn test_create_normal_table() { let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; + assert!(data_folder.delta_table("normal_table_1").await.is_ok()); + // Retrieve the normal table metadata from the Delta Lake. let sql = "SELECT table_name FROM metadata.normal_table_metadata ORDER BY table_name"; let batch = sql_and_concat(&data_folder.session_context, sql) @@ -1424,8 +1426,15 @@ mod tests { } #[tokio::test] - async fn test_save_time_series_table_metadata() { - let (_temp_dir, data_folder) = create_data_folder_and_save_time_series_table().await; + async fn test_create_time_series_table() { + let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; + + assert!( + data_folder + .delta_table(test::TIME_SERIES_TABLE_NAME) + .await + .is_ok() + ); // Check that a row has been added to the time_series_table_metadata table. let sql = "SELECT table_name, query_schema FROM metadata.time_series_table_metadata"; @@ -1492,7 +1501,7 @@ mod tests { #[tokio::test] async fn test_drop_time_series_table_metadata() { - let (_temp_dir, data_folder) = create_data_folder_and_save_time_series_table().await; + let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; data_folder .drop_table_metadata(test::TIME_SERIES_TABLE_NAME) @@ -1548,7 +1557,7 @@ mod tests { #[tokio::test] async fn test_time_series_table_metadata() { - let (_temp_dir, data_folder) = create_data_folder_and_save_time_series_table().await; + let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; let time_series_table_metadata = data_folder.time_series_table_metadata().await.unwrap(); @@ -1560,7 +1569,7 @@ mod tests { #[tokio::test] async fn test_time_series_table_metadata_for_existing_time_series_table() { - let (_temp_dir, data_folder) = create_data_folder_and_save_time_series_table().await; + let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; let time_series_table_metadata = data_folder .time_series_table_metadata_for_time_series_table(test::TIME_SERIES_TABLE_NAME) @@ -1575,7 +1584,7 @@ mod tests { #[tokio::test] async fn test_time_series_table_metadata_for_missing_time_series_table() { - let (_temp_dir, data_folder) = create_data_folder_and_save_time_series_table().await; + let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; let time_series_table_metadata = data_folder .time_series_table_metadata_for_time_series_table("missing_table") @@ -1586,7 +1595,7 @@ mod tests { #[tokio::test] async fn test_error_bound() { - let (_temp_dir, data_folder) = create_data_folder_and_save_time_series_table().await; + let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; let error_bounds = data_folder .error_bounds(test::TIME_SERIES_TABLE_NAME, 4) @@ -1643,7 +1652,7 @@ mod tests { .unwrap(); data_folder - .save_time_series_table_metadata(&time_series_table_metadata) + .create_time_series_table(&time_series_table_metadata) .await .unwrap(); @@ -1671,14 +1680,13 @@ mod tests { ); } - async fn create_data_folder_and_save_time_series_table() -> (TempDir, DataFolder) { + async fn create_data_folder_and_create_time_series_table() -> (TempDir, DataFolder) { let temp_dir = tempfile::tempdir().unwrap(); let data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); - // Save a time series table to the Delta Lake. let time_series_table_metadata = test::time_series_table_metadata(); data_folder - .save_time_series_table_metadata(&time_series_table_metadata) + .create_time_series_table(&time_series_table_metadata) .await .unwrap(); From 8e84d74849fe79b9e270ac6ebe730568a463cf38 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 18 Mar 2026 12:18:06 +0100 Subject: [PATCH 05/48] Remove previous calls to save_time_series_table_metadata --- .../src/operations/data_folder.rs | 3 --- crates/modelardb_server/src/cluster.rs | 9 --------- crates/modelardb_server/src/context.rs | 8 +------- .../src/storage/compressed_data_manager.rs | 15 ++------------- .../modelardb_server/src/storage/data_transfer.rs | 5 ----- .../src/storage/uncompressed_data_manager.rs | 2 +- 6 files changed, 4 insertions(+), 38 deletions(-) diff --git a/crates/modelardb_embedded/src/operations/data_folder.rs b/crates/modelardb_embedded/src/operations/data_folder.rs index ad62e61c..217cd599 100644 --- a/crates/modelardb_embedded/src/operations/data_folder.rs +++ b/crates/modelardb_embedded/src/operations/data_folder.rs @@ -146,9 +146,6 @@ impl Operations for DataFolder { .create_time_series_table(&time_series_table_metadata) .await?; - self.save_time_series_table_metadata(&time_series_table_metadata) - .await?; - let data_sink = Arc::new(DataFolderDataSink::new()); modelardb_storage::register_time_series_table( diff --git a/crates/modelardb_server/src/cluster.rs b/crates/modelardb_server/src/cluster.rs index 20a52083..b98b4889 100644 --- a/crates/modelardb_server/src/cluster.rs +++ b/crates/modelardb_server/src/cluster.rs @@ -174,10 +174,6 @@ impl Cluster { .create_time_series_table(time_series_table_metadata) .await?; - self.remote_data_folder - .save_time_series_table_metadata(time_series_table_metadata) - .await?; - // Create the time series table in each peer node. let protobuf_bytes = modelardb_types::flight::encode_and_serialize_time_series_table_metadata( @@ -685,11 +681,6 @@ mod test { .create_time_series_table(&time_series_table_metadata) .await .unwrap(); - - data_folder - .save_time_series_table_metadata(&time_series_table_metadata) - .await - .unwrap(); } /// Call [`Cluster::retrieve_and_create_tables`] if the [`ClusterMode`] of `context` is diff --git a/crates/modelardb_server/src/context.rs b/crates/modelardb_server/src/context.rs index 2ce1300a..8de567c4 100644 --- a/crates/modelardb_server/src/context.rs +++ b/crates/modelardb_server/src/context.rs @@ -113,7 +113,7 @@ impl Context { &self, time_series_table_metadata: &TimeSeriesTableMetadata, ) -> Result<()> { - // Create an empty Delta Lake table. + // Create an empty Delta Lake table and save the time series table metadata to the Delta Lake. self.data_folders .local_data_folder .create_time_series_table(time_series_table_metadata) @@ -123,12 +123,6 @@ impl Context { self.register_time_series_table(Arc::new(time_series_table_metadata.clone())) .await?; - // Persist the new time series table to the Delta Lake. - self.data_folders - .local_data_folder - .save_time_series_table_metadata(time_series_table_metadata) - .await?; - info!( "Created time series table '{}'.", time_series_table_metadata.name diff --git a/crates/modelardb_server/src/storage/compressed_data_manager.rs b/crates/modelardb_server/src/storage/compressed_data_manager.rs index e8667cab..88931d12 100644 --- a/crates/modelardb_server/src/storage/compressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/compressed_data_manager.rs @@ -388,7 +388,7 @@ mod tests { let local_data_folder = data_manager.local_data_folder.clone(); let mut delta_table = local_data_folder - .create_time_series_table(&table::time_series_table_metadata()) + .delta_table(TIME_SERIES_TABLE_NAME) .await .unwrap(); @@ -443,14 +443,8 @@ mod tests { #[tokio::test] async fn test_remaining_memory_incremented_when_saving_compressed_segments() { let (_temp_dir, data_manager) = create_compressed_data_manager().await; - let local_data_folder = data_manager.local_data_folder.clone(); let segments = compressed_segments_record_batch(); - local_data_folder - .create_time_series_table(&segments.time_series_table_metadata) - .await - .unwrap(); - data_manager .insert_compressed_segments(segments.clone()) .await @@ -497,14 +491,9 @@ mod tests { #[tokio::test] async fn test_decrease_compressed_remaining_memory_in_bytes() { let (_temp_dir, data_manager) = create_compressed_data_manager().await; - let local_data_folder = data_manager.local_data_folder.clone(); // Insert data that should be saved when the remaining memory is decreased. let segments = compressed_segments_record_batch(); - local_data_folder - .create_time_series_table(&segments.time_series_table_metadata) - .await - .unwrap(); data_manager .insert_compressed_segments(segments) .await @@ -559,7 +548,7 @@ mod tests { let time_series_table_metadata = table::time_series_table_metadata(); local_data_folder - .save_time_series_table_metadata(&time_series_table_metadata) + .create_time_series_table(&time_series_table_metadata) .await .unwrap(); diff --git a/crates/modelardb_server/src/storage/data_transfer.rs b/crates/modelardb_server/src/storage/data_transfer.rs index e380ff68..31d8471f 100644 --- a/crates/modelardb_server/src/storage/data_transfer.rs +++ b/crates/modelardb_server/src/storage/data_transfer.rs @@ -481,11 +481,6 @@ mod tests { .await .unwrap(); - local_data_folder - .save_time_series_table_metadata(&time_series_table_metadata) - .await - .unwrap(); - (temp_dir, local_data_folder) } diff --git a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs index a7274e81..c00800c7 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs @@ -1285,7 +1285,7 @@ mod tests { let time_series_table_metadata = table::time_series_table_metadata(); local_data_folder - .save_time_series_table_metadata(&time_series_table_metadata) + .create_time_series_table(&time_series_table_metadata) .await .unwrap(); From f41728aba9e286060161544de52719e25d867a8c Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 18 Mar 2026 12:33:56 +0100 Subject: [PATCH 06/48] Combine drop_table and drop_table_metadata --- .../modelardb_storage/src/data_folder/mod.rs | 97 ++++++++----------- 1 file changed, 41 insertions(+), 56 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index 938aba1c..a9920bf0 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -778,16 +778,52 @@ impl DataFolder { self.delete_table_files(&table_path).await } - /// Drop the Delta Lake table with `table_name` from the Delta Lake by deleting every file - /// related to the table. The table folder cannot be deleted directly since folders do not exist - /// in object stores and therefore cannot be operated upon. If the table was dropped - /// successfully, the paths to the deleted files are returned, otherwise a - /// [`ModelarDbStorageError`] is returned. + /// Drop the Delta Lake table with `table_name` from the Delta Lake by dropping the table + /// metadata and deleting every file related to the table. The table folder cannot be deleted + /// directly since folders do not exist in object stores and therefore cannot be operated upon. + /// If the table was dropped successfully, the paths to the deleted files are returned, + /// otherwise a [`ModelarDbStorageError`] is returned. pub async fn drop_table(&self, table_name: &str) -> Result> { + self.drop_table_metadata(table_name).await?; + let table_path = format!("{TABLE_FOLDER}/{table_name}"); self.delete_table_files(&table_path).await } + /// Depending on the type of the table with `table_name`, drop either the normal table metadata + /// or the time series table metadata from the Delta Lake. If the table does not exist or the + /// metadata could not be dropped, [`ModelarDbStorageError`] is returned. + async fn drop_table_metadata(&self, table_name: &str) -> Result<()> { + if self.is_normal_table(table_name).await? { + let delta_table = self.metadata_delta_table("normal_table_metadata").await?; + + delta_table + .delete() + .with_predicate(col("table_name").eq(lit(table_name))) + .await?; + } else if self.is_time_series_table(table_name).await? { + // Delete the table metadata from the time_series_table_metadata table. + self.metadata_delta_table("time_series_table_metadata") + .await? + .delete() + .with_predicate(col("table_name").eq(lit(table_name))) + .await?; + + // Delete the column metadata from the time_series_table_field_columns table. + self.metadata_delta_table("time_series_table_field_columns") + .await? + .delete() + .with_predicate(col("table_name").eq(lit(table_name))) + .await?; + } else { + return Err(ModelarDbStorageError::InvalidArgument(format!( + "Table with name '{table_name}' does not exist." + ))); + } + + Ok(()) + } + /// Delete all files in the folder at `table_path` using bulk operations if available. If the /// files were deleted successfully, the paths to the deleted files are returned. async fn delete_table_files(&self, table_path: &str) -> Result> { @@ -908,57 +944,6 @@ impl DataFolder { } } - /// Depending on the type of the table with `table_name`, drop either the normal table metadata - /// or the time series table metadata from the Delta Lake. If the table does not exist or the - /// metadata could not be dropped, [`ModelarDbStorageError`] is returned. - pub async fn drop_table_metadata(&self, table_name: &str) -> Result<()> { - if self.is_normal_table(table_name).await? { - self.drop_normal_table_metadata(table_name).await - } else if self.is_time_series_table(table_name).await? { - self.drop_time_series_table_metadata(table_name).await - } else { - Err(ModelarDbStorageError::InvalidArgument(format!( - "Table with name '{table_name}' does not exist." - ))) - } - } - - /// Drop the metadata for the normal table with `table_name` from the `normal_table_metadata` - /// table in the Delta Lake. If the metadata could not be dropped, [`ModelarDbStorageError`] is - /// returned. - async fn drop_normal_table_metadata(&self, table_name: &str) -> Result<()> { - let delta_table = self.metadata_delta_table("normal_table_metadata").await?; - - delta_table - .delete() - .with_predicate(col("table_name").eq(lit(table_name))) - .await?; - - Ok(()) - } - - /// Drop the metadata for the time series table with `table_name` from the Delta Lake. This - /// includes deleting a row from the `time_series_table_metadata` table and deleting a row from - /// the `time_series_table_field_columns` table for each field column. If the metadata could not - /// be dropped, [`ModelarDbStorageError`] is returned. - async fn drop_time_series_table_metadata(&self, table_name: &str) -> Result<()> { - // Delete the table metadata from the time_series_table_metadata table. - self.metadata_delta_table("time_series_table_metadata") - .await? - .delete() - .with_predicate(col("table_name").eq(lit(table_name))) - .await?; - - // Delete the column metadata from the time_series_table_field_columns table. - self.metadata_delta_table("time_series_table_field_columns") - .await? - .delete() - .with_predicate(col("table_name").eq(lit(table_name))) - .await?; - - Ok(()) - } - /// Return the [`TimeSeriesTableMetadata`] of each time series table currently in the metadata /// Delta Lake. If the [`TimeSeriesTableMetadata`] cannot be retrieved, /// [`ModelarDbStorageError`] is returned. From a38f3b43da0fbe9a4b5145a068f3d73cb65b909c Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 18 Mar 2026 12:34:57 +0100 Subject: [PATCH 07/48] Fix tests after combining drop table methods --- .../modelardb_storage/src/data_folder/mod.rs | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index a9920bf0..2f9d602b 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -1467,13 +1467,12 @@ mod tests { } #[tokio::test] - async fn test_drop_normal_table_metadata() { + async fn test_drop_normal_table() { let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; - data_folder - .drop_table_metadata("normal_table_2") - .await - .unwrap(); + data_folder.drop_table("normal_table_2").await.unwrap(); + + assert!(data_folder.delta_table("normal_table_2").await.is_err()); // Verify that normal_table_2 was deleted from the normal_table_metadata table. let sql = "SELECT table_name FROM metadata.normal_table_metadata"; @@ -1485,14 +1484,21 @@ mod tests { } #[tokio::test] - async fn test_drop_time_series_table_metadata() { + async fn test_drop_time_series_table() { let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; data_folder - .drop_table_metadata(test::TIME_SERIES_TABLE_NAME) + .drop_table(test::TIME_SERIES_TABLE_NAME) .await .unwrap(); + assert!( + data_folder + .delta_table(test::TIME_SERIES_TABLE_NAME) + .await + .is_err() + ); + // Verify that the time series table was deleted from the time_series_table_metadata table. let sql = "SELECT table_name FROM metadata.time_series_table_metadata"; let batch = sql_and_concat(&data_folder.session_context, sql) @@ -1511,12 +1517,12 @@ mod tests { } #[tokio::test] - async fn test_drop_table_metadata_for_missing_table() { + async fn test_drop_missing_table() { let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; assert!( data_folder - .drop_table_metadata("missing_table") + .drop_table("missing_table") .await .is_err() ); From fed3e77865dc2ae622bdb392bcb2efd70ca433ff Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 18 Mar 2026 12:35:31 +0100 Subject: [PATCH 08/48] Remove previous calls to drop_table_metadata --- crates/modelardb_embedded/src/operations/data_folder.rs | 3 --- crates/modelardb_server/src/cluster.rs | 4 ---- crates/modelardb_server/src/context.rs | 6 ------ 3 files changed, 13 deletions(-) diff --git a/crates/modelardb_embedded/src/operations/data_folder.rs b/crates/modelardb_embedded/src/operations/data_folder.rs index 217cd599..7f9a5c7b 100644 --- a/crates/modelardb_embedded/src/operations/data_folder.rs +++ b/crates/modelardb_embedded/src/operations/data_folder.rs @@ -511,9 +511,6 @@ impl Operations for DataFolder { // Drop the table from the Apache Arrow DataFusion session. self.session_context().deregister_table(table_name)?; - // Delete the table metadata from the Delta Lake. - self.drop_table_metadata(table_name).await?; - // Drop the table from the Delta Lake. self.drop_table(table_name).await?; diff --git a/crates/modelardb_server/src/cluster.rs b/crates/modelardb_server/src/cluster.rs index b98b4889..77b8f8a8 100644 --- a/crates/modelardb_server/src/cluster.rs +++ b/crates/modelardb_server/src/cluster.rs @@ -195,10 +195,6 @@ impl Cluster { pub(crate) async fn drop_cluster_tables(&self, table_names: &[String]) -> Result<()> { // Drop the tables from the remote data folder. for table_name in table_names { - self.remote_data_folder - .drop_table_metadata(table_name) - .await?; - self.remote_data_folder.drop_table(table_name).await?; } diff --git a/crates/modelardb_server/src/context.rs b/crates/modelardb_server/src/context.rs index 8de567c4..c015e925 100644 --- a/crates/modelardb_server/src/context.rs +++ b/crates/modelardb_server/src/context.rs @@ -254,12 +254,6 @@ impl Context { self.drop_table_from_storage_engine(table_name).await?; - // Drop the table metadata from the Delta Lake. - self.data_folders - .local_data_folder - .drop_table_metadata(table_name) - .await?; - // Drop the table from the Delta Lake. self.data_folders .local_data_folder From 265d2c6579593866e762aa78e5af050768c0344e Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 18 Mar 2026 12:46:50 +0100 Subject: [PATCH 09/48] Fix minor doc issue --- .../modelardb_storage/src/data_folder/mod.rs | 23 ++++++++----------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index 2f9d602b..fd90fe3e 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -578,10 +578,10 @@ impl DataFolder { format!("{}/{METADATA_FOLDER}/{table_name}", self.location) } - /// Create a Delta Lake table for a normal table with `table_name` and `schema` if it does not - /// already exist and save the table metadata to the `normal_table_metadata` table. If the - /// normal table could not be created, e.g., because it already exists, - /// [`ModelarDbStorageError`] is returned. + /// Create a Delta Lake table for a normal table with `table_name` and `schema` and save the + /// table metadata to the `normal_table_metadata` table. If the table already exists or + /// the metadata could not be saved, return [`ModelarDbStorageError`], otherwise return + /// the created [`DeltaTable`]. pub async fn create_normal_table( &self, table_name: &str, @@ -615,10 +615,10 @@ impl DataFolder { Ok(()) } - /// Create a Delta Lake table for a time series table with `time_series_table_metadata` if it - /// does not already exist and save the table metadata to the `time_series_table_metadata` - /// table. Returns [`DeltaTable`] if the table could be created and [`ModelarDbStorageError`] - /// if it could not. + /// Create a Delta Lake table for a time series table with `time_series_table_metadata` and + /// save the table metadata to the `time_series_table_metadata` table. If the table already + /// exists or the metadata could not be saved, return [`ModelarDbStorageError`], otherwise + /// return the created [`DeltaTable`]. pub async fn create_time_series_table( &self, time_series_table_metadata: &TimeSeriesTableMetadata, @@ -1520,12 +1520,7 @@ mod tests { async fn test_drop_missing_table() { let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; - assert!( - data_folder - .drop_table("missing_table") - .await - .is_err() - ); + assert!(data_folder.drop_table("missing_table").await.is_err()); } async fn create_data_folder_and_create_normal_tables() -> (TempDir, DataFolder) { From c1e5c80f5a1d8dd6ed251f6855ba99793dcbf55a Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 18 Mar 2026 12:57:59 +0100 Subject: [PATCH 10/48] Remove unused public method and make public methods private --- crates/modelardb_storage/src/data_folder/mod.rs | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index fd90fe3e..484ab36b 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -512,7 +512,7 @@ impl DataFolder { /// Return a [`DeltaTableWriter`] for writing to the time series table corresponding to /// `delta_table` in the Delta Lake, or a [`ModelarDbStorageError`] if a connection to the Delta /// Lake cannot be established or the table does not exist. - pub async fn time_series_table_writer( + async fn time_series_table_writer( &self, delta_table: DeltaTable, ) -> Result { @@ -547,7 +547,7 @@ impl DataFolder { /// Return a [`DeltaTableWriter`] for writing to the table corresponding to `delta_table` in the /// Delta Lake, or a [`ModelarDbStorageError`] if a connection to the Delta Lake cannot be /// established or the table does not exist. - pub async fn normal_or_metadata_table_writer( + async fn normal_or_metadata_table_writer( &self, delta_table: DeltaTable, ) -> Result { @@ -768,16 +768,6 @@ impl DataFolder { Ok(delta_table) } - /// Drop the metadata table with `table_name` from the Delta Lake by deleting every file related - /// to the table. The table folder cannot be deleted directly since folders do not exist in - /// object stores and therefore cannot be operated upon. If the table was dropped successfully, - /// the paths to the deleted files are returned, otherwise a [`ModelarDbStorageError`] is - /// returned. - pub async fn drop_metadata_table(&self, table_name: &str) -> Result> { - let table_path = format!("{METADATA_FOLDER}/{table_name}"); - self.delete_table_files(&table_path).await - } - /// Drop the Delta Lake table with `table_name` from the Delta Lake by dropping the table /// metadata and deleting every file related to the table. The table folder cannot be deleted /// directly since folders do not exist in object stores and therefore cannot be operated upon. From 6728a459cbabad29aedc5a2d20a54ec3c9ddeb72 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 18 Mar 2026 13:41:31 +0100 Subject: [PATCH 11/48] Remove public register_metadata_table function --- crates/modelardb_storage/src/lib.rs | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/crates/modelardb_storage/src/lib.rs b/crates/modelardb_storage/src/lib.rs index ad1cd3e7..6ff9f993 100644 --- a/crates/modelardb_storage/src/lib.rs +++ b/crates/modelardb_storage/src/lib.rs @@ -41,7 +41,6 @@ use datafusion::parquet::basic::{Compression, Encoding, ZstdLevel}; use datafusion::parquet::errors::ParquetError; use datafusion::parquet::file::properties::{EnabledStatistics, WriterProperties}; use datafusion::prelude::SessionContext; -use datafusion::sql::TableReference; use datafusion::sql::parser::Statement as DFStatement; use deltalake::DeltaTable; use deltalake::parquet::file::metadata::SortingColumn; @@ -52,7 +51,6 @@ use object_store::path::Path; use sqlparser::ast::Statement; use crate::error::Result; -use crate::query::metadata_table::MetadataTable; use crate::query::normal_table::NormalTable; use crate::query::time_series_table::TimeSeriesTable; @@ -88,21 +86,6 @@ pub fn create_session_context() -> SessionContext { session_context } -/// Register the metadata table stored in `delta_table` with `table_name` in `session_context`. If -/// the metadata table could not be registered with Apache DataFusion, return -/// [`ModelarDbStorageError`](error::ModelarDbStorageError). -pub fn register_metadata_table( - session_context: &SessionContext, - table_name: &str, - delta_table: DeltaTable, -) -> Result<()> { - let table_reference = TableReference::partial("metadata", table_name); - let metadata_table = Arc::new(MetadataTable::new(delta_table)); - session_context.register_table(table_reference, metadata_table)?; - - Ok(()) -} - /// Register the normal table stored in `delta_table` with `table_name` and `data_sink` in /// `session_context`. If the normal table could not be registered with Apache DataFusion, return /// [`ModelarDbStorageError`](error::ModelarDbStorageError). From 267eb51a275f8fc24b559dd625172d276bf99f33 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 18 Mar 2026 13:44:59 +0100 Subject: [PATCH 12/48] Combine create_metadata_table and register_metadata_table --- .../modelardb_storage/src/data_folder/mod.rs | 34 ++++++++++++------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index 484ab36b..3e6ddfab 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -555,22 +555,30 @@ impl DataFolder { DeltaTableWriter::try_new(delta_table, vec![], writer_properties) } - /// Create a Delta Lake table for a metadata table with `table_name` and `schema` if it does not - /// already exist. If the metadata table could not be created, [`ModelarDbStorageError`] is - /// returned. An error is not returned if the metadata table already exists. - pub async fn create_metadata_table( + /// Create a Delta Lake table for a metadata table with `table_name` and `schema` and register + /// it in the [`SessionContext`] in the `metadata` schema. If the table already exists or the + /// table could not be registered, return [`ModelarDbStorageError`]. + pub async fn create_and_register_metadata_table( &self, table_name: &str, schema: &Schema, - ) -> Result { - self.create_delta_lake_table( - table_name, - schema, - &[], - self.location_of_metadata_table(table_name), - SaveMode::Ignore, - ) - .await + ) -> Result<()> { + let delta_table = self + .create_delta_lake_table( + table_name, + schema, + &[], + self.location_of_metadata_table(table_name), + SaveMode::Ignore, + ) + .await?; + + let table_reference = TableReference::partial("metadata", table_name); + let metadata_table = Arc::new(MetadataTable::new(delta_table)); + self.session_context + .register_table(table_reference, metadata_table)?; + + Ok(()) } /// Return the location of the metadata table with `table_name`. From 8cec09e3d67b1f6b525fb9a712b7e689a90f0993 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 18 Mar 2026 13:46:03 +0100 Subject: [PATCH 13/48] Update use of create_metadata_table to use new method --- .../src/data_folder/cluster.rs | 34 ++++----- .../modelardb_storage/src/data_folder/mod.rs | 76 +++++++------------ 2 files changed, 41 insertions(+), 69 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/cluster.rs b/crates/modelardb_storage/src/data_folder/cluster.rs index e21dbd9b..0c76e9ef 100644 --- a/crates/modelardb_storage/src/data_folder/cluster.rs +++ b/crates/modelardb_storage/src/data_folder/cluster.rs @@ -28,7 +28,7 @@ use uuid::Uuid; use crate::data_folder::DataFolder; use crate::error::Result; -use crate::{register_metadata_table, sql_and_concat}; +use crate::sql_and_concat; /// Trait that extends [`DataFolder`] to provide management of the Delta Lake for the cluster. #[allow(async_fn_in_trait)] @@ -51,27 +51,21 @@ impl ClusterMetadata for DataFolder { /// [`ModelarDbStorageError`](crate::error::ModelarDbStorageError). async fn create_and_register_cluster_metadata_tables(&self) -> Result<()> { // Create and register the cluster_metadata table if it does not exist. - let delta_table = self - .create_metadata_table( - "cluster_metadata", - &Schema::new(vec![Field::new("key", DataType::Utf8, false)]), - ) - .await?; - - register_metadata_table(self.session_context(), "cluster_metadata", delta_table)?; + self.create_and_register_metadata_table( + "cluster_metadata", + &Schema::new(vec![Field::new("key", DataType::Utf8, false)]), + ) + .await?; // Create and register the nodes table if it does not exist. - let delta_table = self - .create_metadata_table( - "nodes", - &Schema::new(vec![ - Field::new("url", DataType::Utf8, false), - Field::new("mode", DataType::Utf8, false), - ]), - ) - .await?; - - register_metadata_table(self.session_context(), "nodes", delta_table)?; + self.create_and_register_metadata_table( + "nodes", + &Schema::new(vec![ + Field::new("url", DataType::Utf8, false), + Field::new("mode", DataType::Utf8, false), + ]), + ) + .await?; Ok(()) } diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index 3e6ddfab..628dd67d 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -30,7 +30,7 @@ use arrow::datatypes::{DataType, Field, Schema}; use chrono::TimeDelta; use dashmap::DashMap; use datafusion::catalog::TableProvider; -use datafusion::common::{DFSchema, ToDFSchema}; +use datafusion::common::{DFSchema, TableReference, ToDFSchema}; use datafusion::datasource::sink::DataSink; use datafusion::logical_expr::{Expr, lit}; use datafusion::parquet::file::properties::WriterProperties; @@ -62,10 +62,8 @@ use url::Url; use uuid::Uuid; use crate::error::{ModelarDbStorageError, Result}; -use crate::{ - METADATA_FOLDER, TABLE_FOLDER, apache_parquet_writer_properties, register_metadata_table, - sql_and_concat, -}; +use crate::query::metadata_table::MetadataTable; +use crate::{METADATA_FOLDER, TABLE_FOLDER, apache_parquet_writer_properties, sql_and_concat}; /// Types of tables supported by ModelarDB. enum TableType { @@ -285,54 +283,37 @@ impl DataFolder { /// [`ModelarDbStorageError`]. async fn create_and_register_metadata_tables(&self) -> Result<()> { // Create and register the normal_table_metadata table if it does not exist. - let delta_table = self - .create_metadata_table( - "normal_table_metadata", - &Schema::new(vec![Field::new("table_name", DataType::Utf8, false)]), - ) - .await?; - - register_metadata_table(&self.session_context, "normal_table_metadata", delta_table)?; + self.create_and_register_metadata_table( + "normal_table_metadata", + &Schema::new(vec![Field::new("table_name", DataType::Utf8, false)]), + ) + .await?; // Create and register the time_series_table_metadata table if it does not exist. - let delta_table = self - .create_metadata_table( - "time_series_table_metadata", - &Schema::new(vec![ - Field::new("table_name", DataType::Utf8, false), - Field::new("query_schema", DataType::Binary, false), - ]), - ) - .await?; - - register_metadata_table( - &self.session_context, + self.create_and_register_metadata_table( "time_series_table_metadata", - delta_table, - )?; + &Schema::new(vec![ + Field::new("table_name", DataType::Utf8, false), + Field::new("query_schema", DataType::Binary, false), + ]), + ) + .await?; // Create and register the time_series_table_field_columns table if it does not exist. Note // that column_index will only use a maximum of 10 bits. generated_column_expr is NULL if // the fields are stored as segments. - let delta_table = self - .create_metadata_table( - "time_series_table_field_columns", - &Schema::new(vec![ - Field::new("table_name", DataType::Utf8, false), - Field::new("column_name", DataType::Utf8, false), - Field::new("column_index", DataType::Int16, false), - Field::new("error_bound_value", DataType::Float32, false), - Field::new("error_bound_is_relative", DataType::Boolean, false), - Field::new("generated_column_expr", DataType::Binary, true), - ]), - ) - .await?; - - register_metadata_table( - &self.session_context, + self.create_and_register_metadata_table( "time_series_table_field_columns", - delta_table, - )?; + &Schema::new(vec![ + Field::new("table_name", DataType::Utf8, false), + Field::new("column_name", DataType::Utf8, false), + Field::new("column_index", DataType::Int16, false), + Field::new("error_bound_value", DataType::Float32, false), + Field::new("error_bound_is_relative", DataType::Boolean, false), + Field::new("generated_column_expr", DataType::Binary, true), + ]), + ) + .await?; Ok(()) } @@ -512,10 +493,7 @@ impl DataFolder { /// Return a [`DeltaTableWriter`] for writing to the time series table corresponding to /// `delta_table` in the Delta Lake, or a [`ModelarDbStorageError`] if a connection to the Delta /// Lake cannot be established or the table does not exist. - async fn time_series_table_writer( - &self, - delta_table: DeltaTable, - ) -> Result { + async fn time_series_table_writer(&self, delta_table: DeltaTable) -> Result { let partition_columns = vec![FIELD_COLUMN.to_owned()]; // Specify that the file must be sorted by the tag columns and then by start_time. From 54bae4b1220bb091ddd7b4f518eabdd7c17adc90 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 18 Mar 2026 14:18:34 +0100 Subject: [PATCH 14/48] Update mod.rs --- crates/modelardb_storage/src/data_folder/mod.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index 628dd67d..9ee2e534 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -534,8 +534,9 @@ impl DataFolder { } /// Create a Delta Lake table for a metadata table with `table_name` and `schema` and register - /// it in the [`SessionContext`] in the `metadata` schema. If the table already exists or the - /// table could not be registered, return [`ModelarDbStorageError`]. + /// it in the [`SessionContext`] in the `metadata` schema. If the table already exists, it is + /// reused. Return [`ModelarDbStorageError`] if the table cannot be created or cannot be + /// registered. pub async fn create_and_register_metadata_table( &self, table_name: &str, From b4d1f0eac3ea7a2e020c6a6b46ce6c07d72dda06 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 18 Mar 2026 17:18:10 +0100 Subject: [PATCH 15/48] Fix inconsistent naming for drop_table --- crates/modelardb_storage/src/data_folder/mod.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index 9ee2e534..ef3be87f 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -755,22 +755,22 @@ impl DataFolder { Ok(delta_table) } - /// Drop the Delta Lake table with `table_name` from the Delta Lake by dropping the table + /// Drop the Delta Lake table with `table_name` from the Delta Lake by deleting the table /// metadata and deleting every file related to the table. The table folder cannot be deleted /// directly since folders do not exist in object stores and therefore cannot be operated upon. /// If the table was dropped successfully, the paths to the deleted files are returned, /// otherwise a [`ModelarDbStorageError`] is returned. pub async fn drop_table(&self, table_name: &str) -> Result> { - self.drop_table_metadata(table_name).await?; + self.delete_table_metadata(table_name).await?; let table_path = format!("{TABLE_FOLDER}/{table_name}"); self.delete_table_files(&table_path).await } - /// Depending on the type of the table with `table_name`, drop either the normal table metadata + /// Depending on the type of the table with `table_name`, delete either the normal table metadata /// or the time series table metadata from the Delta Lake. If the table does not exist or the - /// metadata could not be dropped, [`ModelarDbStorageError`] is returned. - async fn drop_table_metadata(&self, table_name: &str) -> Result<()> { + /// metadata could not be deleted, [`ModelarDbStorageError`] is returned. + async fn delete_table_metadata(&self, table_name: &str) -> Result<()> { if self.is_normal_table(table_name).await? { let delta_table = self.metadata_delta_table("normal_table_metadata").await?; From 67e2aba863e4bd7e91d73b8c1227f65600105c76 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Thu, 19 Mar 2026 15:03:25 +0100 Subject: [PATCH 16/48] Add optional data sink to NormalTable struct --- crates/modelardb_storage/src/lib.rs | 2 +- .../src/query/normal_table.rs | 39 ++++++++++++------- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/crates/modelardb_storage/src/lib.rs b/crates/modelardb_storage/src/lib.rs index 6ff9f993..2c5bebe9 100644 --- a/crates/modelardb_storage/src/lib.rs +++ b/crates/modelardb_storage/src/lib.rs @@ -95,7 +95,7 @@ pub fn register_normal_table( delta_table: DeltaTable, data_sink: Arc, ) -> Result<()> { - let normal_table = Arc::new(NormalTable::new(delta_table, data_sink)); + let normal_table = Arc::new(NormalTable::new(delta_table, Some(data_sink))); session_context.register_table(table_name, normal_table)?; Ok(()) diff --git a/crates/modelardb_storage/src/query/normal_table.rs b/crates/modelardb_storage/src/query/normal_table.rs index a30f74d7..73d446d3 100644 --- a/crates/modelardb_storage/src/query/normal_table.rs +++ b/crates/modelardb_storage/src/query/normal_table.rs @@ -13,10 +13,10 @@ * limitations under the License. */ -//! Implementation of [`NormalTable`] which allows normal tables to be queried through Apache DataFusion. -//! It wraps a [`DeltaTable`] and forwards most method calls to it. However, for -//! [`TableProvider::scan()`] it updates the [`DeltaTable`] to the latest version and it implements -//! [`TableProvider::insert_into()`] so rows can be inserted with INSERT. +//! Implementation of [`NormalTable`] which allows normal tables and metadata tables to be queried +//! through Apache DataFusion. It wraps a [`DeltaTable`] and forwards most method calls to it. +//! However, for [`TableProvider::scan()`] it updates the [`DeltaTable`] to the latest version. If a +//! [`DataSink`] is provided, [`TableProvider::insert_into()`] is also supported. use std::borrow::Cow; use std::{any::Any, sync::Arc}; @@ -33,23 +33,26 @@ use datafusion::physical_plan::{ExecutionPlan, Statistics}; use deltalake::DeltaTable; use tonic::async_trait; -/// A queryable representation of a normal table. [`NormalTable`] wraps the [`TableProvider`] -/// [`DeltaTable`] and passes most methods calls directly to it. Thus, it can be registered with -/// Apache DataFusion. [`DeltaTable`] is extended in two ways, `delta_table` is updated to the -/// latest snapshot when accessed and support for inserting has been added. +/// A queryable representation of a normal table or a metadata table. [`NormalTable`] wraps the +/// [`TableProvider`] of [`DeltaTable`] and passes most method calls directly to it. Thus, it can be +/// registered with Apache DataFusion. [`DeltaTable`] is extended in two ways, `delta_table` is +/// updated to the latest snapshot when accessed and, if a [`DataSink`] is provided, support for +/// inserting data with INSERT has been added. Metadata tables are registered without a [`DataSink`] +/// since they are managed internally and should not be modified by users. #[derive(Debug)] pub(crate) struct NormalTable { /// Access to the Delta Lake table. delta_table: DeltaTable, - /// Where data should be written to. - data_sink: Arc, + /// Where data should be written to. [`None`] for metadata tables since they should not support + /// INSERT as they are managed internally. + maybe_data_sink: Option>, } impl NormalTable { - pub(crate) fn new(delta_table: DeltaTable, data_sink: Arc) -> Self { + pub(crate) fn new(delta_table: DeltaTable, maybe_data_sink: Option>) -> Self { Self { delta_table, - data_sink, + maybe_data_sink, } } } @@ -128,15 +131,21 @@ impl TableProvider for NormalTable { /// Create an [`ExecutionPlan`] that will insert the result of `input` into the normal table. /// Generally, [`arrow_flight::flight_service_server::FlightService::do_put()`] should be used - /// instead of this method as it is more efficient. Returns a [`DataFusionError::Plan`] if the - /// necessary metadata cannot be retrieved from the Delta Lake. + /// instead of this method as it is more efficient. Returns a [`DataFusionError::Plan`] if no + /// [`DataSink`] was provided (the table is a metadata table) or if the necessary metadata + /// cannot be retrieved from the Delta Lake. async fn insert_into( &self, _state: &dyn Session, input: Arc, _insert_op: InsertOp, ) -> DataFusionResult> { - let file_sink = Arc::new(DataSinkExec::new(input, self.data_sink.clone(), None)); + let data_sink = self.maybe_data_sink.clone().ok_or_else(|| { + DataFusionError::Plan("INSERT is not supported for metadata tables.".to_owned()) + })?; + + let file_sink = Arc::new(DataSinkExec::new(input, data_sink, None)); + Ok(file_sink) } } From 2665d286745d68f813305ef989c3ef0f9ce0e7cc Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Thu, 19 Mar 2026 15:08:52 +0100 Subject: [PATCH 17/48] Remove no longer used MetadataTable struct --- .../modelardb_storage/src/data_folder/mod.rs | 4 +- .../src/query/metadata_table.rs | 85 ------------------- crates/modelardb_storage/src/query/mod.rs | 6 +- 3 files changed, 4 insertions(+), 91 deletions(-) delete mode 100644 crates/modelardb_storage/src/query/metadata_table.rs diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index ef3be87f..4ac35ff0 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -62,7 +62,7 @@ use url::Url; use uuid::Uuid; use crate::error::{ModelarDbStorageError, Result}; -use crate::query::metadata_table::MetadataTable; +use crate::query::normal_table::NormalTable; use crate::{METADATA_FOLDER, TABLE_FOLDER, apache_parquet_writer_properties, sql_and_concat}; /// Types of tables supported by ModelarDB. @@ -553,7 +553,7 @@ impl DataFolder { .await?; let table_reference = TableReference::partial("metadata", table_name); - let metadata_table = Arc::new(MetadataTable::new(delta_table)); + let metadata_table = Arc::new(NormalTable::new(delta_table, None)); self.session_context .register_table(table_reference, metadata_table)?; diff --git a/crates/modelardb_storage/src/query/metadata_table.rs b/crates/modelardb_storage/src/query/metadata_table.rs deleted file mode 100644 index d93bce26..00000000 --- a/crates/modelardb_storage/src/query/metadata_table.rs +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright 2024 The ModelarDB Contributors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -//! Implementation of [`MetadataTable`] which allows metadata tables to be queried through Apache -//! DataFusion. It wraps a [`DeltaTable`] and forwards most method calls to it. However, for -//! [`TableProvider::scan()`] it updates the [`DeltaTable`] to the latest version. - -use std::{any::Any, sync::Arc}; - -use arrow::datatypes::Schema; -use datafusion::catalog::Session; -use datafusion::datasource::{TableProvider, TableType}; -use datafusion::error::{DataFusionError, Result as DataFusionResult}; -use datafusion::logical_expr::Expr; -use datafusion::physical_plan::ExecutionPlan; -use deltalake::DeltaTable; -use tonic::async_trait; - -/// A queryable representation of a metadata table. [`MetadataTable`] wraps the [`TableProvider`] of -/// [`DeltaTable`] and passes most methods calls directly to it. Thus, it can be registered with -/// Apache DataFusion. The only difference from [`DeltaTable`] is that `delta_table` is updated to -/// the latest snapshot when accessed. -#[derive(Debug)] -pub(crate) struct MetadataTable { - /// Access to the Delta Lake table. - delta_table: DeltaTable, -} - -impl MetadataTable { - pub(crate) fn new(delta_table: DeltaTable) -> Self { - Self { delta_table } - } -} - -#[async_trait] -impl TableProvider for MetadataTable { - /// Return `self` as [`Any`] so it can be downcast. - fn as_any(&self) -> &dyn Any { - self.delta_table.as_any() - } - - /// Return the query schema of the metadata table registered with Apache DataFusion. - fn schema(&self) -> Arc { - TableProvider::schema(&self.delta_table) - } - - /// Specify that metadata tables are base tables and not views or temporary tables. - fn table_type(&self) -> TableType { - self.delta_table.table_type() - } - - /// Create an [`ExecutionPlan`] that will scan the metadata table. Returns a - /// [`DataFusionError::Plan`] if the necessary metadata cannot be retrieved. - async fn scan( - &self, - state: &dyn Session, - projection: Option<&Vec>, - filters: &[Expr], - limit: Option, - ) -> DataFusionResult> { - // Clone the Delta Lake table and update it to the latest version. self.data_folder.load( - // &mut self) is not an option due to TypeProvider::scan(&self, ...). Storing the DeltaTable - // in a Mutex and RwLock is also not an option since most of the methods in TypeProvider - // return a reference and the locks will be dropped at the end of the method. - let mut delta_table = self.delta_table.clone(); - delta_table - .load() - .await - .map_err(|error| DataFusionError::Plan(error.to_string()))?; - - delta_table.scan(state, projection, filters, limit).await - } -} diff --git a/crates/modelardb_storage/src/query/mod.rs b/crates/modelardb_storage/src/query/mod.rs index d3f2d3fd..05fd544a 100644 --- a/crates/modelardb_storage/src/query/mod.rs +++ b/crates/modelardb_storage/src/query/mod.rs @@ -13,15 +13,13 @@ * limitations under the License. */ -//! Implementation of types which allow normal tables, metadata tables, and time series tables to be -//! added to Apache DataFusion. This allows them to be queried and small amounts of data to be added -//! with INSERT. +//! Implementation of types which allow normal tables and time series tables to be added to Apache +//! DataFusion. This allows them to be queried and small amounts of data to be added with INSERT. // grid_exec and sorted_join_exec are pub(crate) so the rules added to Apache DataFusion's physical // optimizer can access them. mod generated_as_exec; pub(crate) mod grid_exec; -pub(crate) mod metadata_table; pub(crate) mod normal_table; pub(crate) mod sorted_join_exec; pub(crate) mod time_series_table; From 4acbe6718daade57cab51b5a1567cb3b13850c10 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Thu, 19 Mar 2026 15:12:05 +0100 Subject: [PATCH 18/48] Make metadata methods private --- crates/modelardb_storage/src/data_folder/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index 4ac35ff0..176a684b 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -362,7 +362,7 @@ impl DataFolder { /// Return a [`DeltaTable`] for manipulating the metadata table with `table_name` in the /// Delta Lake, or a [`ModelarDbStorageError`] if a connection to the Delta Lake cannot be /// established or the table does not exist. - pub async fn metadata_delta_table(&self, table_name: &str) -> Result { + async fn metadata_delta_table(&self, table_name: &str) -> Result { let table_path = self.location_of_metadata_table(table_name); self.delta_table_from_path(&table_path).await } @@ -537,7 +537,7 @@ impl DataFolder { /// it in the [`SessionContext`] in the `metadata` schema. If the table already exists, it is /// reused. Return [`ModelarDbStorageError`] if the table cannot be created or cannot be /// registered. - pub async fn create_and_register_metadata_table( + async fn create_and_register_metadata_table( &self, table_name: &str, schema: &Schema, @@ -864,7 +864,7 @@ impl DataFolder { /// Write `columns` to a Delta Lake table with `table_name`. Returns an updated [`DeltaTable`] /// version if the file was written successfully, otherwise returns [`ModelarDbStorageError`]. - pub async fn write_columns_to_metadata_table( + async fn write_columns_to_metadata_table( &self, table_name: &str, columns: Vec, From c3d278ede5cd56641205be281e9cb8d510acbe34 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Thu, 19 Mar 2026 15:23:09 +0100 Subject: [PATCH 19/48] Move DeltaTableWriter to a seperate file in the same module --- crates/modelardb_bulkloader/src/main.rs | 3 +- .../src/data_folder/delta_table_writer.rs | 195 ++++++++++++++++++ .../modelardb_storage/src/data_folder/mod.rs | 170 +-------------- 3 files changed, 201 insertions(+), 167 deletions(-) create mode 100644 crates/modelardb_storage/src/data_folder/delta_table_writer.rs diff --git a/crates/modelardb_bulkloader/src/main.rs b/crates/modelardb_bulkloader/src/main.rs index 0a6f9fcc..5e153eb4 100644 --- a/crates/modelardb_bulkloader/src/main.rs +++ b/crates/modelardb_bulkloader/src/main.rs @@ -38,7 +38,8 @@ use deltalake::{ObjectStore, Path}; use futures::stream::StreamExt; use modelardb_embedded::error::{ModelarDbEmbeddedError, Result}; use modelardb_embedded::operations::Operations; -use modelardb_storage::data_folder::{DataFolder, DeltaTableWriter}; +use modelardb_storage::data_folder::DataFolder; +use modelardb_storage::data_folder::delta_table_writer::DeltaTableWriter; use modelardb_types::types::TimeSeriesTableMetadata; use sysinfo::System; diff --git a/crates/modelardb_storage/src/data_folder/delta_table_writer.rs b/crates/modelardb_storage/src/data_folder/delta_table_writer.rs new file mode 100644 index 00000000..65453306 --- /dev/null +++ b/crates/modelardb_storage/src/data_folder/delta_table_writer.rs @@ -0,0 +1,195 @@ +/* Copyright 2026 The ModelarDB Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//! Implementation of [`DeltaTableWriter`] for transactionally writing +//! [`RecordBatches`](RecordBatch) to a Delta table stored in an object store. Writing can be +//! committed or rolled back to ensure that the Delta table is always in a consistent state. + +use std::sync::Arc; + +use arrow::array::RecordBatch; +use arrow::datatypes::Schema; +use datafusion::catalog::TableProvider; +use datafusion::parquet::file::properties::WriterProperties; +use delta_kernel::table_properties::DataSkippingNumIndexedCols; +use deltalake::delta_datafusion::DeltaDataChecker; +use deltalake::kernel::transaction::{CommitBuilder, CommitProperties}; +use deltalake::kernel::{Action, Add}; +use deltalake::operations::write::writer::{DeltaWriter, WriterConfig}; +use deltalake::protocol::{DeltaOperation, SaveMode}; +use deltalake::DeltaTable; +use object_store::path::Path; +use object_store::ObjectStore; +use uuid::Uuid; + +use crate::error::{ModelarDbStorageError, Result}; + +/// Functionality for transactionally writing [`RecordBatches`](RecordBatch) to a Delta table stored +/// in an object store. +pub struct DeltaTableWriter { + /// Delta table that all of the record batches will be written to. + delta_table: DeltaTable, + /// Checker that ensures all of the record batches match the table. + delta_data_checker: DeltaDataChecker, + /// Write operation that will be committed to the Delta table. + delta_operation: DeltaOperation, + /// Unique identifier for this write operation to the Delta table. + operation_id: Uuid, + /// Writes record batches to the Delta table as Apache Parquet files. + delta_writer: DeltaWriter, +} + +impl DeltaTableWriter { + /// Create a new [`DeltaTableWriter`]. Returns a [`ModelarDbStorageError`] if the state of the + /// Delta table cannot be loaded from `delta_table`. + pub fn try_new( + delta_table: DeltaTable, + partition_columns: Vec, + writer_properties: WriterProperties, + ) -> Result { + // Checker for if record batches match the table’s invariants, constraints, and nullability. + let delta_table_state = delta_table.snapshot()?; + let snapshot = delta_table_state.snapshot(); + let delta_data_checker = DeltaDataChecker::new(snapshot); + + // Operation that will be committed. + let delta_operation = DeltaOperation::Write { + mode: SaveMode::Append, + partition_by: if partition_columns.is_empty() { + None + } else { + Some(partition_columns.clone()) + }, + predicate: None, + }; + + // A UUID version 4 is used as the operation id to match the existing Operation trait in the + // deltalake crate as it is pub(trait) and thus cannot be used directly in DeltaTableWriter. + let operation_id = Uuid::new_v4(); + + // Writer that will write the record batches. + let object_store = delta_table.log_store().object_store(Some(operation_id)); + let table_schema: Arc = TableProvider::schema(&delta_table); + let num_indexed_cols = + DataSkippingNumIndexedCols::NumColumns(table_schema.fields.len() as u64); + let writer_config = WriterConfig::new( + table_schema, + partition_columns, + Some(writer_properties), + None, + None, + num_indexed_cols, + None, + ); + let delta_writer = DeltaWriter::new(object_store, writer_config); + + Ok(Self { + delta_table, + delta_data_checker, + delta_operation, + operation_id, + delta_writer, + }) + } + + /// Write `record_batch` to the Delta table. Returns a [`ModelarDbStorageError`] if the + /// [`RecordBatches`](RecordBatch) does not match the schema of the Delta table or if the + /// writing fails. + pub async fn write(&mut self, record_batch: &RecordBatch) -> Result<()> { + self.delta_data_checker.check_batch(record_batch).await?; + self.delta_writer.write(record_batch).await?; + Ok(()) + } + + /// Write all `record_batches` to the Delta table. Returns a [`ModelarDbStorageError`] if one of + /// the [`RecordBatches`](RecordBatch) does not match the schema of the Delta table or if the + /// writing fails. + pub async fn write_all(&mut self, record_batches: &[RecordBatch]) -> Result<()> { + for record_batch in record_batches { + self.write(record_batch).await?; + } + Ok(()) + } + + /// Consume the [`DeltaTableWriter`], finish the writing, and commit the files that have been + /// written to the log. If an error occurs before the commit is finished, the already written + /// files are deleted if possible. Returns a [`ModelarDbStorageError`] if an error occurs when + /// finishing the writing, committing the files that have been written, deleting the written + /// files, or updating the [`DeltaTable`]. + pub async fn commit(mut self) -> Result { + // Write the remaining buffered files. + let added_files = self.delta_writer.close().await?; + + // Clone added_files in case of rollback. + let actions = added_files + .clone() + .into_iter() + .map(Action::Add) + .collect::>(); + + // Prepare all inputs to the commit. + let object_store = self.delta_table.object_store(); + let commit_properties = CommitProperties::default(); + let table_data = match self.delta_table.snapshot() { + Ok(table_data) => table_data, + Err(delta_table_error) => { + delete_added_files(&object_store, added_files).await?; + return Err(ModelarDbStorageError::DeltaLake(delta_table_error)); + } + }; + let log_store = self.delta_table.log_store(); + + // Construct the commit to be written. + let commit_builder = CommitBuilder::from(commit_properties) + .with_actions(actions) + .with_operation_id(self.operation_id) + .build(Some(table_data), log_store, self.delta_operation); + + // Write the commit to the Delta table. + let _finalized_commit = match commit_builder.await { + Ok(finalized_commit) => finalized_commit, + Err(delta_table_error) => { + delete_added_files(&object_store, added_files).await?; + return Err(ModelarDbStorageError::DeltaLake(delta_table_error)); + } + }; + + // Return Delta table with the commit. + self.delta_table.load().await?; + Ok(self.delta_table) + } + + /// Consume the [`DeltaTableWriter`], abort the writing, and delete all of the files that have + /// already been written. Returns a [`ModelarDbStorageError`] if an error occurs when aborting + /// the writing or deleting the files that have already been written. Rollback is not called + /// automatically as drop() is not async and async_drop() is not yet a stable API. + pub async fn rollback(self) -> Result { + let object_store = self.delta_table.object_store(); + let added_files = self.delta_writer.close().await?; + delete_added_files(&object_store, added_files).await?; + Ok(self.delta_table) + } +} + +/// Delete the `added_files` from `object_store`. Returns a [`ModelarDbStorageError`] if a file +/// could not be deleted. It is a function instead of a method on [`DeltaTableWriter`] so it can be +/// called by [`DeltaTableWriter`] after the [`DeltaWriter`] is closed without lifetime issues. +async fn delete_added_files(object_store: &dyn ObjectStore, added_files: Vec) -> Result<()> { + for add_file in added_files { + let path: Path = Path::from(add_file.path); + object_store.delete(&path).await?; + } + Ok(()) +} diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index 176a684b..499bfdcc 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -16,6 +16,7 @@ //! Implementation of the type used to interact with local and remote storage through a Delta Lake. pub mod cluster; +pub mod delta_table_writer; use std::collections::HashMap; use std::path::Path as StdPath; @@ -33,18 +34,13 @@ use datafusion::catalog::TableProvider; use datafusion::common::{DFSchema, TableReference, ToDFSchema}; use datafusion::datasource::sink::DataSink; use datafusion::logical_expr::{Expr, lit}; -use datafusion::parquet::file::properties::WriterProperties; use datafusion::prelude::{SessionContext, col}; use datafusion_proto::bytes::Serializeable; use delta_kernel::engine::arrow_conversion::TryIntoKernel; -use delta_kernel::table_properties::DataSkippingNumIndexedCols; -use deltalake::delta_datafusion::DeltaDataChecker; -use deltalake::kernel::transaction::{CommitBuilder, CommitProperties}; -use deltalake::kernel::{Action, Add, StructField}; +use deltalake::kernel::StructField; use deltalake::operations::create::CreateBuilder; -use deltalake::operations::write::writer::{DeltaWriter, WriterConfig}; use deltalake::parquet::file::metadata::SortingColumn; -use deltalake::protocol::{DeltaOperation, SaveMode}; +use deltalake::protocol::SaveMode; use deltalake::{DeltaTable, DeltaTableError}; use futures::{StreamExt, TryStreamExt}; use modelardb_types::functions::{try_convert_bytes_to_schema, try_convert_schema_to_bytes}; @@ -59,8 +55,8 @@ use object_store::local::LocalFileSystem; use object_store::memory::InMemory; use object_store::path::Path; use url::Url; -use uuid::Uuid; +use crate::data_folder::delta_table_writer::DeltaTableWriter; use crate::error::{ModelarDbStorageError, Result}; use crate::query::normal_table::NormalTable; use crate::{METADATA_FOLDER, TABLE_FOLDER, apache_parquet_writer_properties, sql_and_concat}; @@ -1093,164 +1089,6 @@ impl DataFolder { } } -/// Functionality for transactionally writing [`RecordBatches`](RecordBatch) to a Delta table stored -/// in an object store. -pub struct DeltaTableWriter { - /// Delta table that all of the record batches will be written to. - delta_table: DeltaTable, - /// Checker that ensures all of the record batches match the table. - delta_data_checker: DeltaDataChecker, - /// Write operation that will be committed to the Delta table. - delta_operation: DeltaOperation, - /// Unique identifier for this write operation to the Delta table. - operation_id: Uuid, - /// Writes record batches to the Delta table as Apache Parquet files. - delta_writer: DeltaWriter, -} - -impl DeltaTableWriter { - /// Create a new [`DeltaTableWriter`]. Returns a [`ModelarDbStorageError`] if the state of the - /// Delta table cannot be loaded from `delta_table`. - pub fn try_new( - delta_table: DeltaTable, - partition_columns: Vec, - writer_properties: WriterProperties, - ) -> Result { - // Checker for if record batches match the table’s invariants, constraints, and nullability. - let delta_table_state = delta_table.snapshot()?; - let snapshot = delta_table_state.snapshot(); - let delta_data_checker = DeltaDataChecker::new(snapshot); - - // Operation that will be committed. - let delta_operation = DeltaOperation::Write { - mode: SaveMode::Append, - partition_by: if partition_columns.is_empty() { - None - } else { - Some(partition_columns.clone()) - }, - predicate: None, - }; - - // A UUID version 4 is used as the operation id to match the existing Operation trait in the - // deltalake crate as it is pub(trait) and thus cannot be used directly in DeltaTableWriter. - let operation_id = Uuid::new_v4(); - - // Writer that will write the record batches. - let object_store = delta_table.log_store().object_store(Some(operation_id)); - let table_schema: Arc = TableProvider::schema(&delta_table); - let num_indexed_cols = - DataSkippingNumIndexedCols::NumColumns(table_schema.fields.len() as u64); - let writer_config = WriterConfig::new( - table_schema, - partition_columns, - Some(writer_properties), - None, - None, - num_indexed_cols, - None, - ); - let delta_writer = DeltaWriter::new(object_store, writer_config); - - Ok(Self { - delta_table, - delta_data_checker, - delta_operation, - operation_id, - delta_writer, - }) - } - - /// Write `record_batch` to the Delta table. Returns a [`ModelarDbStorageError`] if the - /// [`RecordBatches`](RecordBatch) does not match the schema of the Delta table or if the - /// writing fails. - pub async fn write(&mut self, record_batch: &RecordBatch) -> Result<()> { - self.delta_data_checker.check_batch(record_batch).await?; - self.delta_writer.write(record_batch).await?; - Ok(()) - } - - /// Write all `record_batches` to the Delta table. Returns a [`ModelarDbStorageError`] if one of - /// the [`RecordBatches`](RecordBatch) does not match the schema of the Delta table or if the - /// writing fails. - pub async fn write_all(&mut self, record_batches: &[RecordBatch]) -> Result<()> { - for record_batch in record_batches { - self.write(record_batch).await?; - } - Ok(()) - } - - /// Consume the [`DeltaTableWriter`], finish the writing, and commit the files that have been - /// written to the log. If an error occurs before the commit is finished, the already written - /// files are deleted if possible. Returns a [`ModelarDbStorageError`] if an error occurs when - /// finishing the writing, committing the files that have been written, deleting the written - /// files, or updating the [`DeltaTable`]. - pub async fn commit(mut self) -> Result { - // Write the remaining buffered files. - let added_files = self.delta_writer.close().await?; - - // Clone added_files in case of rollback. - let actions = added_files - .clone() - .into_iter() - .map(Action::Add) - .collect::>(); - - // Prepare all inputs to the commit. - let object_store = self.delta_table.object_store(); - let commit_properties = CommitProperties::default(); - let table_data = match self.delta_table.snapshot() { - Ok(table_data) => table_data, - Err(delta_table_error) => { - delete_added_files(&object_store, added_files).await?; - return Err(ModelarDbStorageError::DeltaLake(delta_table_error)); - } - }; - let log_store = self.delta_table.log_store(); - - // Construct the commit to be written. - let commit_builder = CommitBuilder::from(commit_properties) - .with_actions(actions) - .with_operation_id(self.operation_id) - .build(Some(table_data), log_store, self.delta_operation); - - // Write the commit to the Delta table. - let _finalized_commit = match commit_builder.await { - Ok(finalized_commit) => finalized_commit, - Err(delta_table_error) => { - delete_added_files(&object_store, added_files).await?; - return Err(ModelarDbStorageError::DeltaLake(delta_table_error)); - } - }; - - // Return Delta table with the commit. - self.delta_table.load().await?; - Ok(self.delta_table) - } - - /// Consume the [`DeltaTableWriter`], abort the writing, and delete all of the files that have - /// already been written. Returns a [`ModelarDbStorageError`] if an error occurs when aborting - /// the writing or deleting the files that have already been written. Rollback is not called - /// automatically as drop() is not async and async_drop() is not yet a stable API. - pub async fn rollback(self) -> Result { - let object_store = self.delta_table.object_store(); - let added_files = self.delta_writer.close().await?; - delete_added_files(&object_store, added_files).await?; - Ok(self.delta_table) - } -} - -/// Delete the `added_files` from `object_store`. Returns a [`ModelarDbStorageError`] if a file -/// could not be deleted. It is a function instead of a method on [`DeltaTableWriter`] so it can be -/// called by [`DeltaTableWriter`] after the [`DeltaWriter`] is closed without lifetime issues. -async fn delete_added_files(object_store: &dyn ObjectStore, added_files: Vec) -> Result<()> { - for add_file in added_files { - let path: Path = Path::from(add_file.path); - object_store.delete(&path).await?; - } - Ok(()) -} - #[cfg(test)] mod tests { use super::*; From 725b8b03d4bd20b3f7ecda6cdde5819814248c69 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Thu, 19 Mar 2026 15:30:16 +0100 Subject: [PATCH 20/48] Use a simpler check for if the table is a time series table in table_writer --- crates/modelardb_storage/src/data_folder/mod.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index 499bfdcc..1255775c 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -475,11 +475,7 @@ impl DataFolder { /// the table does not exist. pub async fn table_writer(&self, table_name: &str) -> Result { let delta_table = self.delta_table(table_name).await?; - if self - .time_series_table_metadata_for_registered_time_series_table(table_name) - .await - .is_some() - { + if self.is_time_series_table(table_name).await? { self.time_series_table_writer(delta_table).await } else { self.normal_or_metadata_table_writer(delta_table).await From ac7a2a89eb3bd14bd474284152dcd2a64b14de6b Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Thu, 19 Mar 2026 15:44:09 +0100 Subject: [PATCH 21/48] Use a single method for writing to both normal tables and time series tables --- .../src/operations/data_folder.rs | 12 ++++----- crates/modelardb_server/src/context.rs | 7 ++---- .../src/storage/compressed_data_manager.rs | 4 +-- .../src/storage/data_transfer.rs | 11 +++----- .../src/data_folder/delta_table_writer.rs | 4 +-- .../modelardb_storage/src/data_folder/mod.rs | 25 ++++--------------- 6 files changed, 21 insertions(+), 42 deletions(-) diff --git a/crates/modelardb_embedded/src/operations/data_folder.rs b/crates/modelardb_embedded/src/operations/data_folder.rs index 7f9a5c7b..92700c89 100644 --- a/crates/modelardb_embedded/src/operations/data_folder.rs +++ b/crates/modelardb_embedded/src/operations/data_folder.rs @@ -215,7 +215,7 @@ impl Operations for DataFolder { &uncompressed_data, )?; - self.write_compressed_segments_to_time_series_table(table_name, compressed_data) + self.write_record_batches(table_name, compressed_data) .await?; } else if let Some(normal_table_schema) = self.normal_table_schema(table_name).await { // Normal table. @@ -223,7 +223,7 @@ impl Operations for DataFolder { return Err(schema_mismatch_error); } - self.write_record_batches_to_normal_table(table_name, vec![uncompressed_data]) + self.write_record_batches(table_name, vec![uncompressed_data]) .await?; } else { return Err(ModelarDbEmbeddedError::InvalidArgument(format!( @@ -282,7 +282,7 @@ impl Operations for DataFolder { let record_batches = common::collect(record_batch_stream).await?; target_data_folder - .write_record_batches_to_normal_table(target_table_name, record_batches) + .write_record_batches(target_table_name, record_batches) .await?; Ok(()) @@ -407,7 +407,7 @@ impl Operations for DataFolder { // Write read data to target_table_name in target. target_data_folder - .write_compressed_segments_to_time_series_table(target_table_name, record_batches) + .write_record_batches(target_table_name, record_batches) .await?; Ok(()) @@ -455,7 +455,7 @@ impl Operations for DataFolder { let record_batches: Vec = stream.try_collect().await?; target_data_folder - .write_compressed_segments_to_time_series_table(target_table_name, record_batches) + .write_record_batches(target_table_name, record_batches) .await?; } else if let (Some(source_normal_table_schema), Some(target_normal_table_schema)) = ( self.normal_table_schema(source_table_name).await, @@ -474,7 +474,7 @@ impl Operations for DataFolder { let record_batches: Vec = stream.try_collect().await?; target_data_folder - .write_record_batches_to_normal_table(target_table_name, record_batches) + .write_record_batches(target_table_name, record_batches) .await?; } else { return Err(ModelarDbEmbeddedError::InvalidArgument(format!( diff --git a/crates/modelardb_server/src/context.rs b/crates/modelardb_server/src/context.rs index c015e925..c88b7be6 100644 --- a/crates/modelardb_server/src/context.rs +++ b/crates/modelardb_server/src/context.rs @@ -747,10 +747,7 @@ mod tests { // Write data to the normal table. let local_data_folder = &context.data_folders.local_data_folder; local_data_folder - .write_record_batches_to_normal_table( - NORMAL_TABLE_NAME, - vec![table::normal_table_record_batch()], - ) + .write_record_batches(NORMAL_TABLE_NAME, vec![table::normal_table_record_batch()]) .await .unwrap(); @@ -818,7 +815,7 @@ mod tests { // Write data to the time series table. let local_data_folder = &context.data_folders.local_data_folder; local_data_folder - .write_compressed_segments_to_time_series_table( + .write_record_batches( TIME_SERIES_TABLE_NAME, vec![table::compressed_segments_record_batch()], ) diff --git a/crates/modelardb_server/src/storage/compressed_data_manager.rs b/crates/modelardb_server/src/storage/compressed_data_manager.rs index 88931d12..52a42869 100644 --- a/crates/modelardb_server/src/storage/compressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/compressed_data_manager.rs @@ -87,7 +87,7 @@ impl CompressedDataManager { let record_batch_size_in_bytes = record_batch.get_array_memory_size(); self.local_data_folder - .write_record_batches_to_normal_table(table_name, vec![record_batch]) + .write_record_batches(table_name, vec![record_batch]) .await?; // Inform the data transfer component about the new data if a remote data folder was @@ -246,7 +246,7 @@ impl CompressedDataManager { let compressed_data_buffer_size_in_bytes = compressed_data_buffer.size_in_bytes; let compressed_segments = compressed_data_buffer.record_batches(); self.local_data_folder - .write_compressed_segments_to_time_series_table(table_name, compressed_segments) + .write_record_batches(table_name, compressed_segments) .await?; // Inform the data transfer component about the new data if a remote data folder was diff --git a/crates/modelardb_server/src/storage/data_transfer.rs b/crates/modelardb_server/src/storage/data_transfer.rs index 31d8471f..2a4be5ff 100644 --- a/crates/modelardb_server/src/storage/data_transfer.rs +++ b/crates/modelardb_server/src/storage/data_transfer.rs @@ -250,11 +250,11 @@ impl DataTransfer { .await? { self.remote_data_folder - .write_compressed_segments_to_time_series_table(table_name, record_batches) + .write_record_batches(table_name, record_batches) .await?; } else { self.remote_data_folder - .write_record_batches_to_normal_table(table_name, record_batches) + .write_record_batches(table_name, record_batches) .await?; } @@ -493,16 +493,13 @@ mod tests { for _ in 0..batch_write_count { // Write to the normal table. local_data_folder - .write_record_batches_to_normal_table( - NORMAL_TABLE_NAME, - vec![table::normal_table_record_batch()], - ) + .write_record_batches(NORMAL_TABLE_NAME, vec![table::normal_table_record_batch()]) .await .unwrap(); // Write to the time series table. local_data_folder - .write_compressed_segments_to_time_series_table( + .write_record_batches( TIME_SERIES_TABLE_NAME, vec![table::compressed_segments_record_batch()], ) diff --git a/crates/modelardb_storage/src/data_folder/delta_table_writer.rs b/crates/modelardb_storage/src/data_folder/delta_table_writer.rs index 65453306..bb4eb3e4 100644 --- a/crates/modelardb_storage/src/data_folder/delta_table_writer.rs +++ b/crates/modelardb_storage/src/data_folder/delta_table_writer.rs @@ -24,14 +24,14 @@ use arrow::datatypes::Schema; use datafusion::catalog::TableProvider; use datafusion::parquet::file::properties::WriterProperties; use delta_kernel::table_properties::DataSkippingNumIndexedCols; +use deltalake::DeltaTable; use deltalake::delta_datafusion::DeltaDataChecker; use deltalake::kernel::transaction::{CommitBuilder, CommitProperties}; use deltalake::kernel::{Action, Add}; use deltalake::operations::write::writer::{DeltaWriter, WriterConfig}; use deltalake::protocol::{DeltaOperation, SaveMode}; -use deltalake::DeltaTable; -use object_store::path::Path; use object_store::ObjectStore; +use object_store::path::Path; use uuid::Uuid; use crate::error::{ModelarDbStorageError, Result}; diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index 1255775c..e0ea832b 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -868,34 +868,19 @@ impl DataFolder { .await } - /// Write `record_batches` to a Delta Lake table for a normal table with `table_name`. Returns - /// an updated [`DeltaTable`] version if the file was written successfully, otherwise returns - /// [`ModelarDbStorageError`]. - pub async fn write_record_batches_to_normal_table( + /// Write `record_batches` to the table with `table_name` in the Delta Lake. The correct + /// writer is selected automatically based on the table type. Returns an updated [`DeltaTable`] + /// if the file was written successfully, otherwise returns [`ModelarDbStorageError`]. + pub async fn write_record_batches( &self, table_name: &str, record_batches: Vec, ) -> Result { - let delta_table = self.delta_table(table_name).await?; - let delta_table_writer = self.normal_or_metadata_table_writer(delta_table).await?; + let delta_table_writer = self.table_writer(table_name).await?; self.write_record_batches_to_table(delta_table_writer, record_batches) .await } - /// Write `compressed_segments` to a Delta Lake table for a time series table with `table_name`. - /// Returns an updated [`DeltaTable`] if the file was written successfully, otherwise returns - /// [`ModelarDbStorageError`]. - pub async fn write_compressed_segments_to_time_series_table( - &self, - table_name: &str, - compressed_segments: Vec, - ) -> Result { - let delta_table = self.delta_table(table_name).await?; - let delta_table_writer = self.time_series_table_writer(delta_table).await?; - self.write_record_batches_to_table(delta_table_writer, compressed_segments) - .await - } - /// Write `record_batches` to the `delta_table_writer` and commit. Returns an updated /// [`DeltaTable`] if all `record_batches` are written and committed successfully, otherwise it /// rolls back all writes done using `delta_table_writer` and returns [`ModelarDbStorageError`]. From 4232217ff473688d2b6dba55d674c79dac9f66a8 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Thu, 19 Mar 2026 15:48:10 +0100 Subject: [PATCH 22/48] Rename normal_or_metadata_table_writer to normal_table_writer --- crates/modelardb_storage/src/data_folder/mod.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index e0ea832b..af5bf088 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -478,7 +478,7 @@ impl DataFolder { if self.is_time_series_table(table_name).await? { self.time_series_table_writer(delta_table).await } else { - self.normal_or_metadata_table_writer(delta_table).await + self.normal_table_writer(delta_table).await } } @@ -514,10 +514,10 @@ impl DataFolder { DeltaTableWriter::try_new(delta_table, partition_columns, writer_properties) } - /// Return a [`DeltaTableWriter`] for writing to the table corresponding to `delta_table` in the - /// Delta Lake, or a [`ModelarDbStorageError`] if a connection to the Delta Lake cannot be - /// established or the table does not exist. - async fn normal_or_metadata_table_writer( + /// Return a [`DeltaTableWriter`] for writing to the normal table corresponding to `delta_table` + /// in the Delta Lake, or a [`ModelarDbStorageError`] if a connection to the Delta Lake cannot + /// be established or the table does not exist. + async fn normal_table_writer( &self, delta_table: DeltaTable, ) -> Result { @@ -863,7 +863,7 @@ impl DataFolder { ) -> Result { let delta_table = self.metadata_delta_table(table_name).await?; let record_batch = RecordBatch::try_new(TableProvider::schema(&delta_table), columns)?; - let delta_table_writer = self.normal_or_metadata_table_writer(delta_table).await?; + let delta_table_writer = self.normal_table_writer(delta_table).await?; self.write_record_batches_to_table(delta_table_writer, vec![record_batch]) .await } From 5588a67e2da4b4f7031c596cf2c5b04b19f56f92 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Thu, 19 Mar 2026 16:19:14 +0100 Subject: [PATCH 23/48] Remove unnecessary if statement --- .../src/storage/data_transfer.rs | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/crates/modelardb_server/src/storage/data_transfer.rs b/crates/modelardb_server/src/storage/data_transfer.rs index 2a4be5ff..cad3ff90 100644 --- a/crates/modelardb_server/src/storage/data_transfer.rs +++ b/crates/modelardb_server/src/storage/data_transfer.rs @@ -244,19 +244,9 @@ impl DataTransfer { debug!("Transferring {current_size_in_bytes} bytes for the table '{table_name}'.",); // Write the data to the remote Delta Lake. - if self - .local_data_folder - .is_time_series_table(table_name) - .await? - { - self.remote_data_folder - .write_record_batches(table_name, record_batches) - .await?; - } else { - self.remote_data_folder - .write_record_batches(table_name, record_batches) - .await?; - } + self.remote_data_folder + .write_record_batches(table_name, record_batches) + .await?; // Delete the data that has been transferred to the remote Delta Lake. self.local_data_folder.truncate_table(table_name).await?; From 731e387413f7ee7ba79cc076b21f5a5772a370b2 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Thu, 19 Mar 2026 16:20:55 +0100 Subject: [PATCH 24/48] Add try_new method to simplify constructor methods --- .../modelardb_storage/src/data_folder/mod.rs | 61 +++++++------------ 1 file changed, 22 insertions(+), 39 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index af5bf088..846b9aa2 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -101,17 +101,12 @@ impl DataFolder { /// Create a new [`DataFolder`] that manages the Delta tables in memory. pub async fn open_memory() -> Result { - let data_folder = Self { - location: "memory:///modelardb".to_owned(), - storage_options: HashMap::new(), - object_store: Arc::new(InMemory::new()), - delta_table_cache: DashMap::new(), - session_context: Arc::new(crate::create_session_context()), - }; - - data_folder.create_and_register_metadata_tables().await?; - - Ok(data_folder) + Self::try_new( + "memory:///modelardb".to_owned(), + HashMap::new(), + Arc::new(InMemory::new()), + ) + .await } /// Create a new [`DataFolder`] that manages the Delta tables in `data_folder_path`. Returns a @@ -132,17 +127,7 @@ impl DataFolder { .ok_or_else(|| DeltaTableError::generic("Local data folder path is not UTF-8."))? .to_owned(); - let data_folder = Self { - location, - storage_options: HashMap::new(), - object_store: Arc::new(object_store), - delta_table_cache: DashMap::new(), - session_context: Arc::new(crate::create_session_context()), - }; - - data_folder.create_and_register_metadata_tables().await?; - - Ok(data_folder) + Self::try_new(location, HashMap::new(), Arc::new(object_store)).await } /// Create a new [`DataFolder`] that manages Delta tables in the remote object store given by @@ -221,17 +206,7 @@ impl DataFolder { ) .build()?; - let data_folder = DataFolder { - location, - storage_options, - object_store: Arc::new(object_store), - delta_table_cache: DashMap::new(), - session_context: Arc::new(crate::create_session_context()), - }; - - data_folder.create_and_register_metadata_tables().await?; - - Ok(data_folder) + Self::try_new(location, storage_options, Arc::new(object_store)).await } /// Create a new [`DataFolder`] that manages the Delta tables in an object store with an @@ -254,10 +229,21 @@ impl DataFolder { .map_err(|error| ModelarDbStorageError::InvalidArgument(error.to_string()))?; let (object_store, _path) = object_store::parse_url_opts(&url, &storage_options)?; - let data_folder = DataFolder { + Self::try_new(location, storage_options, Arc::new(object_store)).await + } + + /// Create a new [`DataFolder`] with the given `location`, `storage_options`, and `object_store`, + /// create the metadata tables, and return the [`DataFolder`]. Returns [`ModelarDbStorageError`] + /// if the metadata tables cannot be created. + async fn try_new( + location: String, + storage_options: HashMap, + object_store: Arc, + ) -> Result { + let data_folder = Self { location, storage_options, - object_store: Arc::new(object_store), + object_store, delta_table_cache: DashMap::new(), session_context: Arc::new(crate::create_session_context()), }; @@ -517,10 +503,7 @@ impl DataFolder { /// Return a [`DeltaTableWriter`] for writing to the normal table corresponding to `delta_table` /// in the Delta Lake, or a [`ModelarDbStorageError`] if a connection to the Delta Lake cannot /// be established or the table does not exist. - async fn normal_table_writer( - &self, - delta_table: DeltaTable, - ) -> Result { + async fn normal_table_writer(&self, delta_table: DeltaTable) -> Result { let writer_properties = apache_parquet_writer_properties(None); DeltaTableWriter::try_new(delta_table, vec![], writer_properties) } From ada942615b2ec22cdab47b16763f1b0a196c9597 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Thu, 19 Mar 2026 16:36:41 +0100 Subject: [PATCH 25/48] Move DeltaTableWriter constructors into DeltaTableWriter struct --- .../src/data_folder/delta_table_writer.rs | 39 ++++++++++++++ .../modelardb_storage/src/data_folder/mod.rs | 51 ++----------------- 2 files changed, 44 insertions(+), 46 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/delta_table_writer.rs b/crates/modelardb_storage/src/data_folder/delta_table_writer.rs index bb4eb3e4..a0fd445b 100644 --- a/crates/modelardb_storage/src/data_folder/delta_table_writer.rs +++ b/crates/modelardb_storage/src/data_folder/delta_table_writer.rs @@ -22,6 +22,7 @@ use std::sync::Arc; use arrow::array::RecordBatch; use arrow::datatypes::Schema; use datafusion::catalog::TableProvider; +use datafusion::parquet::file::metadata::SortingColumn; use datafusion::parquet::file::properties::WriterProperties; use delta_kernel::table_properties::DataSkippingNumIndexedCols; use deltalake::DeltaTable; @@ -30,10 +31,12 @@ use deltalake::kernel::transaction::{CommitBuilder, CommitProperties}; use deltalake::kernel::{Action, Add}; use deltalake::operations::write::writer::{DeltaWriter, WriterConfig}; use deltalake::protocol::{DeltaOperation, SaveMode}; +use modelardb_types::schemas::{COMPRESSED_SCHEMA, FIELD_COLUMN}; use object_store::ObjectStore; use object_store::path::Path; use uuid::Uuid; +use crate::apache_parquet_writer_properties; use crate::error::{ModelarDbStorageError, Result}; /// Functionality for transactionally writing [`RecordBatches`](RecordBatch) to a Delta table stored @@ -52,6 +55,42 @@ pub struct DeltaTableWriter { } impl DeltaTableWriter { + /// Create a [`DeltaTableWriter`] configured for writing to a normal table. + pub(crate) fn try_new_for_normal_table(delta_table: DeltaTable) -> Result { + let writer_properties = apache_parquet_writer_properties(None); + Self::try_new(delta_table, vec![], writer_properties) + } + + /// Create a [`DeltaTableWriter`] configured for writing to a time series table. + pub(crate) fn try_new_for_time_series_table(delta_table: DeltaTable) -> Result { + let partition_columns = vec![FIELD_COLUMN.to_owned()]; + + // Specify that the file must be sorted by the tag columns and then by start_time. + let base_compressed_schema_len = COMPRESSED_SCHEMA.0.fields().len(); + let compressed_schema_len = TableProvider::schema(&delta_table).fields().len(); + let sorting_columns_len = (compressed_schema_len - base_compressed_schema_len) + 1; + let mut sorting_columns = Vec::with_capacity(sorting_columns_len); + + // Compressed segments have the tag columns at the end of the schema. + for tag_column_index in base_compressed_schema_len..compressed_schema_len { + sorting_columns.push(SortingColumn { + column_idx: tag_column_index as i32, + descending: false, + nulls_first: false, + }); + } + + // Compressed segments store the first timestamp in the second column. + sorting_columns.push(SortingColumn { + column_idx: 1, + descending: false, + nulls_first: false, + }); + + let writer_properties = apache_parquet_writer_properties(Some(sorting_columns)); + Self::try_new(delta_table, partition_columns, writer_properties) + } + /// Create a new [`DeltaTableWriter`]. Returns a [`ModelarDbStorageError`] if the state of the /// Delta table cannot be loaded from `delta_table`. pub fn try_new( diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index 846b9aa2..c4066035 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -39,12 +39,11 @@ use datafusion_proto::bytes::Serializeable; use delta_kernel::engine::arrow_conversion::TryIntoKernel; use deltalake::kernel::StructField; use deltalake::operations::create::CreateBuilder; -use deltalake::parquet::file::metadata::SortingColumn; use deltalake::protocol::SaveMode; use deltalake::{DeltaTable, DeltaTableError}; use futures::{StreamExt, TryStreamExt}; use modelardb_types::functions::{try_convert_bytes_to_schema, try_convert_schema_to_bytes}; -use modelardb_types::schemas::{COMPRESSED_SCHEMA, FIELD_COLUMN}; +use modelardb_types::schemas::FIELD_COLUMN; use modelardb_types::types::{ ArrowValue, ErrorBound, GeneratedColumn, MAX_RETENTION_PERIOD_IN_SECONDS, TimeSeriesTableMetadata, @@ -59,7 +58,7 @@ use url::Url; use crate::data_folder::delta_table_writer::DeltaTableWriter; use crate::error::{ModelarDbStorageError, Result}; use crate::query::normal_table::NormalTable; -use crate::{METADATA_FOLDER, TABLE_FOLDER, apache_parquet_writer_properties, sql_and_concat}; +use crate::{METADATA_FOLDER, TABLE_FOLDER, sql_and_concat}; /// Types of tables supported by ModelarDB. enum TableType { @@ -462,52 +461,12 @@ impl DataFolder { pub async fn table_writer(&self, table_name: &str) -> Result { let delta_table = self.delta_table(table_name).await?; if self.is_time_series_table(table_name).await? { - self.time_series_table_writer(delta_table).await + DeltaTableWriter::try_new_for_time_series_table(delta_table) } else { - self.normal_table_writer(delta_table).await + DeltaTableWriter::try_new_for_normal_table(delta_table) } } - /// Return a [`DeltaTableWriter`] for writing to the time series table corresponding to - /// `delta_table` in the Delta Lake, or a [`ModelarDbStorageError`] if a connection to the Delta - /// Lake cannot be established or the table does not exist. - async fn time_series_table_writer(&self, delta_table: DeltaTable) -> Result { - let partition_columns = vec![FIELD_COLUMN.to_owned()]; - - // Specify that the file must be sorted by the tag columns and then by start_time. - let base_compressed_schema_len = COMPRESSED_SCHEMA.0.fields().len(); - let compressed_schema_len = TableProvider::schema(&delta_table).fields().len(); - let sorting_columns_len = (compressed_schema_len - base_compressed_schema_len) + 1; - let mut sorting_columns = Vec::with_capacity(sorting_columns_len); - - // Compressed segments have the tag columns at the end of the schema. - for tag_column_index in base_compressed_schema_len..compressed_schema_len { - sorting_columns.push(SortingColumn { - column_idx: tag_column_index as i32, - descending: false, - nulls_first: false, - }); - } - - // Compressed segments store the first timestamp in the second column. - sorting_columns.push(SortingColumn { - column_idx: 1, - descending: false, - nulls_first: false, - }); - - let writer_properties = apache_parquet_writer_properties(Some(sorting_columns)); - DeltaTableWriter::try_new(delta_table, partition_columns, writer_properties) - } - - /// Return a [`DeltaTableWriter`] for writing to the normal table corresponding to `delta_table` - /// in the Delta Lake, or a [`ModelarDbStorageError`] if a connection to the Delta Lake cannot - /// be established or the table does not exist. - async fn normal_table_writer(&self, delta_table: DeltaTable) -> Result { - let writer_properties = apache_parquet_writer_properties(None); - DeltaTableWriter::try_new(delta_table, vec![], writer_properties) - } - /// Create a Delta Lake table for a metadata table with `table_name` and `schema` and register /// it in the [`SessionContext`] in the `metadata` schema. If the table already exists, it is /// reused. Return [`ModelarDbStorageError`] if the table cannot be created or cannot be @@ -846,7 +805,7 @@ impl DataFolder { ) -> Result { let delta_table = self.metadata_delta_table(table_name).await?; let record_batch = RecordBatch::try_new(TableProvider::schema(&delta_table), columns)?; - let delta_table_writer = self.normal_table_writer(delta_table).await?; + let delta_table_writer = DeltaTableWriter::try_new_for_normal_table(delta_table)?; self.write_record_batches_to_table(delta_table_writer, vec![record_batch]) .await } From 02572c3a18d9b8c7116a6fe7efffc7bf2ca36f8b Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Thu, 19 Mar 2026 16:44:17 +0100 Subject: [PATCH 26/48] Move write_record_batches_to_table to DeltaTableWriter --- .../src/data_folder/delta_table_writer.rs | 16 ++++++++++++ .../modelardb_storage/src/data_folder/mod.rs | 25 +++++-------------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/delta_table_writer.rs b/crates/modelardb_storage/src/data_folder/delta_table_writer.rs index a0fd445b..89318496 100644 --- a/crates/modelardb_storage/src/data_folder/delta_table_writer.rs +++ b/crates/modelardb_storage/src/data_folder/delta_table_writer.rs @@ -162,6 +162,22 @@ impl DeltaTableWriter { Ok(()) } + /// Write all `record_batches` and commit. If writing fails, roll back all writes and return + /// [`ModelarDbStorageError`]. Returns the updated [`DeltaTable`] if all `record_batches` are + /// written and committed successfully. + pub async fn write_all_and_commit( + mut self, + record_batches: &[RecordBatch], + ) -> Result { + match self.write_all(record_batches).await { + Ok(_) => self.commit().await, + Err(error) => { + self.rollback().await?; + Err(error) + } + } + } + /// Consume the [`DeltaTableWriter`], finish the writing, and commit the files that have been /// written to the log. If an error occurs before the commit is finished, the already written /// files are deleted if possible. Returns a [`ModelarDbStorageError`] if an error occurs when diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index c4066035..75583ab7 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -806,7 +806,9 @@ impl DataFolder { let delta_table = self.metadata_delta_table(table_name).await?; let record_batch = RecordBatch::try_new(TableProvider::schema(&delta_table), columns)?; let delta_table_writer = DeltaTableWriter::try_new_for_normal_table(delta_table)?; - self.write_record_batches_to_table(delta_table_writer, vec![record_batch]) + + delta_table_writer + .write_all_and_commit(&[record_batch]) .await } @@ -819,25 +821,10 @@ impl DataFolder { record_batches: Vec, ) -> Result { let delta_table_writer = self.table_writer(table_name).await?; - self.write_record_batches_to_table(delta_table_writer, record_batches) - .await - } - /// Write `record_batches` to the `delta_table_writer` and commit. Returns an updated - /// [`DeltaTable`] if all `record_batches` are written and committed successfully, otherwise it - /// rolls back all writes done using `delta_table_writer` and returns [`ModelarDbStorageError`]. - async fn write_record_batches_to_table( - &self, - mut delta_table_writer: DeltaTableWriter, - record_batches: Vec, - ) -> Result { - match delta_table_writer.write_all(&record_batches).await { - Ok(_) => delta_table_writer.commit().await, - Err(error) => { - delta_table_writer.rollback().await?; - Err(error) - } - } + delta_table_writer + .write_all_and_commit(&record_batches) + .await } /// Return the [`TimeSeriesTableMetadata`] of each time series table currently in the metadata From 9e3be194535926175a08c123c35244dfe7ad798d Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Thu, 19 Mar 2026 17:19:03 +0100 Subject: [PATCH 27/48] Reordering methods in DataFolder to group related methods --- .../modelardb_storage/src/data_folder/mod.rs | 370 +++++++++--------- 1 file changed, 185 insertions(+), 185 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index 75583ab7..d2b9959d 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -299,6 +299,43 @@ impl DataFolder { Ok(()) } + /// Create a Delta Lake table for a metadata table with `table_name` and `schema` and register + /// it in the [`SessionContext`] in the `metadata` schema. If the table already exists, it is + /// reused. Return [`ModelarDbStorageError`] if the table cannot be created or cannot be + /// registered. + async fn create_and_register_metadata_table( + &self, + table_name: &str, + schema: &Schema, + ) -> Result<()> { + let delta_table = self + .create_delta_lake_table( + table_name, + schema, + &[], + self.location_of_metadata_table(table_name), + SaveMode::Ignore, + ) + .await?; + + let table_reference = TableReference::partial("metadata", table_name); + let metadata_table = Arc::new(NormalTable::new(delta_table, None)); + self.session_context + .register_table(table_reference, metadata_table)?; + + Ok(()) + } + + /// Return the session context used to query the tables using Apache DataFusion. + pub fn session_context(&self) -> &SessionContext { + &self.session_context + } + + /// Return an [`ObjectStore`] to access the root of the Delta Lake. + pub fn object_store(&self) -> Arc { + self.object_store.clone() + } + /// Register all normal tables and time series tables in `self` with its [`SessionContext`]. /// `data_sink` is set as the [`DataSink`] for all of the tables. If the tables could not be /// registered, [`ModelarDbStorageError`] is returned. @@ -330,175 +367,6 @@ impl DataFolder { Ok(()) } - /// Return the session context used to query the tables using Apache DataFusion. - pub fn session_context(&self) -> &SessionContext { - &self.session_context - } - - /// Return an [`ObjectStore`] to access the root of the Delta Lake. - pub fn object_store(&self) -> Arc { - self.object_store.clone() - } - - /// Return a [`DeltaTable`] for manipulating the metadata table with `table_name` in the - /// Delta Lake, or a [`ModelarDbStorageError`] if a connection to the Delta Lake cannot be - /// established or the table does not exist. - async fn metadata_delta_table(&self, table_name: &str) -> Result { - let table_path = self.location_of_metadata_table(table_name); - self.delta_table_from_path(&table_path).await - } - - /// Return a [`DeltaTable`] for manipulating the table with `table_name` in the Delta Lake, or a - /// [`ModelarDbStorageError`] if a connection to the Delta Lake cannot be established or the - /// table does not exist. - pub async fn delta_table(&self, table_name: &str) -> Result { - let table_path = self.location_of_table(table_name); - self.delta_table_from_path(&table_path).await - } - - /// Return a [`DeltaTable`] for manipulating the table at `table_path` in the Delta Lake, or a - /// [`ModelarDbStorageError`] if a connection to the Delta Lake cannot be established or the - /// table does not exist. - async fn delta_table_from_path(&self, table_path: &str) -> Result { - // Use the cache if possible and load to get the latest table data. - if let Some(mut delta_table) = self.delta_table_cache.get_mut(table_path) { - delta_table.load().await?; - Ok(delta_table.clone()) - } else { - // If the table is not in the cache, open it and add it to the cache before returning. - let table_url = deltalake::ensure_table_uri(table_path)?; - let delta_table = - deltalake::open_table_with_storage_options(table_url, self.storage_options.clone()) - .await?; - - self.delta_table_cache - .insert(table_path.to_owned(), delta_table.clone()); - - Ok(delta_table) - } - } - - /// Return `true` if the table with `table_name` is a normal table, otherwise return `false`. - pub async fn is_normal_table(&self, table_name: &str) -> Result { - Ok(self - .normal_table_names() - .await? - .contains(&table_name.to_owned())) - } - - /// Return `true` if the table with `table_name` is a time series table, otherwise return `false`. - pub async fn is_time_series_table(&self, table_name: &str) -> Result { - Ok(self - .time_series_table_names() - .await? - .contains(&table_name.to_owned())) - } - - /// Return the name of each table currently in the Delta Lake. If the table names cannot be - /// retrieved, [`ModelarDbStorageError`] is returned. - pub async fn table_names(&self) -> Result> { - let normal_table_names = self.normal_table_names().await?; - let time_series_table_names = self.time_series_table_names().await?; - - let mut table_names = normal_table_names; - table_names.extend(time_series_table_names); - - Ok(table_names) - } - - /// Return the name of each normal table currently in the Delta Lake. Note that this does not - /// include time series tables. If the normal table names cannot be retrieved, - /// [`ModelarDbStorageError`] is returned. - pub async fn normal_table_names(&self) -> Result> { - self.table_names_of_type(TableType::NormalTable).await - } - - /// Return the schema of the table with the name in `table_name` if it is a normal table. If the - /// table does not exist or the table is not a normal table, return [`None`]. - pub async fn normal_table_schema(&self, table_name: &str) -> Option> { - if self - .is_normal_table(table_name) - .await - .is_ok_and(|is_normal_table| is_normal_table) - { - let schema = self - .delta_table(table_name) - .await - .expect("Delta Lake table should exist if the metadata is in the Delta Lake.") - .schema(); - - Some(schema) - } else { - None - } - } - - /// Return the name of each time series table currently in the Delta Lake. Note that this does - /// not include normal tables. If the time series table names cannot be retrieved, - /// [`ModelarDbStorageError`] is returned. - pub async fn time_series_table_names(&self) -> Result> { - self.table_names_of_type(TableType::TimeSeriesTable).await - } - - /// Return the name of tables of `table_type`. Returns [`ModelarDbStorageError`] if the table - /// names cannot be retrieved. - async fn table_names_of_type(&self, table_type: TableType) -> Result> { - let table_type = match table_type { - TableType::NormalTable => "normal_table", - TableType::TimeSeriesTable => "time_series_table", - }; - - let sql = format!("SELECT table_name FROM metadata.{table_type}_metadata"); - let batch = sql_and_concat(&self.session_context, &sql).await?; - - let table_names = modelardb_types::array!(batch, 0, StringArray); - Ok(table_names.iter().flatten().map(str::to_owned).collect()) - } - - /// Return a [`DeltaTableWriter`] for writing to the table with `table_name` in the Delta Lake, - /// or a [`ModelarDbStorageError`] if a connection to the Delta Lake cannot be established or - /// the table does not exist. - pub async fn table_writer(&self, table_name: &str) -> Result { - let delta_table = self.delta_table(table_name).await?; - if self.is_time_series_table(table_name).await? { - DeltaTableWriter::try_new_for_time_series_table(delta_table) - } else { - DeltaTableWriter::try_new_for_normal_table(delta_table) - } - } - - /// Create a Delta Lake table for a metadata table with `table_name` and `schema` and register - /// it in the [`SessionContext`] in the `metadata` schema. If the table already exists, it is - /// reused. Return [`ModelarDbStorageError`] if the table cannot be created or cannot be - /// registered. - async fn create_and_register_metadata_table( - &self, - table_name: &str, - schema: &Schema, - ) -> Result<()> { - let delta_table = self - .create_delta_lake_table( - table_name, - schema, - &[], - self.location_of_metadata_table(table_name), - SaveMode::Ignore, - ) - .await?; - - let table_reference = TableReference::partial("metadata", table_name); - let metadata_table = Arc::new(NormalTable::new(delta_table, None)); - self.session_context - .register_table(table_reference, metadata_table)?; - - Ok(()) - } - - /// Return the location of the metadata table with `table_name`. - fn location_of_metadata_table(&self, table_name: &str) -> String { - format!("{}/{METADATA_FOLDER}/{table_name}", self.location) - } - /// Create a Delta Lake table for a normal table with `table_name` and `schema` and save the /// table metadata to the `normal_table_metadata` table. If the table already exists or /// the metadata could not be saved, return [`ModelarDbStorageError`], otherwise return @@ -560,11 +428,6 @@ impl DataFolder { Ok(delta_table) } - /// Return the location of the table with `table_name`. - fn location_of_table(&self, table_name: &str) -> String { - format!("{}/{TABLE_FOLDER}/{table_name}", self.location) - } - /// Save the created time series table to the Delta Lake. This includes adding a row to the /// `time_series_table_metadata` table and adding a row to the `time_series_table_field_columns` /// table for each field column. @@ -796,6 +659,33 @@ impl DataFolder { Ok(()) } + /// Return a [`DeltaTableWriter`] for writing to the table with `table_name` in the Delta Lake, + /// or a [`ModelarDbStorageError`] if a connection to the Delta Lake cannot be established or + /// the table does not exist. + pub async fn table_writer(&self, table_name: &str) -> Result { + let delta_table = self.delta_table(table_name).await?; + if self.is_time_series_table(table_name).await? { + DeltaTableWriter::try_new_for_time_series_table(delta_table) + } else { + DeltaTableWriter::try_new_for_normal_table(delta_table) + } + } + + /// Write `record_batches` to the table with `table_name` in the Delta Lake. The correct + /// writer is selected automatically based on the table type. Returns an updated [`DeltaTable`] + /// if the file was written successfully, otherwise returns [`ModelarDbStorageError`]. + pub async fn write_record_batches( + &self, + table_name: &str, + record_batches: Vec, + ) -> Result { + let delta_table_writer = self.table_writer(table_name).await?; + + delta_table_writer + .write_all_and_commit(&record_batches) + .await + } + /// Write `columns` to a Delta Lake table with `table_name`. Returns an updated [`DeltaTable`] /// version if the file was written successfully, otherwise returns [`ModelarDbStorageError`]. async fn write_columns_to_metadata_table( @@ -812,19 +702,129 @@ impl DataFolder { .await } - /// Write `record_batches` to the table with `table_name` in the Delta Lake. The correct - /// writer is selected automatically based on the table type. Returns an updated [`DeltaTable`] - /// if the file was written successfully, otherwise returns [`ModelarDbStorageError`]. - pub async fn write_record_batches( - &self, - table_name: &str, - record_batches: Vec, - ) -> Result { - let delta_table_writer = self.table_writer(table_name).await?; + /// Return a [`DeltaTable`] for manipulating the metadata table with `table_name` in the + /// Delta Lake, or a [`ModelarDbStorageError`] if a connection to the Delta Lake cannot be + /// established or the table does not exist. + async fn metadata_delta_table(&self, table_name: &str) -> Result { + let table_path = self.location_of_metadata_table(table_name); + self.delta_table_from_path(&table_path).await + } - delta_table_writer - .write_all_and_commit(&record_batches) + /// Return a [`DeltaTable`] for manipulating the table with `table_name` in the Delta Lake, or a + /// [`ModelarDbStorageError`] if a connection to the Delta Lake cannot be established or the + /// table does not exist. + pub async fn delta_table(&self, table_name: &str) -> Result { + let table_path = self.location_of_table(table_name); + self.delta_table_from_path(&table_path).await + } + + /// Return a [`DeltaTable`] for manipulating the table at `table_path` in the Delta Lake, or a + /// [`ModelarDbStorageError`] if a connection to the Delta Lake cannot be established or the + /// table does not exist. + async fn delta_table_from_path(&self, table_path: &str) -> Result { + // Use the cache if possible and load to get the latest table data. + if let Some(mut delta_table) = self.delta_table_cache.get_mut(table_path) { + delta_table.load().await?; + Ok(delta_table.clone()) + } else { + // If the table is not in the cache, open it and add it to the cache before returning. + let table_url = deltalake::ensure_table_uri(table_path)?; + let delta_table = + deltalake::open_table_with_storage_options(table_url, self.storage_options.clone()) + .await?; + + self.delta_table_cache + .insert(table_path.to_owned(), delta_table.clone()); + + Ok(delta_table) + } + } + + /// Return `true` if the table with `table_name` is a normal table, otherwise return `false`. + pub async fn is_normal_table(&self, table_name: &str) -> Result { + Ok(self + .normal_table_names() + .await? + .contains(&table_name.to_owned())) + } + + /// Return `true` if the table with `table_name` is a time series table, otherwise return `false`. + pub async fn is_time_series_table(&self, table_name: &str) -> Result { + Ok(self + .time_series_table_names() + .await? + .contains(&table_name.to_owned())) + } + + /// Return the name of each table currently in the Delta Lake. If the table names cannot be + /// retrieved, [`ModelarDbStorageError`] is returned. + pub async fn table_names(&self) -> Result> { + let normal_table_names = self.normal_table_names().await?; + let time_series_table_names = self.time_series_table_names().await?; + + let mut table_names = normal_table_names; + table_names.extend(time_series_table_names); + + Ok(table_names) + } + + /// Return the name of each normal table currently in the Delta Lake. Note that this does not + /// include time series tables. If the normal table names cannot be retrieved, + /// [`ModelarDbStorageError`] is returned. + pub async fn normal_table_names(&self) -> Result> { + self.table_names_of_type(TableType::NormalTable).await + } + + /// Return the name of each time series table currently in the Delta Lake. Note that this does + /// not include normal tables. If the time series table names cannot be retrieved, + /// [`ModelarDbStorageError`] is returned. + pub async fn time_series_table_names(&self) -> Result> { + self.table_names_of_type(TableType::TimeSeriesTable).await + } + + /// Return the name of tables of `table_type`. Returns [`ModelarDbStorageError`] if the table + /// names cannot be retrieved. + async fn table_names_of_type(&self, table_type: TableType) -> Result> { + let table_type = match table_type { + TableType::NormalTable => "normal_table", + TableType::TimeSeriesTable => "time_series_table", + }; + + let sql = format!("SELECT table_name FROM metadata.{table_type}_metadata"); + let batch = sql_and_concat(&self.session_context, &sql).await?; + + let table_names = modelardb_types::array!(batch, 0, StringArray); + Ok(table_names.iter().flatten().map(str::to_owned).collect()) + } + + /// Return the schema of the table with the name in `table_name` if it is a normal table. If the + /// table does not exist or the table is not a normal table, return [`None`]. + pub async fn normal_table_schema(&self, table_name: &str) -> Option> { + if self + .is_normal_table(table_name) .await + .is_ok_and(|is_normal_table| is_normal_table) + { + let schema = self + .delta_table(table_name) + .await + .expect("Delta Lake table should exist if the metadata is in the Delta Lake.") + .schema(); + + Some(schema) + } else { + None + } + } + + /// Return the location of the metadata table with `table_name`. + fn location_of_metadata_table(&self, table_name: &str) -> String { + format!("{}/{METADATA_FOLDER}/{table_name}", self.location) + } + + /// Return the location of the table with `table_name`. + fn location_of_table(&self, table_name: &str) -> String { + format!("{}/{TABLE_FOLDER}/{table_name}", self.location) } /// Return the [`TimeSeriesTableMetadata`] of each time series table currently in the metadata From d1d2945606475a673d32f4b802f3f49eca7695c3 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Thu, 19 Mar 2026 17:24:31 +0100 Subject: [PATCH 28/48] Reorder tests to match new method ordering --- .../modelardb_storage/src/data_folder/mod.rs | 172 +++++++++--------- 1 file changed, 86 insertions(+), 86 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index d2b9959d..d62eec6f 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -1041,82 +1041,6 @@ mod tests { .is_ok()); } - #[tokio::test] - async fn test_normal_table_is_normal_table() { - let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; - assert!(data_folder.is_normal_table("normal_table_1").await.unwrap()); - } - - #[tokio::test] - async fn test_time_series_table_is_not_normal_table() { - let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; - assert!( - !data_folder - .is_normal_table(test::TIME_SERIES_TABLE_NAME) - .await - .unwrap() - ); - } - - #[tokio::test] - async fn test_time_series_table_is_time_series_table() { - let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; - assert!( - data_folder - .is_time_series_table(test::TIME_SERIES_TABLE_NAME) - .await - .unwrap() - ); - } - - #[tokio::test] - async fn test_normal_table_is_not_time_series_table() { - let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; - assert!( - !data_folder - .is_time_series_table("normal_table_1") - .await - .unwrap() - ); - } - - #[tokio::test] - async fn test_table_names() { - let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; - - let time_series_table_metadata = test::time_series_table_metadata(); - data_folder - .create_time_series_table(&time_series_table_metadata) - .await - .unwrap(); - - let table_names = data_folder.table_names().await.unwrap(); - assert_eq!( - table_names, - vec![ - "normal_table_2", - "normal_table_1", - test::TIME_SERIES_TABLE_NAME - ] - ); - } - - #[tokio::test] - async fn test_normal_table_names() { - let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; - - let normal_table_names = data_folder.normal_table_names().await.unwrap(); - assert_eq!(normal_table_names, vec!["normal_table_2", "normal_table_1"]); - } - - #[tokio::test] - async fn test_time_series_table_names() { - let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; - - let time_series_table_names = data_folder.time_series_table_names().await.unwrap(); - assert_eq!(time_series_table_names, vec![test::TIME_SERIES_TABLE_NAME]); - } - #[tokio::test] async fn test_create_normal_table() { let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; @@ -1248,22 +1172,80 @@ mod tests { assert!(data_folder.drop_table("missing_table").await.is_err()); } - async fn create_data_folder_and_create_normal_tables() -> (TempDir, DataFolder) { - let temp_dir = tempfile::tempdir().unwrap(); - let data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + #[tokio::test] + async fn test_normal_table_is_normal_table() { + let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; + assert!(data_folder.is_normal_table("normal_table_1").await.unwrap()); + } - let normal_table_schema = test::normal_table_schema(); - data_folder - .create_normal_table("normal_table_1", &normal_table_schema) - .await - .unwrap(); + #[tokio::test] + async fn test_time_series_table_is_not_normal_table() { + let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; + assert!( + !data_folder + .is_normal_table(test::TIME_SERIES_TABLE_NAME) + .await + .unwrap() + ); + } + #[tokio::test] + async fn test_time_series_table_is_time_series_table() { + let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; + assert!( + data_folder + .is_time_series_table(test::TIME_SERIES_TABLE_NAME) + .await + .unwrap() + ); + } + + #[tokio::test] + async fn test_normal_table_is_not_time_series_table() { + let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; + assert!( + !data_folder + .is_time_series_table("normal_table_1") + .await + .unwrap() + ); + } + + #[tokio::test] + async fn test_table_names() { + let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; + + let time_series_table_metadata = test::time_series_table_metadata(); data_folder - .create_normal_table("normal_table_2", &normal_table_schema) + .create_time_series_table(&time_series_table_metadata) .await .unwrap(); - (temp_dir, data_folder) + let table_names = data_folder.table_names().await.unwrap(); + assert_eq!( + table_names, + vec![ + "normal_table_2", + "normal_table_1", + test::TIME_SERIES_TABLE_NAME + ] + ); + } + + #[tokio::test] + async fn test_normal_table_names() { + let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; + + let normal_table_names = data_folder.normal_table_names().await.unwrap(); + assert_eq!(normal_table_names, vec!["normal_table_2", "normal_table_1"]); + } + + #[tokio::test] + async fn test_time_series_table_names() { + let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; + + let time_series_table_names = data_folder.time_series_table_names().await.unwrap(); + assert_eq!(time_series_table_names, vec![test::TIME_SERIES_TABLE_NAME]); } #[tokio::test] @@ -1391,6 +1373,24 @@ mod tests { ); } + async fn create_data_folder_and_create_normal_tables() -> (TempDir, DataFolder) { + let temp_dir = tempfile::tempdir().unwrap(); + let data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + + let normal_table_schema = test::normal_table_schema(); + data_folder + .create_normal_table("normal_table_1", &normal_table_schema) + .await + .unwrap(); + + data_folder + .create_normal_table("normal_table_2", &normal_table_schema) + .await + .unwrap(); + + (temp_dir, data_folder) + } + async fn create_data_folder_and_create_time_series_table() -> (TempDir, DataFolder) { let temp_dir = tempfile::tempdir().unwrap(); let data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); From f5e7e9dd7f29e1fcbe96ed9f5f7131dacfba8ab7 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Fri, 20 Mar 2026 10:33:01 +0100 Subject: [PATCH 29/48] Add simple tests for open_local_url and open_memory --- .../modelardb_storage/src/data_folder/mod.rs | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index d62eec6f..c06b7fb0 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -1012,6 +1012,43 @@ mod tests { use tempfile::TempDir; // Tests for DataFolder. + #[tokio::test] + async fn test_open_local_url_without_scheme() { + let temp_dir = tempfile::tempdir().unwrap(); + let path = temp_dir.path().to_str().unwrap(); + + let data_folder = DataFolder::open_local_url(path).await; + assert!(data_folder.is_ok()); + } + + #[tokio::test] + async fn test_open_local_url_with_file_scheme() { + let temp_dir = tempfile::tempdir().unwrap(); + let path = temp_dir.path().to_str().unwrap(); + let url = format!("file://{path}"); + + let data_folder = DataFolder::open_local_url(&url).await; + assert!(data_folder.is_ok()); + } + + #[tokio::test] + async fn test_open_local_url_with_memory_scheme() { + let data_folder = DataFolder::open_local_url("memory://").await; + assert!(data_folder.is_ok()); + } + + #[tokio::test] + async fn test_open_local_url_with_invalid_scheme() { + let data_folder = DataFolder::open_local_url("invalid://path").await; + assert!(data_folder.is_err()); + } + + #[tokio::test] + async fn test_open_memory() { + let data_folder = DataFolder::open_memory().await; + assert!(data_folder.is_ok()); + } + #[tokio::test] async fn test_create_metadata_data_folder_tables() { let temp_dir = tempfile::tempdir().unwrap(); From 02aca5e7171e4ec1e7a5865b81a543935ab1a33e Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Fri, 20 Mar 2026 10:46:22 +0100 Subject: [PATCH 30/48] Add tests for duplicates and uint error checks --- .../modelardb_storage/src/data_folder/mod.rs | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index c06b7fb0..f8018d1c 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -1096,6 +1096,20 @@ mod tests { ); } + #[tokio::test] + async fn test_create_existing_normal_table() { + let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; + + let result = data_folder + .create_normal_table("normal_table_1", &test::normal_table_schema()) + .await; + + assert_eq!( + result.unwrap_err().to_string(), + "Delta Lake Error: Generic error: A Delta Lake table already exists at that location." + ); + } + #[tokio::test] async fn test_create_time_series_table() { let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; @@ -1152,6 +1166,41 @@ mod tests { ); } + #[tokio::test] + async fn test_create_existing_time_series_table() { + let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; + + let result = data_folder + .create_time_series_table(&test::time_series_table_metadata()) + .await; + + assert_eq!( + result.unwrap_err().to_string(), + "Delta Lake Error: Generic error: A Delta Lake table already exists at that location." + ); + } + + #[tokio::test] + async fn test_create_table_with_unsigned_integers() { + let temp_dir = tempfile::tempdir().unwrap(); + let data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + + let schema = Schema::new(vec![ + Field::new("id", DataType::UInt32, false), + Field::new("name", DataType::Utf8, false), + ]); + + let result = data_folder + .create_normal_table("uint_table", &schema) + .await; + + assert_eq!( + result.unwrap_err().to_string(), + "Delta Lake Error: Data does not match the schema or partitions of the table: \ + Unsigned integers are not supported." + ); + } + #[tokio::test] async fn test_drop_normal_table() { let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; From b19af7963a7c9db75e21d78be20c13259ec28d27 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Fri, 20 Mar 2026 10:49:42 +0100 Subject: [PATCH 31/48] Check for specific error message in test_open_local_url_with_invalid_scheme --- crates/modelardb_storage/src/data_folder/mod.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index f8018d1c..9b877cec 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -1039,8 +1039,13 @@ mod tests { #[tokio::test] async fn test_open_local_url_with_invalid_scheme() { - let data_folder = DataFolder::open_local_url("invalid://path").await; - assert!(data_folder.is_err()); + let url = "invalid://path"; + let result = DataFolder::open_local_url(url).await; + + assert_eq!( + result.err().unwrap().to_string(), + format!("Invalid Argument Error: {url} is not a valid local URL.") + ); } #[tokio::test] @@ -1190,9 +1195,7 @@ mod tests { Field::new("name", DataType::Utf8, false), ]); - let result = data_folder - .create_normal_table("uint_table", &schema) - .await; + let result = data_folder.create_normal_table("uint_table", &schema).await; assert_eq!( result.unwrap_err().to_string(), From f748fc49d44d28c241837f0b731cd4bc36ee1574 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Fri, 20 Mar 2026 11:04:31 +0100 Subject: [PATCH 32/48] Check for specific error message in other DataFolder tests --- crates/modelardb_storage/src/data_folder/mod.rs | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index 9b877cec..69333d68 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -1258,7 +1258,13 @@ mod tests { async fn test_drop_missing_table() { let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; - assert!(data_folder.drop_table("missing_table").await.is_err()); + let result = data_folder.drop_table("missing_table").await; + + assert_eq!( + result.unwrap_err().to_string(), + "Invalid Argument Error: Table with name 'missing_table' does not exist." + ); + } } #[tokio::test] @@ -1368,11 +1374,14 @@ mod tests { async fn test_time_series_table_metadata_for_missing_time_series_table() { let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; - let time_series_table_metadata = data_folder + let result = data_folder .time_series_table_metadata_for_time_series_table("missing_table") .await; - assert!(time_series_table_metadata.is_err()); + assert_eq!( + result.unwrap_err().to_string(), + "Invalid Argument Error: No metadata for time series table named 'missing_table'." + ); } #[tokio::test] From 5e2a29005f88633097cbcebcfe2fe7df7c8e64f4 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Fri, 20 Mar 2026 11:11:00 +0100 Subject: [PATCH 33/48] Add tests for truncate_table --- .../modelardb_storage/src/data_folder/mod.rs | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index 69333d68..01929709 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -1265,6 +1265,57 @@ mod tests { "Invalid Argument Error: Table with name 'missing_table' does not exist." ); } + + #[tokio::test] + async fn test_truncate_normal_table() { + let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; + + let mut delta_table = data_folder + .write_record_batches("normal_table_1", vec![test::normal_table_record_batch()]) + .await + .unwrap(); + + assert_eq!(delta_table.get_file_uris().unwrap().count(), 1); + + data_folder.truncate_table("normal_table_1").await.unwrap(); + + delta_table.load().await.unwrap(); + assert_eq!(delta_table.get_file_uris().unwrap().count(), 0); + } + + #[tokio::test] + async fn test_truncate_time_series_table() { + let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; + + let mut delta_table = data_folder + .write_record_batches( + test::TIME_SERIES_TABLE_NAME, + vec![test::compressed_segments_record_batch()], + ) + .await + .unwrap(); + + assert_eq!(delta_table.get_file_uris().unwrap().count(), 1); + + data_folder + .truncate_table(test::TIME_SERIES_TABLE_NAME) + .await + .unwrap(); + + delta_table.load().await.unwrap(); + assert_eq!(delta_table.get_file_uris().unwrap().count(), 0); + } + + #[tokio::test] + async fn test_truncate_missing_table() { + let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; + + let result = data_folder.truncate_table("missing_table").await; + + assert_eq!( + result.unwrap_err().to_string(), + "Delta Lake Error: Not a Delta table: Generic delta kernel error: No files in log segment" + ); } #[tokio::test] From 03dd063ec5b68e5581aa59af34a14ae440553952 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Fri, 20 Mar 2026 11:22:11 +0100 Subject: [PATCH 34/48] Add tests for vacuum_table --- .../modelardb_storage/src/data_folder/mod.rs | 119 +++++++++++++++++- 1 file changed, 117 insertions(+), 2 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index 01929709..566f780f 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -19,9 +19,9 @@ pub mod cluster; pub mod delta_table_writer; use std::collections::HashMap; +use std::env; use std::path::Path as StdPath; use std::sync::Arc; -use std::{env, fs}; use arrow::array::{ ArrayRef, ArrowPrimitiveType, BinaryArray, BooleanArray, Float32Array, Int16Array, RecordBatch, @@ -113,7 +113,7 @@ impl DataFolder { /// the metadata tables cannot be created. pub async fn open_local(data_folder_path: &StdPath) -> Result { // Ensure the directories in the path exists as LocalFileSystem otherwise returns an error. - fs::create_dir_all(data_folder_path) + std::fs::create_dir_all(data_folder_path) .map_err(|error| DeltaTableError::generic(error.to_string()))?; // Use with_automatic_cleanup to ensure empty directories are deleted automatically. @@ -1318,6 +1318,121 @@ mod tests { ); } + #[tokio::test] + async fn test_vacuum_normal_table() { + let (temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; + + data_folder + .write_record_batches("normal_table_1", vec![test::normal_table_record_batch()]) + .await + .unwrap(); + + data_folder.truncate_table("normal_table_1").await.unwrap(); + + // The stale Parquet file should still exist on disk alongside the _delta_log folder. + let table_path = format!( + "{}/tables/normal_table_1", + temp_dir.path().to_str().unwrap() + ); + assert_eq!(std::fs::read_dir(&table_path).unwrap().count(), 2); + + data_folder + .vacuum_table("normal_table_1", Some(0)) + .await + .unwrap(); + + // Only the _delta_log folder should remain. + assert_eq!(std::fs::read_dir(&table_path).unwrap().count(), 1); + } + + #[tokio::test] + async fn test_vacuum_time_series_table() { + let (temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; + + data_folder + .write_record_batches( + test::TIME_SERIES_TABLE_NAME, + vec![test::compressed_segments_record_batch()], + ) + .await + .unwrap(); + + data_folder + .truncate_table(test::TIME_SERIES_TABLE_NAME) + .await + .unwrap(); + + // The stale Parquet file should still exist in the partition folder. + let column_path = format!( + "{}/tables/{}/field_column=0", + temp_dir.path().to_str().unwrap(), + test::TIME_SERIES_TABLE_NAME + ); + assert_eq!(std::fs::read_dir(&column_path).unwrap().count(), 1); + + data_folder + .vacuum_table(test::TIME_SERIES_TABLE_NAME, Some(0)) + .await + .unwrap(); + + // The stale Parquet file should have been removed. + assert_eq!(std::fs::read_dir(&column_path).unwrap().count(), 0); + } + + #[tokio::test] + async fn test_vacuum_table_with_default_retention_period() { + let (temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; + + data_folder + .write_record_batches("normal_table_1", vec![test::normal_table_record_batch()]) + .await + .unwrap(); + + data_folder.truncate_table("normal_table_1").await.unwrap(); + + // Vacuum with None uses the default 7-day retention period, so the recently + // truncated file is not yet stale and should still exist on disk. + data_folder + .vacuum_table("normal_table_1", None) + .await + .unwrap(); + + let table_path = format!( + "{}/tables/normal_table_1", + temp_dir.path().to_str().unwrap() + ); + assert_eq!(std::fs::read_dir(&table_path).unwrap().count(), 2); + } + + #[tokio::test] + async fn test_vacuum_table_with_out_of_bounds_retention_period() { + let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; + + let result = data_folder + .vacuum_table("normal_table_1", Some(MAX_RETENTION_PERIOD_IN_SECONDS + 1)) + .await; + + assert_eq!( + result.unwrap_err().to_string(), + format!( + "Invalid Argument Error: \ + Retention period cannot be more than {MAX_RETENTION_PERIOD_IN_SECONDS} seconds." + ) + ); + } + + #[tokio::test] + async fn test_vacuum_missing_table() { + let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; + + let result = data_folder.vacuum_table("missing_table", None).await; + + assert_eq!( + result.unwrap_err().to_string(), + "Delta Lake Error: Not a Delta table: Generic delta kernel error: No files in log segment" + ); + } + #[tokio::test] async fn test_normal_table_is_normal_table() { let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; From 855fece586f0481233c75ef75955b98d3fbc2704 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Fri, 20 Mar 2026 11:48:36 +0100 Subject: [PATCH 35/48] Add tests for write_record_batches --- .../modelardb_storage/src/data_folder/mod.rs | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index 566f780f..d13940c7 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -1433,6 +1433,110 @@ mod tests { ); } + #[tokio::test] + async fn test_write_record_batches_to_normal_table() { + let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; + + let batch_to_write = test::normal_table_record_batch(); + let delta_table = data_folder + .write_record_batches("normal_table_1", vec![batch_to_write.clone()]) + .await + .unwrap(); + + // Verify the write produced a Parquet file. + assert_eq!(delta_table.get_file_uris().unwrap().count(), 1); + + // Read the data back and verify the content. + data_folder + .session_context + .register_table("normal_table_1", Arc::new(delta_table)) + .unwrap(); + + let read_batch = + sql_and_concat(&data_folder.session_context, "SELECT * FROM normal_table_1") + .await + .unwrap(); + + assert_eq!(read_batch, batch_to_write); + } + + #[tokio::test] + async fn test_write_record_batches_to_time_series_table() { + let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; + + let batch_to_write = test::compressed_segments_record_batch(); + let delta_table = data_folder + .write_record_batches(test::TIME_SERIES_TABLE_NAME, vec![batch_to_write.clone()]) + .await + .unwrap(); + + // Verify the write produced a Parquet file. + assert_eq!(delta_table.get_file_uris().unwrap().count(), 1); + + // Read the data back and verify the content. The partition column (field_column) is + // moved to the end by Delta Lake, so SELECT the columns in the original schema order. + let schema = test::time_series_table_metadata().compressed_schema.clone(); + let column_names: Vec<&str> = schema.fields().iter().map(|f| f.name().as_str()).collect(); + + data_folder + .session_context + .register_table(test::TIME_SERIES_TABLE_NAME, Arc::new(delta_table)) + .unwrap(); + + let read_batch = sql_and_concat( + &data_folder.session_context, + &format!( + "SELECT {} FROM {}", + column_names.join(", "), + test::TIME_SERIES_TABLE_NAME + ), + ) + .await + .unwrap(); + + assert_eq!(read_batch, batch_to_write); + } + + #[tokio::test] + async fn test_write_empty_vec_to_table() { + let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; + + let delta_table = data_folder + .write_record_batches("normal_table_1", vec![]) + .await + .unwrap(); + + assert_eq!(delta_table.get_file_uris().unwrap().count(), 0); + } + + #[tokio::test] + async fn test_write_empty_record_batch_to_table() { + let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; + + let empty_batch = RecordBatch::new_empty(Arc::new(test::normal_table_schema())); + let delta_table = data_folder + .write_record_batches("normal_table_1", vec![empty_batch]) + .await + .unwrap(); + + assert_eq!(delta_table.get_file_uris().unwrap().count(), 0); + } + + #[tokio::test] + async fn test_write_record_batches_to_missing_table() { + let temp_dir = tempfile::tempdir().unwrap(); + let data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + + let result = data_folder + .write_record_batches("missing_table", vec![test::normal_table_record_batch()]) + .await; + + assert_eq!( + result.unwrap_err().to_string(), + "Delta Lake Error: Not a Delta table: Generic delta kernel error: No files in log segment" + ); + } + #[tokio::test] async fn test_normal_table_is_normal_table() { let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; From f970886b0642cb6c54fcb7b8afb772bdcf11d3a7 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Fri, 20 Mar 2026 11:55:27 +0100 Subject: [PATCH 36/48] Add tests for normal_table_schema() --- .../modelardb_storage/src/data_folder/mod.rs | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index d13940c7..b902bb68 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -1613,6 +1613,27 @@ mod tests { assert_eq!(time_series_table_names, vec![test::TIME_SERIES_TABLE_NAME]); } + #[tokio::test] + async fn test_normal_table_schema_for_existing_normal_table() { + let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; + + let schema = data_folder + .normal_table_schema("normal_table_1") + .await + .unwrap(); + + assert_eq!(schema.as_ref(), &test::normal_table_schema()); + } + + #[tokio::test] + async fn test_normal_table_schema_for_missing_table() { + let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; + + let maybe_schema = data_folder.normal_table_schema("missing_table").await; + + assert!(maybe_schema.is_none()); + } + #[tokio::test] async fn test_time_series_table_metadata() { let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; From bcf1c23077cb020be4584ea3d99549713ffed423 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Fri, 20 Mar 2026 12:17:11 +0100 Subject: [PATCH 37/48] Move NoOpDataSink to modelardb_test crate --- Cargo.lock | 2 + .../src/optimizer/model_simple_aggregates.rs | 49 +------------------ crates/modelardb_test/Cargo.toml | 2 + crates/modelardb_test/src/table.rs | 46 +++++++++++++++++ 4 files changed, 52 insertions(+), 47 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 899f7205..aced4a3f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3403,6 +3403,8 @@ name = "modelardb_test" version = "0.1.0" dependencies = [ "arrow", + "async-trait", + "datafusion", "modelardb_types", "rand 0.9.2", ] diff --git a/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs b/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs index 7b408227..8fd4bba6 100644 --- a/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs +++ b/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs @@ -621,64 +621,19 @@ impl Accumulator for ModelAvgAccumulator { mod tests { use super::*; - use std::any::{Any, TypeId}; - use std::fmt::{Debug, Formatter, Result as FmtResult}; + use std::any::TypeId; - use datafusion::arrow::datatypes::Schema; - use datafusion::datasource::sink::DataSink; - use datafusion::execution::{SendableRecordBatchStream, TaskContext}; use datafusion::physical_plan::aggregates::AggregateExec; use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion::physical_plan::filter::FilterExec; - use datafusion::physical_plan::metrics::MetricsSet; - use datafusion::physical_plan::{DisplayAs, DisplayFormatType}; - use modelardb_test::table::{self, TIME_SERIES_TABLE_NAME}; + use modelardb_test::table::{self, NoOpDataSink, TIME_SERIES_TABLE_NAME}; use tempfile::TempDir; - use tonic::async_trait; use crate::data_folder::DataFolder; use crate::query::grid_exec::GridExec; use crate::query::time_series_table::TimeSeriesTable; - // DataSink for testing. - struct NoOpDataSink {} - - #[async_trait] - impl DataSink for NoOpDataSink { - fn as_any(&self) -> &dyn Any { - unimplemented!(); - } - - fn schema(&self) -> &Arc { - unimplemented!(); - } - - fn metrics(&self) -> Option { - unimplemented!(); - } - - async fn write_all( - &self, - _data: SendableRecordBatchStream, - _context: &Arc, - ) -> DataFusionResult { - unimplemented!(); - } - } - - impl Debug for NoOpDataSink { - fn fmt(&self, _f: &mut Formatter<'_>) -> FmtResult { - unimplemented!(); - } - } - - impl DisplayAs for NoOpDataSink { - fn fmt_as(&self, _t: DisplayFormatType, _f: &mut Formatter<'_>) -> FmtResult { - unimplemented!(); - } - } - // Tests for ModelSimpleAggregates. #[tokio::test] async fn test_rewrite_aggregate_on_one_column_without_predicates() { diff --git a/crates/modelardb_test/Cargo.toml b/crates/modelardb_test/Cargo.toml index 368cb4ef..2ddc5065 100644 --- a/crates/modelardb_test/Cargo.toml +++ b/crates/modelardb_test/Cargo.toml @@ -21,5 +21,7 @@ authors = ["Soeren Kejser Jensen "] [dependencies] arrow.workspace = true +async-trait.workspace = true +datafusion.workspace = true modelardb_types = { path = "../modelardb_types" } rand.workspace = true diff --git a/crates/modelardb_test/src/table.rs b/crates/modelardb_test/src/table.rs index 56fccb6e..3c2a0f68 100644 --- a/crates/modelardb_test/src/table.rs +++ b/crates/modelardb_test/src/table.rs @@ -15,10 +15,18 @@ //! Implementation of table related functions and constants used throughout ModelarDB for testing purposes. +use std::any::Any; +use std::fmt::{Debug, Formatter, Result as FmtResult}; use std::sync::Arc; use arrow::array::{BinaryArray, Float32Array, Int8Array, Int16Array, RecordBatch, StringArray}; use arrow::datatypes::{ArrowPrimitiveType, DataType, Field, Schema}; +use async_trait::async_trait; +use datafusion::datasource::sink::DataSink; +use datafusion::error::Result as DataFusionResult; +use datafusion::execution::{SendableRecordBatchStream, TaskContext}; +use datafusion::physical_plan::metrics::MetricsSet; +use datafusion::physical_plan::{DisplayAs, DisplayFormatType}; use modelardb_types::types::{ ArrowTimestamp, ArrowValue, ErrorBound, TimeSeriesTableMetadata, Timestamp, TimestampArray, Value, ValueArray, @@ -39,6 +47,44 @@ pub const TIME_SERIES_TABLE_SQL: &str = "CREATE TIME SERIES TABLE time_series_ta /// Name of the time series table used in tests. pub const TIME_SERIES_TABLE_NAME: &str = "time_series_table"; +/// [`DataSink`] implementation that does nothing. +pub struct NoOpDataSink {} + +#[async_trait] +impl DataSink for NoOpDataSink { + fn as_any(&self) -> &dyn Any { + unimplemented!(); + } + + fn metrics(&self) -> Option { + unimplemented!(); + } + + fn schema(&self) -> &Arc { + unimplemented!(); + } + + async fn write_all( + &self, + _data: SendableRecordBatchStream, + _context: &Arc, + ) -> DataFusionResult { + unimplemented!(); + } +} + +impl Debug for NoOpDataSink { + fn fmt(&self, _f: &mut Formatter<'_>) -> FmtResult { + unimplemented!(); + } +} + +impl DisplayAs for NoOpDataSink { + fn fmt_as(&self, _t: DisplayFormatType, _f: &mut Formatter<'_>) -> FmtResult { + unimplemented!(); + } +} + /// Return protobuf message bytes containing metadata for a normal table. pub fn normal_table_metadata_protobuf_bytes() -> Vec { modelardb_types::flight::encode_and_serialize_normal_table_metadata( From 2939ceb199e160ffdb4714791b9425504572b6ee Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Fri, 20 Mar 2026 12:20:16 +0100 Subject: [PATCH 38/48] Add test for register_tables() --- .../modelardb_storage/src/data_folder/mod.rs | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index b902bb68..2fd85ae5 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -1008,6 +1008,7 @@ mod tests { use datafusion::common::ScalarValue::Int64; use datafusion::logical_expr::Expr::Literal; use modelardb_test::table as test; + use modelardb_test::table::NoOpDataSink; use modelardb_types::types::ArrowTimestamp; use tempfile::TempDir; @@ -1083,6 +1084,37 @@ mod tests { .is_ok()); } + #[tokio::test] + async fn test_register_tables() { + let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; + + data_folder + .create_time_series_table(&test::time_series_table_metadata()) + .await + .unwrap(); + + data_folder + .register_tables(Arc::new(NoOpDataSink {})) + .await + .unwrap(); + + // Verify the tables are queryable through the session context. + assert!( + data_folder + .session_context + .table_provider("normal_table_1") + .await + .is_ok() + ); + assert!( + data_folder + .session_context + .table_provider(test::TIME_SERIES_TABLE_NAME) + .await + .is_ok() + ); + } + #[tokio::test] async fn test_create_normal_table() { let (_temp_dir, data_folder) = create_data_folder_and_create_normal_tables().await; From d48939c3b6d18dacb4e7aaf0840082f5950f5c5a Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Fri, 20 Mar 2026 12:24:29 +0100 Subject: [PATCH 39/48] Add tests for time_series_table_metadata_for_registered_time_series_table --- .../modelardb_storage/src/data_folder/mod.rs | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index 2fd85ae5..5aac8195 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -1707,6 +1707,38 @@ mod tests { ); } + #[tokio::test] + async fn test_time_series_table_metadata_for_registered_time_series_table() { + let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; + + data_folder + .register_tables(Arc::new(NoOpDataSink {})) + .await + .unwrap(); + + let metadata = data_folder + .time_series_table_metadata_for_registered_time_series_table( + test::TIME_SERIES_TABLE_NAME, + ) + .await; + + assert_eq!( + metadata.unwrap().as_ref(), + &test::time_series_table_metadata(), + ); + } + + #[tokio::test] + async fn test_time_series_table_metadata_for_unregistered_time_series_table() { + let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; + + let metadata = data_folder + .time_series_table_metadata_for_registered_time_series_table("missing_table") + .await; + + assert!(metadata.is_none()); + } + #[tokio::test] async fn test_error_bound() { let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; From 9b05d2241a4571516d4160754c8cda812e4a5eb4 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Fri, 20 Mar 2026 13:23:24 +0100 Subject: [PATCH 40/48] Add tests to DeltaTableWriter --- .../src/data_folder/delta_table_writer.rs | 166 ++++++++++++++++++ 1 file changed, 166 insertions(+) diff --git a/crates/modelardb_storage/src/data_folder/delta_table_writer.rs b/crates/modelardb_storage/src/data_folder/delta_table_writer.rs index 89318496..a3cc9cff 100644 --- a/crates/modelardb_storage/src/data_folder/delta_table_writer.rs +++ b/crates/modelardb_storage/src/data_folder/delta_table_writer.rs @@ -248,3 +248,169 @@ async fn delete_added_files(object_store: &dyn ObjectStore, added_files: Vec (TempDir, DataFolder) { + let temp_dir = tempfile::tempdir().unwrap(); + let data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + + data_folder + .create_normal_table(NORMAL_TABLE_NAME, &test::normal_table_schema()) + .await + .unwrap(); + + (temp_dir, data_folder) + } + + async fn create_data_folder_with_time_series_table() -> (TempDir, DataFolder) { + let temp_dir = tempfile::tempdir().unwrap(); + let data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + + data_folder + .create_time_series_table(&test::time_series_table_metadata()) + .await + .unwrap(); + + (temp_dir, data_folder) + } +} From 5503c2131ef7b4d844d57d39c4cd888768c61415 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Fri, 20 Mar 2026 13:37:46 +0100 Subject: [PATCH 41/48] Add test for empty write and better assertions --- .../src/data_folder/delta_table_writer.rs | 63 ++++++++++++++++++- 1 file changed, 60 insertions(+), 3 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/delta_table_writer.rs b/crates/modelardb_storage/src/data_folder/delta_table_writer.rs index a3cc9cff..d0a94557 100644 --- a/crates/modelardb_storage/src/data_folder/delta_table_writer.rs +++ b/crates/modelardb_storage/src/data_folder/delta_table_writer.rs @@ -258,6 +258,7 @@ mod tests { use tempfile::TempDir; use crate::data_folder::DataFolder; + use crate::sql_and_concat; // Tests for DeltaTableWriter. #[tokio::test] @@ -290,6 +291,19 @@ mod tests { assert_eq!(delta_table.get_file_uris().unwrap().count(), 1); } + #[tokio::test] + async fn test_write_empty_record_batch() { + let (_temp_dir, data_folder) = create_data_folder_with_normal_table().await; + let delta_table = data_folder.delta_table(NORMAL_TABLE_NAME).await.unwrap(); + let mut writer = DeltaTableWriter::try_new_for_normal_table(delta_table).unwrap(); + + let empty_batch = RecordBatch::new_empty(Arc::new(test::normal_table_schema())); + writer.write(&empty_batch).await.unwrap(); + + let delta_table = writer.commit().await.unwrap(); + assert_eq!(delta_table.get_file_uris().unwrap().count(), 0); + } + #[tokio::test] async fn test_write_with_schema_mismatch() { let (_temp_dir, data_folder) = create_data_folder_with_normal_table().await; @@ -321,6 +335,21 @@ mod tests { .unwrap(); assert_eq!(delta_table.get_file_uris().unwrap().count(), 1); + + // Verify both batches were written. + data_folder + .session_context() + .register_table(NORMAL_TABLE_NAME, Arc::new(delta_table)) + .unwrap(); + + let result = sql_and_concat( + data_folder.session_context(), + &format!("SELECT * FROM {NORMAL_TABLE_NAME}"), + ) + .await + .unwrap(); + + assert_eq!(result.num_rows(), 10); } #[tokio::test] @@ -339,11 +368,26 @@ mod tests { .unwrap(); assert_eq!(delta_table.get_file_uris().unwrap().count(), 1); + + // Verify both batches were written. + data_folder + .session_context() + .register_table(test::TIME_SERIES_TABLE_NAME, Arc::new(delta_table)) + .unwrap(); + + let result = sql_and_concat( + data_folder.session_context(), + &format!("SELECT * FROM {}", test::TIME_SERIES_TABLE_NAME), + ) + .await + .unwrap(); + + assert_eq!(result.num_rows(), 6); } #[tokio::test] async fn test_write_all_and_commit_rolls_back_on_error() { - let (_temp_dir, data_folder) = create_data_folder_with_normal_table().await; + let (temp_dir, data_folder) = create_data_folder_with_normal_table().await; let delta_table = data_folder.delta_table(NORMAL_TABLE_NAME).await.unwrap(); let writer = DeltaTableWriter::try_new_for_normal_table(delta_table).unwrap(); @@ -360,9 +404,15 @@ mod tests { .starts_with("Delta Lake Error: Attempted to write invalid data to the table:") ); - // Verify the rollback left no committed files. let delta_table = data_folder.delta_table(NORMAL_TABLE_NAME).await.unwrap(); assert_eq!(delta_table.get_file_uris().unwrap().count(), 0); + + // Verify the physical files were cleaned up. Only _delta_log should remain. + let table_path = format!( + "{}/tables/{NORMAL_TABLE_NAME}", + temp_dir.path().to_str().unwrap() + ); + assert_eq!(std::fs::read_dir(&table_path).unwrap().count(), 1); } #[tokio::test] @@ -377,7 +427,7 @@ mod tests { #[tokio::test] async fn test_rollback() { - let (_temp_dir, data_folder) = create_data_folder_with_normal_table().await; + let (temp_dir, data_folder) = create_data_folder_with_normal_table().await; let delta_table = data_folder.delta_table(NORMAL_TABLE_NAME).await.unwrap(); let mut writer = DeltaTableWriter::try_new_for_normal_table(delta_table).unwrap(); @@ -388,6 +438,13 @@ mod tests { let delta_table = writer.rollback().await.unwrap(); assert_eq!(delta_table.get_file_uris().unwrap().count(), 0); + + // Verify the physical files were cleaned up. Only _delta_log should remain. + let table_path = format!( + "{}/tables/{NORMAL_TABLE_NAME}", + temp_dir.path().to_str().unwrap() + ); + assert_eq!(std::fs::read_dir(&table_path).unwrap().count(), 1); } async fn create_data_folder_with_normal_table() -> (TempDir, DataFolder) { From 3445eca15203906081478d203cc7cdbc60f4e7d0 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Fri, 20 Mar 2026 13:45:54 +0100 Subject: [PATCH 42/48] Add test utility function to create_normal_table_writer --- .../src/data_folder/delta_table_writer.rs | 36 ++++++++----------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/delta_table_writer.rs b/crates/modelardb_storage/src/data_folder/delta_table_writer.rs index d0a94557..0d7c62cf 100644 --- a/crates/modelardb_storage/src/data_folder/delta_table_writer.rs +++ b/crates/modelardb_storage/src/data_folder/delta_table_writer.rs @@ -278,9 +278,7 @@ mod tests { #[tokio::test] async fn test_write_and_commit() { - let (_temp_dir, data_folder) = create_data_folder_with_normal_table().await; - let delta_table = data_folder.delta_table(NORMAL_TABLE_NAME).await.unwrap(); - let mut writer = DeltaTableWriter::try_new_for_normal_table(delta_table).unwrap(); + let (_temp_dir, _data_folder, mut writer) = create_normal_table_writer().await; writer .write(&test::normal_table_record_batch()) @@ -293,9 +291,7 @@ mod tests { #[tokio::test] async fn test_write_empty_record_batch() { - let (_temp_dir, data_folder) = create_data_folder_with_normal_table().await; - let delta_table = data_folder.delta_table(NORMAL_TABLE_NAME).await.unwrap(); - let mut writer = DeltaTableWriter::try_new_for_normal_table(delta_table).unwrap(); + let (_temp_dir, _data_folder, mut writer) = create_normal_table_writer().await; let empty_batch = RecordBatch::new_empty(Arc::new(test::normal_table_schema())); writer.write(&empty_batch).await.unwrap(); @@ -306,9 +302,7 @@ mod tests { #[tokio::test] async fn test_write_with_schema_mismatch() { - let (_temp_dir, data_folder) = create_data_folder_with_normal_table().await; - let delta_table = data_folder.delta_table(NORMAL_TABLE_NAME).await.unwrap(); - let mut writer = DeltaTableWriter::try_new_for_normal_table(delta_table).unwrap(); + let (_temp_dir, _data_folder, mut writer) = create_normal_table_writer().await; let result = writer .write(&test::compressed_segments_record_batch()) @@ -324,9 +318,7 @@ mod tests { #[tokio::test] async fn test_write_all_and_commit_to_normal_table() { - let (_temp_dir, data_folder) = create_data_folder_with_normal_table().await; - let delta_table = data_folder.delta_table(NORMAL_TABLE_NAME).await.unwrap(); - let writer = DeltaTableWriter::try_new_for_normal_table(delta_table).unwrap(); + let (_temp_dir, data_folder, writer) = create_normal_table_writer().await; let batch = test::normal_table_record_batch(); let delta_table = writer @@ -387,9 +379,7 @@ mod tests { #[tokio::test] async fn test_write_all_and_commit_rolls_back_on_error() { - let (temp_dir, data_folder) = create_data_folder_with_normal_table().await; - let delta_table = data_folder.delta_table(NORMAL_TABLE_NAME).await.unwrap(); - let writer = DeltaTableWriter::try_new_for_normal_table(delta_table).unwrap(); + let (temp_dir, data_folder, writer) = create_normal_table_writer().await; let valid_batch = test::normal_table_record_batch(); let invalid_batch = test::compressed_segments_record_batch(); @@ -417,9 +407,7 @@ mod tests { #[tokio::test] async fn test_commit_without_writes() { - let (_temp_dir, data_folder) = create_data_folder_with_normal_table().await; - let delta_table = data_folder.delta_table(NORMAL_TABLE_NAME).await.unwrap(); - let writer = DeltaTableWriter::try_new_for_normal_table(delta_table).unwrap(); + let (_temp_dir, _data_folder, writer) = create_normal_table_writer().await; let delta_table = writer.commit().await.unwrap(); assert_eq!(delta_table.get_file_uris().unwrap().count(), 0); @@ -427,9 +415,7 @@ mod tests { #[tokio::test] async fn test_rollback() { - let (temp_dir, data_folder) = create_data_folder_with_normal_table().await; - let delta_table = data_folder.delta_table(NORMAL_TABLE_NAME).await.unwrap(); - let mut writer = DeltaTableWriter::try_new_for_normal_table(delta_table).unwrap(); + let (temp_dir, _data_folder, mut writer) = create_normal_table_writer().await; writer .write(&test::normal_table_record_batch()) @@ -447,6 +433,14 @@ mod tests { assert_eq!(std::fs::read_dir(&table_path).unwrap().count(), 1); } + async fn create_normal_table_writer() -> (TempDir, DataFolder, DeltaTableWriter) { + let (temp_dir, data_folder) = create_data_folder_with_normal_table().await; + let delta_table = data_folder.delta_table(NORMAL_TABLE_NAME).await.unwrap(); + let writer = DeltaTableWriter::try_new_for_normal_table(delta_table).unwrap(); + + (temp_dir, data_folder, writer) + } + async fn create_data_folder_with_normal_table() -> (TempDir, DataFolder) { let temp_dir = tempfile::tempdir().unwrap(); let data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); From b98447d4f475823a115a0525fcf58b5088a06ab8 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Fri, 20 Mar 2026 14:02:06 +0100 Subject: [PATCH 43/48] Use time series tables in the DeltaTableWriter tests --- .../src/data_folder/delta_table_writer.rs | 108 ++++++------------ 1 file changed, 36 insertions(+), 72 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/delta_table_writer.rs b/crates/modelardb_storage/src/data_folder/delta_table_writer.rs index 0d7c62cf..a3073be0 100644 --- a/crates/modelardb_storage/src/data_folder/delta_table_writer.rs +++ b/crates/modelardb_storage/src/data_folder/delta_table_writer.rs @@ -254,7 +254,7 @@ mod tests { use super::*; use modelardb_test::table as test; - use modelardb_test::table::NORMAL_TABLE_NAME; + use modelardb_test::table::{NORMAL_TABLE_NAME, TIME_SERIES_TABLE_NAME}; use tempfile::TempDir; use crate::data_folder::DataFolder; @@ -278,10 +278,10 @@ mod tests { #[tokio::test] async fn test_write_and_commit() { - let (_temp_dir, _data_folder, mut writer) = create_normal_table_writer().await; + let (_temp_dir, _data_folder, mut writer) = create_time_series_table_writer().await; writer - .write(&test::normal_table_record_batch()) + .write(&test::compressed_segments_record_batch()) .await .unwrap(); @@ -291,9 +291,10 @@ mod tests { #[tokio::test] async fn test_write_empty_record_batch() { - let (_temp_dir, _data_folder, mut writer) = create_normal_table_writer().await; + let (_temp_dir, _data_folder, mut writer) = create_time_series_table_writer().await; - let empty_batch = RecordBatch::new_empty(Arc::new(test::normal_table_schema())); + let empty_batch = + RecordBatch::new_empty(test::time_series_table_metadata().compressed_schema); writer.write(&empty_batch).await.unwrap(); let delta_table = writer.commit().await.unwrap(); @@ -302,11 +303,9 @@ mod tests { #[tokio::test] async fn test_write_with_schema_mismatch() { - let (_temp_dir, _data_folder, mut writer) = create_normal_table_writer().await; + let (_temp_dir, _data_folder, mut writer) = create_time_series_table_writer().await; - let result = writer - .write(&test::compressed_segments_record_batch()) - .await; + let result = writer.write(&test::normal_table_record_batch()).await; assert!( result @@ -317,41 +316,8 @@ mod tests { } #[tokio::test] - async fn test_write_all_and_commit_to_normal_table() { - let (_temp_dir, data_folder, writer) = create_normal_table_writer().await; - - let batch = test::normal_table_record_batch(); - let delta_table = writer - .write_all_and_commit(&[batch.clone(), batch]) - .await - .unwrap(); - - assert_eq!(delta_table.get_file_uris().unwrap().count(), 1); - - // Verify both batches were written. - data_folder - .session_context() - .register_table(NORMAL_TABLE_NAME, Arc::new(delta_table)) - .unwrap(); - - let result = sql_and_concat( - data_folder.session_context(), - &format!("SELECT * FROM {NORMAL_TABLE_NAME}"), - ) - .await - .unwrap(); - - assert_eq!(result.num_rows(), 10); - } - - #[tokio::test] - async fn test_write_all_and_commit_to_time_series_table() { - let (_temp_dir, data_folder) = create_data_folder_with_time_series_table().await; - let delta_table = data_folder - .delta_table(test::TIME_SERIES_TABLE_NAME) - .await - .unwrap(); - let writer = DeltaTableWriter::try_new_for_time_series_table(delta_table).unwrap(); + async fn test_write_all_and_commit() { + let (_temp_dir, data_folder, writer) = create_time_series_table_writer().await; let batch = test::compressed_segments_record_batch(); let delta_table = writer @@ -364,12 +330,12 @@ mod tests { // Verify both batches were written. data_folder .session_context() - .register_table(test::TIME_SERIES_TABLE_NAME, Arc::new(delta_table)) + .register_table(TIME_SERIES_TABLE_NAME, Arc::new(delta_table)) .unwrap(); let result = sql_and_concat( data_folder.session_context(), - &format!("SELECT * FROM {}", test::TIME_SERIES_TABLE_NAME), + &format!("SELECT * FROM {TIME_SERIES_TABLE_NAME}"), ) .await .unwrap(); @@ -379,10 +345,10 @@ mod tests { #[tokio::test] async fn test_write_all_and_commit_rolls_back_on_error() { - let (temp_dir, data_folder, writer) = create_normal_table_writer().await; + let (temp_dir, data_folder, writer) = create_time_series_table_writer().await; - let valid_batch = test::normal_table_record_batch(); - let invalid_batch = test::compressed_segments_record_batch(); + let valid_batch = test::compressed_segments_record_batch(); + let invalid_batch = test::normal_table_record_batch(); let result = writer .write_all_and_commit(&[valid_batch, invalid_batch]) .await; @@ -394,20 +360,24 @@ mod tests { .starts_with("Delta Lake Error: Attempted to write invalid data to the table:") ); - let delta_table = data_folder.delta_table(NORMAL_TABLE_NAME).await.unwrap(); + // Verify no commit was made. + let delta_table = data_folder + .delta_table(TIME_SERIES_TABLE_NAME) + .await + .unwrap(); assert_eq!(delta_table.get_file_uris().unwrap().count(), 0); - // Verify the physical files were cleaned up. Only _delta_log should remain. - let table_path = format!( - "{}/tables/{NORMAL_TABLE_NAME}", + // Verify the physical files were cleaned up from the partition folder. + let column_path = format!( + "{}/tables/{TIME_SERIES_TABLE_NAME}/field_column=0", temp_dir.path().to_str().unwrap() ); - assert_eq!(std::fs::read_dir(&table_path).unwrap().count(), 1); + assert_eq!(std::fs::read_dir(&column_path).unwrap().count(), 0); } #[tokio::test] async fn test_commit_without_writes() { - let (_temp_dir, _data_folder, writer) = create_normal_table_writer().await; + let (_temp_dir, _data_folder, writer) = create_time_series_table_writer().await; let delta_table = writer.commit().await.unwrap(); assert_eq!(delta_table.get_file_uris().unwrap().count(), 0); @@ -415,30 +385,22 @@ mod tests { #[tokio::test] async fn test_rollback() { - let (temp_dir, _data_folder, mut writer) = create_normal_table_writer().await; + let (temp_dir, _data_folder, mut writer) = create_time_series_table_writer().await; writer - .write(&test::normal_table_record_batch()) + .write(&test::compressed_segments_record_batch()) .await .unwrap(); let delta_table = writer.rollback().await.unwrap(); assert_eq!(delta_table.get_file_uris().unwrap().count(), 0); - // Verify the physical files were cleaned up. Only _delta_log should remain. - let table_path = format!( - "{}/tables/{NORMAL_TABLE_NAME}", + // Verify the physical files were cleaned up from the partition folder. + let column_path = format!( + "{}/tables/{TIME_SERIES_TABLE_NAME}/field_column=0", temp_dir.path().to_str().unwrap() ); - assert_eq!(std::fs::read_dir(&table_path).unwrap().count(), 1); - } - - async fn create_normal_table_writer() -> (TempDir, DataFolder, DeltaTableWriter) { - let (temp_dir, data_folder) = create_data_folder_with_normal_table().await; - let delta_table = data_folder.delta_table(NORMAL_TABLE_NAME).await.unwrap(); - let writer = DeltaTableWriter::try_new_for_normal_table(delta_table).unwrap(); - - (temp_dir, data_folder, writer) + assert_eq!(std::fs::read_dir(&column_path).unwrap().count(), 0); } async fn create_data_folder_with_normal_table() -> (TempDir, DataFolder) { @@ -453,15 +415,17 @@ mod tests { (temp_dir, data_folder) } - async fn create_data_folder_with_time_series_table() -> (TempDir, DataFolder) { + async fn create_time_series_table_writer() -> (TempDir, DataFolder, DeltaTableWriter) { let temp_dir = tempfile::tempdir().unwrap(); let data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); - data_folder + let delta_table = data_folder .create_time_series_table(&test::time_series_table_metadata()) .await .unwrap(); - (temp_dir, data_folder) + let writer = DeltaTableWriter::try_new_for_time_series_table(delta_table).unwrap(); + + (temp_dir, data_folder, writer) } } From 81d5f9919761e496e458aab4626d622b61b6cfda Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Fri, 20 Mar 2026 14:55:56 +0100 Subject: [PATCH 44/48] Import TIME_SERIES_TABLE_NAME directly --- .../modelardb_storage/src/data_folder/mod.rs | 57 ++++++++----------- 1 file changed, 24 insertions(+), 33 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index 5aac8195..1b8fef55 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -1008,7 +1008,7 @@ mod tests { use datafusion::common::ScalarValue::Int64; use datafusion::logical_expr::Expr::Literal; use modelardb_test::table as test; - use modelardb_test::table::NoOpDataSink; + use modelardb_test::table::{NoOpDataSink, TIME_SERIES_TABLE_NAME}; use modelardb_types::types::ArrowTimestamp; use tempfile::TempDir; @@ -1109,7 +1109,7 @@ mod tests { assert!( data_folder .session_context - .table_provider(test::TIME_SERIES_TABLE_NAME) + .table_provider(TIME_SERIES_TABLE_NAME) .await .is_ok() ); @@ -1153,7 +1153,7 @@ mod tests { assert!( data_folder - .delta_table(test::TIME_SERIES_TABLE_NAME) + .delta_table(TIME_SERIES_TABLE_NAME) .await .is_ok() ); @@ -1166,7 +1166,7 @@ mod tests { assert_eq!( **batch.column(0), - StringArray::from(vec![test::TIME_SERIES_TABLE_NAME]) + StringArray::from(vec![TIME_SERIES_TABLE_NAME]) ); assert_eq!( **batch.column(1), @@ -1185,10 +1185,7 @@ mod tests { assert_eq!( **batch.column(0), - StringArray::from(vec![ - test::TIME_SERIES_TABLE_NAME, - test::TIME_SERIES_TABLE_NAME - ]) + StringArray::from(vec![TIME_SERIES_TABLE_NAME, TIME_SERIES_TABLE_NAME]) ); assert_eq!( **batch.column(1), @@ -1258,13 +1255,13 @@ mod tests { let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; data_folder - .drop_table(test::TIME_SERIES_TABLE_NAME) + .drop_table(TIME_SERIES_TABLE_NAME) .await .unwrap(); assert!( data_folder - .delta_table(test::TIME_SERIES_TABLE_NAME) + .delta_table(TIME_SERIES_TABLE_NAME) .await .is_err() ); @@ -1321,7 +1318,7 @@ mod tests { let mut delta_table = data_folder .write_record_batches( - test::TIME_SERIES_TABLE_NAME, + TIME_SERIES_TABLE_NAME, vec![test::compressed_segments_record_batch()], ) .await @@ -1330,7 +1327,7 @@ mod tests { assert_eq!(delta_table.get_file_uris().unwrap().count(), 1); data_folder - .truncate_table(test::TIME_SERIES_TABLE_NAME) + .truncate_table(TIME_SERIES_TABLE_NAME) .await .unwrap(); @@ -1383,14 +1380,14 @@ mod tests { data_folder .write_record_batches( - test::TIME_SERIES_TABLE_NAME, + TIME_SERIES_TABLE_NAME, vec![test::compressed_segments_record_batch()], ) .await .unwrap(); data_folder - .truncate_table(test::TIME_SERIES_TABLE_NAME) + .truncate_table(TIME_SERIES_TABLE_NAME) .await .unwrap(); @@ -1398,12 +1395,12 @@ mod tests { let column_path = format!( "{}/tables/{}/field_column=0", temp_dir.path().to_str().unwrap(), - test::TIME_SERIES_TABLE_NAME + TIME_SERIES_TABLE_NAME ); assert_eq!(std::fs::read_dir(&column_path).unwrap().count(), 1); data_folder - .vacuum_table(test::TIME_SERIES_TABLE_NAME, Some(0)) + .vacuum_table(TIME_SERIES_TABLE_NAME, Some(0)) .await .unwrap(); @@ -1498,7 +1495,7 @@ mod tests { let batch_to_write = test::compressed_segments_record_batch(); let delta_table = data_folder - .write_record_batches(test::TIME_SERIES_TABLE_NAME, vec![batch_to_write.clone()]) + .write_record_batches(TIME_SERIES_TABLE_NAME, vec![batch_to_write.clone()]) .await .unwrap(); @@ -1512,7 +1509,7 @@ mod tests { data_folder .session_context - .register_table(test::TIME_SERIES_TABLE_NAME, Arc::new(delta_table)) + .register_table(TIME_SERIES_TABLE_NAME, Arc::new(delta_table)) .unwrap(); let read_batch = sql_and_concat( @@ -1520,7 +1517,7 @@ mod tests { &format!( "SELECT {} FROM {}", column_names.join(", "), - test::TIME_SERIES_TABLE_NAME + TIME_SERIES_TABLE_NAME ), ) .await @@ -1580,7 +1577,7 @@ mod tests { let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; assert!( !data_folder - .is_normal_table(test::TIME_SERIES_TABLE_NAME) + .is_normal_table(TIME_SERIES_TABLE_NAME) .await .unwrap() ); @@ -1591,7 +1588,7 @@ mod tests { let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; assert!( data_folder - .is_time_series_table(test::TIME_SERIES_TABLE_NAME) + .is_time_series_table(TIME_SERIES_TABLE_NAME) .await .unwrap() ); @@ -1621,11 +1618,7 @@ mod tests { let table_names = data_folder.table_names().await.unwrap(); assert_eq!( table_names, - vec![ - "normal_table_2", - "normal_table_1", - test::TIME_SERIES_TABLE_NAME - ] + vec!["normal_table_2", "normal_table_1", TIME_SERIES_TABLE_NAME] ); } @@ -1642,7 +1635,7 @@ mod tests { let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; let time_series_table_names = data_folder.time_series_table_names().await.unwrap(); - assert_eq!(time_series_table_names, vec![test::TIME_SERIES_TABLE_NAME]); + assert_eq!(time_series_table_names, vec![TIME_SERIES_TABLE_NAME]); } #[tokio::test] @@ -1683,7 +1676,7 @@ mod tests { let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; let time_series_table_metadata = data_folder - .time_series_table_metadata_for_time_series_table(test::TIME_SERIES_TABLE_NAME) + .time_series_table_metadata_for_time_series_table(TIME_SERIES_TABLE_NAME) .await .unwrap(); @@ -1717,9 +1710,7 @@ mod tests { .unwrap(); let metadata = data_folder - .time_series_table_metadata_for_registered_time_series_table( - test::TIME_SERIES_TABLE_NAME, - ) + .time_series_table_metadata_for_registered_time_series_table(TIME_SERIES_TABLE_NAME) .await; assert_eq!( @@ -1733,7 +1724,7 @@ mod tests { let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; let metadata = data_folder - .time_series_table_metadata_for_registered_time_series_table("missing_table") + .time_series_table_metadata_for_registered_time_series_table(TIME_SERIES_TABLE_NAME) .await; assert!(metadata.is_none()); @@ -1744,7 +1735,7 @@ mod tests { let (_temp_dir, data_folder) = create_data_folder_and_create_time_series_table().await; let error_bounds = data_folder - .error_bounds(test::TIME_SERIES_TABLE_NAME, 4) + .error_bounds(TIME_SERIES_TABLE_NAME, 4) .await .unwrap(); From 0f14a951ea40752c546e6573dff6cdf52c1f40fc Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Fri, 20 Mar 2026 15:10:05 +0100 Subject: [PATCH 45/48] Use time_series_table_metadata_for_registered_time_series_table in table_writer --- crates/modelardb_storage/src/data_folder/mod.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index 1b8fef55..9bbd275d 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -664,7 +664,11 @@ impl DataFolder { /// the table does not exist. pub async fn table_writer(&self, table_name: &str) -> Result { let delta_table = self.delta_table(table_name).await?; - if self.is_time_series_table(table_name).await? { + if self + .time_series_table_metadata_for_registered_time_series_table(table_name) + .await + .is_some() + { DeltaTableWriter::try_new_for_time_series_table(delta_table) } else { DeltaTableWriter::try_new_for_normal_table(delta_table) From 5162a66cdc4d8c45c2b5e2dd106fb9d931c3a47a Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Fri, 20 Mar 2026 15:37:06 +0100 Subject: [PATCH 46/48] Use partition column to identify time series tables --- crates/modelardb_storage/src/data_folder/mod.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/modelardb_storage/src/data_folder/mod.rs b/crates/modelardb_storage/src/data_folder/mod.rs index 9bbd275d..9dc814e8 100644 --- a/crates/modelardb_storage/src/data_folder/mod.rs +++ b/crates/modelardb_storage/src/data_folder/mod.rs @@ -664,11 +664,11 @@ impl DataFolder { /// the table does not exist. pub async fn table_writer(&self, table_name: &str) -> Result { let delta_table = self.delta_table(table_name).await?; - if self - .time_series_table_metadata_for_registered_time_series_table(table_name) - .await - .is_some() - { + let partition_columns = delta_table.snapshot()?.metadata().partition_columns(); + + // If the table is a time series table, the partition column is the field column. + // self.is_time_series_table() is not used to avoid a redundant query to the metadata table. + if partition_columns.contains(&FIELD_COLUMN.to_owned()) { DeltaTableWriter::try_new_for_time_series_table(delta_table) } else { DeltaTableWriter::try_new_for_normal_table(delta_table) From ac33ab271c2cd78b6f50ee07abe1febf027876d1 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Tue, 24 Mar 2026 08:01:44 +0100 Subject: [PATCH 47/48] Update based on comment from @chrthomsen --- crates/modelardb_storage/src/data_folder/delta_table_writer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/modelardb_storage/src/data_folder/delta_table_writer.rs b/crates/modelardb_storage/src/data_folder/delta_table_writer.rs index a3073be0..f0d5cefa 100644 --- a/crates/modelardb_storage/src/data_folder/delta_table_writer.rs +++ b/crates/modelardb_storage/src/data_folder/delta_table_writer.rs @@ -98,7 +98,7 @@ impl DeltaTableWriter { partition_columns: Vec, writer_properties: WriterProperties, ) -> Result { - // Checker for if record batches match the table’s invariants, constraints, and nullability. + // Checks whether record batches match the table’s invariants, constraints, and nullability. let delta_table_state = delta_table.snapshot()?; let snapshot = delta_table_state.snapshot(); let delta_data_checker = DeltaDataChecker::new(snapshot); From dbec39179b8bcb0b2b4ed5fce0d56b63f22e9b61 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Tue, 24 Mar 2026 12:18:14 +0100 Subject: [PATCH 48/48] Update based on comment from @skejserjensen --- crates/modelardb_storage/src/data_folder/delta_table_writer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/modelardb_storage/src/data_folder/delta_table_writer.rs b/crates/modelardb_storage/src/data_folder/delta_table_writer.rs index f0d5cefa..8501c4f7 100644 --- a/crates/modelardb_storage/src/data_folder/delta_table_writer.rs +++ b/crates/modelardb_storage/src/data_folder/delta_table_writer.rs @@ -115,7 +115,7 @@ impl DeltaTableWriter { }; // A UUID version 4 is used as the operation id to match the existing Operation trait in the - // deltalake crate as it is pub(trait) and thus cannot be used directly in DeltaTableWriter. + // deltalake crate as it is pub(crate) and thus cannot be used directly in DeltaTableWriter. let operation_id = Uuid::new_v4(); // Writer that will write the record batches.