diff --git a/crates/paimon/src/spec/data_file.rs b/crates/paimon/src/spec/data_file.rs index 37165e6..43b69ce 100644 --- a/crates/paimon/src/spec/data_file.rs +++ b/crates/paimon/src/spec/data_file.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::spec::RowType; +use crate::spec::{BinaryTableStats, RowType}; use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use std::fmt::{Display, Formatter}; @@ -48,24 +48,6 @@ impl BinaryRow { } } -/// TODO: implement me. -/// The statistics for columns, supports the following stats. -/// -/// Impl References: -type SimpleStats = (); - -/// The Source of a file. -/// TODO: move me to the manifest module. -/// -/// Impl References: -#[repr(u8)] -#[derive(Debug, Clone, Copy, Eq, PartialEq, Serialize, Deserialize)] -#[serde(rename_all = "camelCase")] -pub enum FileSource { - Append = 0, - Compact = 1, -} - /// Metadata of a data file. /// /// Impl References: @@ -78,8 +60,8 @@ pub struct DataFileMeta { pub row_count: i64, pub min_key: BinaryRow, pub max_key: BinaryRow, - pub key_stats: SimpleStats, - pub value_stats: SimpleStats, + pub key_stats: Option, + pub value_stats: Option, pub min_sequence_number: i64, pub max_sequence_number: i64, pub schema_id: i64, @@ -90,7 +72,6 @@ pub struct DataFileMeta { pub delete_row_count: Option, // file index filter bytes, if it is small, store in data file meta pub embedded_index: Option>, - pub file_source: Option, } impl Display for DataFileMeta { @@ -100,6 +81,143 @@ impl Display for DataFileMeta { } impl DataFileMeta { - // TODO: implement me pub const SCHEMA: RowType = RowType::new(vec![]); + + /// Get the file name. + pub fn file_name(&self) -> &str { + &self.file_name + } + + /// Get the file size. + pub fn file_size(&self) -> i64 { + self.file_size + } + + /// Get the row count. + pub fn row_count(&self) -> i64 { + self.row_count + } + + /// Get the min key. + pub fn min_key(&self) -> &BinaryRow { + &self.min_key + } + + /// Get the max key. + pub fn max_key(&self) -> &BinaryRow { + &self.max_key + } + + /// Get the key stats. + pub fn key_stats(&self) -> Option<&BinaryTableStats> { + self.key_stats.as_ref() + } + + /// Get the value stats. + pub fn value_stats(&self) -> Option<&BinaryTableStats> { + self.value_stats.as_ref() + } + + /// Get the min sequence number. + pub fn min_sequence_number(&self) -> i64 { + self.min_sequence_number + } + + /// Get the max sequence number. + pub fn max_sequence_number(&self) -> i64 { + self.max_sequence_number + } + + /// Get the schema id. + pub fn schema_id(&self) -> i64 { + self.schema_id + } + + /// Get the level. + pub fn level(&self) -> i32 { + self.level + } + + /// Get the extra files. + pub fn extra_files(&self) -> &[String] { + &self.extra_files + } + + /// Get the creation time. + pub fn creation_time(&self) -> DateTime { + self.creation_time + } + + /// Get the delete row count. + pub fn delete_row_count(&self) -> Option { + self.delete_row_count + } + + /// Get the embedded index. + pub fn embedded_index(&self) -> Option<&[u8]> { + self.embedded_index.as_deref() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_data_file_meta_serialize_deserialize() { + let json_data = r#" + { + "fileName":"test.avro", + "fileSize":1024, + "rowCount":100, + "minKey":{ + "arity":1, + "nullBitsSizeInBytes":1 + }, + "maxKey":{ + "arity":10, + "nullBitsSizeInBytes":2 + }, + "keyStats":null, + "valueStats":null, + "minSequenceNumber":0, + "maxSequenceNumber":100, + "schemaId":0, + "level":0, + "extraFiles":[], + "creationTime":"2024-08-13T02:03:03.106490600Z", + "deleteRowCount":5, + "embeddedIndex":null + } + "#; + + let data_file_meta: DataFileMeta = + serde_json::from_str(json_data).expect("Failed to deserialize DataFileMeta"); + + assert_eq!(data_file_meta.file_name, "test.avro"); + assert_eq!(data_file_meta.file_size, 1024); + assert_eq!(data_file_meta.row_count, 100); + + assert_eq!(data_file_meta.min_key.arity, 1); + assert_eq!(data_file_meta.min_key.null_bits_size_in_bytes, 1); + assert_eq!(data_file_meta.max_key.arity, 10); + assert_eq!(data_file_meta.max_key.null_bits_size_in_bytes, 2); + + assert!(data_file_meta.key_stats.is_none()); + assert!(data_file_meta.value_stats.is_none()); + + assert_eq!(data_file_meta.min_sequence_number, 0); + assert_eq!(data_file_meta.max_sequence_number, 100); + assert_eq!(data_file_meta.schema_id, 0); + assert_eq!(data_file_meta.level, 0); + assert_eq!(data_file_meta.extra_files.len(), 0); + assert_eq!( + data_file_meta.creation_time, + DateTime::parse_from_rfc3339("2024-08-13T02:03:03.106490600Z") + .unwrap() + .with_timezone(&Utc) + ); + assert_eq!(data_file_meta.delete_row_count, Some(5)); + assert!(data_file_meta.embedded_index.is_none()); + } } diff --git a/crates/paimon/src/spec/manifest_file_meta.rs b/crates/paimon/src/spec/manifest_file_meta.rs index 382d579..7e48ff1 100644 --- a/crates/paimon/src/spec/manifest_file_meta.rs +++ b/crates/paimon/src/spec/manifest_file_meta.rs @@ -186,3 +186,45 @@ impl Display for BinaryTableStats { todo!() } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_manifest_file_meta_serialize_deserialize() { + let data_json = r#" + { + "_VERSION":2, + "_FILE_NAME":"manifest_file_meta.avro", + "_FILE_SIZE":1024, + "_NUM_ADDED_FILES":5, + "_NUM_DELETED_FILES":6, + "_PARTITION_STATS":{"_MIN_VALUES":[0,1,2],"_MAX_VALUES":[3,4,5],"_NULL_COUNTS":[6,7,8]}, + "_SCHEMA_ID":1 + } + "#; + + let manifest_file_meta: ManifestFileMeta = + serde_json::from_str(data_json).expect("Failed to deserialize ManifestFileMeta."); + + assert_eq!(manifest_file_meta.file_name(), "manifest_file_meta.avro"); + assert_eq!(manifest_file_meta.file_size(), 1024); + assert_eq!(manifest_file_meta.num_added_files(), 5); + assert_eq!(manifest_file_meta.num_deleted_files(), 6); + assert_eq!(manifest_file_meta.schema_id(), 1); + assert_eq!(manifest_file_meta.version(), 2); + assert_eq!( + manifest_file_meta.partition_stats().min_values(), + &[0, 1, 2] + ); + assert_eq!( + manifest_file_meta.partition_stats().max_values(), + &[3, 4, 5] + ); + assert_eq!( + manifest_file_meta.partition_stats().null_counts(), + &[6, 7, 8] + ); + } +} diff --git a/crates/paimon/src/spec/schema.rs b/crates/paimon/src/spec/schema.rs index 4266421..58e143e 100644 --- a/crates/paimon/src/spec/schema.rs +++ b/crates/paimon/src/spec/schema.rs @@ -109,65 +109,64 @@ mod tests { use super::*; #[test] - fn test_create_data_field() { - let id = 1; - let name = "field1".to_string(); - let typ = DataType::Int(IntType::new()); - let description = "test description".to_string(); - - let data_field = DataField::new(id, name.clone(), typ.clone()) - .with_description(Some(description.clone())); - - assert_eq!(data_field.id(), id); - assert_eq!(data_field.name(), name); - assert_eq!(data_field.data_type(), &typ); - assert_eq!(data_field.description(), Some(description).as_deref()); - } - - #[test] - fn test_new_id() { - let d_type = DataType::Int(IntType::new()); - let new_data_field = DataField::new(1, "field1".to_string(), d_type.clone()).with_id(2); - - assert_eq!(new_data_field.id(), 2); - assert_eq!(new_data_field.name(), "field1"); - assert_eq!(new_data_field.data_type(), &d_type); - assert_eq!(new_data_field.description(), None); - } - - #[test] - fn test_new_name() { - let d_type = DataType::Int(IntType::new()); - let new_data_field = - DataField::new(1, "field1".to_string(), d_type.clone()).with_name("field2".to_string()); - - assert_eq!(new_data_field.id(), 1); - assert_eq!(new_data_field.name(), "field2"); - assert_eq!(new_data_field.data_type(), &d_type); - assert_eq!(new_data_field.description(), None); - } - - #[test] - fn test_new_description() { - let d_type = DataType::Int(IntType::new()); - let new_data_field = DataField::new(1, "field1".to_string(), d_type.clone()) - .with_description(Some("new description".to_string())); - - assert_eq!(new_data_field.id(), 1); - assert_eq!(new_data_field.name(), "field1"); - assert_eq!(new_data_field.data_type(), &d_type); - assert_eq!(new_data_field.description(), Some("new description")); - } - - #[test] - fn test_escape_identifier() { - let escaped_identifier = escape_identifier("\"identifier\""); - assert_eq!(escaped_identifier, "\"\"identifier\"\""); - } - - #[test] - fn test_escape_single_quotes() { - let escaped_text = escape_single_quotes("text with 'single' quotes"); - assert_eq!(escaped_text, "text with ''single'' quotes"); + fn test_table_schema_serialize_deserialize() { + let json_data = r#" + { + "version" : 2, + "id" : 1, + "fields" : [ { + "id" : 0, + "name" : "f0", + "type" : "INT" + }, { + "id" : 1, + "name" : "f1", + "type" : "INT" + }, { + "id" : 2, + "name" : "f2", + "type" : "INT" + } ], + "highestFieldId" : 10, + "partitionKeys" : [ "f0" ], + "primaryKeys" : [ "f1" ], + "options" : { }, + "comment" : "", + "timeMillis" : 1723440320019 + }"#; + + let table_schema: TableSchema = + serde_json::from_str(json_data).expect("Failed to deserialize TableSchema"); + + assert_eq!(table_schema.version, 2); + assert_eq!(table_schema.id, 1); + assert_eq!(table_schema.highest_field_id, 10); + assert_eq!(table_schema.partition_keys, vec!["f0"]); + assert_eq!(table_schema.primary_keys, vec!["f1"]); + assert_eq!(table_schema.options, HashMap::new()); + assert_eq!(table_schema.comment, Some("".to_string())); + assert_eq!(table_schema.time_millis, 1723440320019); + + assert_eq!(table_schema.fields.len(), 3); + assert_eq!(table_schema.fields[0].id, 0); + assert_eq!(table_schema.fields[0].name, "f0"); + assert_eq!( + table_schema.fields[0].data_type(), + &DataType::Int(IntType::new()) + ); + + assert_eq!(table_schema.fields[1].id, 1); + assert_eq!(table_schema.fields[1].name, "f1"); + assert_eq!( + table_schema.fields[1].data_type(), + &DataType::Int(IntType::new()) + ); + + assert_eq!(table_schema.fields[2].id, 2); + assert_eq!(table_schema.fields[2].name, "f2"); + assert_eq!( + table_schema.fields[2].data_type(), + &DataType::Int(IntType::new()) + ); } }