Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion src/iceberg/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ set(ICEBERG_SOURCES
manifest/manifest_entry.cc
manifest/manifest_list.cc
manifest/manifest_reader.cc
manifest/manifest_reader_internal.cc
manifest/manifest_writer.cc
manifest/v1_metadata.cc
manifest/v2_metadata.cc
Expand Down
152 changes: 90 additions & 62 deletions src/iceberg/manifest/manifest_entry.h
Original file line number Diff line number Diff line change
Expand Up @@ -178,94 +178,114 @@ struct ICEBERG_EXPORT DataFile {
/// present
std::optional<int64_t> content_size_in_bytes;

static constexpr int32_t kContentFieldId = 134;
inline static const SchemaField kContent = SchemaField::MakeOptional(
134, "content", iceberg::int32(),
kContentFieldId, "content", int32(),
"Contents of the file: 0=data, 1=position deletes, 2=equality deletes");

static constexpr int32_t kFilePathFieldId = 100;
inline static const SchemaField kFilePath = SchemaField::MakeRequired(
100, "file_path", iceberg::string(), "Location URI with FS scheme");
inline static const SchemaField kFileFormat = SchemaField::MakeRequired(
101, "file_format", iceberg::string(), "File format name: avro, orc, or parquet");
inline static const int32_t kPartitionFieldId = 102;
kFilePathFieldId, "file_path", string(), "Location URI with FS scheme");

static constexpr int32_t kFileFormatFieldId = 101;
inline static const SchemaField kFileFormat =
SchemaField::MakeRequired(kFileFormatFieldId, "file_format", string(),
"File format name: avro, orc, or parquet");

static constexpr int32_t kPartitionFieldId = 102;
inline static const std::string kPartitionField = "partition";
inline static const std::string kPartitionDoc =
"Partition data tuple, schema based on the partition spec";

static constexpr int32_t kRecordCountFieldId = 103;
inline static const SchemaField kRecordCount = SchemaField::MakeRequired(
103, "record_count", iceberg::int64(), "Number of records in the file");
kRecordCountFieldId, "record_count", int64(), "Number of records in the file");

static constexpr int32_t kFileSizeFieldId = 104;
inline static const SchemaField kFileSize = SchemaField::MakeRequired(
104, "file_size_in_bytes", iceberg::int64(), "Total file size in bytes");
kFileSizeFieldId, "file_size_in_bytes", int64(), "Total file size in bytes");

static constexpr int32_t kColumnSizesFieldId = 108;
inline static const SchemaField kColumnSizes = SchemaField::MakeOptional(
108, "column_sizes",
std::make_shared<MapType>(
SchemaField::MakeRequired(117, std::string(MapType::kKeyName),
iceberg::int32()),
SchemaField::MakeRequired(118, std::string(MapType::kValueName),
iceberg::int64())),
kColumnSizesFieldId, "column_sizes",
map(SchemaField::MakeRequired(117, std::string(MapType::kKeyName), int32()),
SchemaField::MakeRequired(118, std::string(MapType::kValueName), int64())),
"Map of column id to total size on disk");

static constexpr int32_t kValueCountsFieldId = 109;
inline static const SchemaField kValueCounts = SchemaField::MakeOptional(
109, "value_counts",
std::make_shared<MapType>(
SchemaField::MakeRequired(119, std::string(MapType::kKeyName),
iceberg::int32()),
SchemaField::MakeRequired(120, std::string(MapType::kValueName),
iceberg::int64())),
kValueCountsFieldId, "value_counts",
map(SchemaField::MakeRequired(119, std::string(MapType::kKeyName), int32()),
SchemaField::MakeRequired(120, std::string(MapType::kValueName), int64())),
"Map of column id to total count, including null and NaN");

static constexpr int32_t kNullValueCountsFieldId = 110;
inline static const SchemaField kNullValueCounts = SchemaField::MakeOptional(
110, "null_value_counts",
std::make_shared<MapType>(
SchemaField::MakeRequired(121, std::string(MapType::kKeyName),
iceberg::int32()),
SchemaField::MakeRequired(122, std::string(MapType::kValueName),
iceberg::int64())),
kNullValueCountsFieldId, "null_value_counts",
map(SchemaField::MakeRequired(121, std::string(MapType::kKeyName), int32()),
SchemaField::MakeRequired(122, std::string(MapType::kValueName), int64())),
"Map of column id to null value count");

static constexpr int32_t kNanValueCountsFieldId = 137;
inline static const SchemaField kNanValueCounts = SchemaField::MakeOptional(
137, "nan_value_counts",
std::make_shared<MapType>(
SchemaField::MakeRequired(138, std::string(MapType::kKeyName),
iceberg::int32()),
SchemaField::MakeRequired(139, std::string(MapType::kValueName),
iceberg::int64())),
kNanValueCountsFieldId, "nan_value_counts",
map(SchemaField::MakeRequired(138, std::string(MapType::kKeyName), int32()),
SchemaField::MakeRequired(139, std::string(MapType::kValueName), int64())),
"Map of column id to number of NaN values in the column");

static constexpr int32_t kLowerBoundsFieldId = 125;
inline static const SchemaField kLowerBounds = SchemaField::MakeOptional(
125, "lower_bounds",
std::make_shared<MapType>(
SchemaField::MakeRequired(126, std::string(MapType::kKeyName),
iceberg::int32()),
SchemaField::MakeRequired(127, std::string(MapType::kValueName),
iceberg::binary())),
kLowerBoundsFieldId, "lower_bounds",
map(SchemaField::MakeRequired(126, std::string(MapType::kKeyName), int32()),
SchemaField::MakeRequired(127, std::string(MapType::kValueName), binary())),
"Map of column id to lower bound");

static constexpr int32_t kUpperBoundsFieldId = 128;
inline static const SchemaField kUpperBounds = SchemaField::MakeOptional(
128, "upper_bounds",
std::make_shared<MapType>(
SchemaField::MakeRequired(129, std::string(MapType::kKeyName),
iceberg::int32()),
SchemaField::MakeRequired(130, std::string(MapType::kValueName),
iceberg::binary())),
kUpperBoundsFieldId, "upper_bounds",
map(SchemaField::MakeRequired(129, std::string(MapType::kKeyName), int32()),
SchemaField::MakeRequired(130, std::string(MapType::kValueName), binary())),
"Map of column id to upper bound");

static constexpr int32_t kKeyMetadataFieldId = 131;
inline static const SchemaField kKeyMetadata = SchemaField::MakeOptional(
131, "key_metadata", iceberg::binary(), "Encryption key metadata blob");
kKeyMetadataFieldId, "key_metadata", binary(), "Encryption key metadata blob");

static constexpr int32_t kSplitOffsetsFieldId = 132;
inline static const SchemaField kSplitOffsets = SchemaField::MakeOptional(
132, "split_offsets",
std::make_shared<ListType>(SchemaField::MakeRequired(
133, std::string(ListType::kElementName), iceberg::int64())),
kSplitOffsetsFieldId, "split_offsets",
list(SchemaField::MakeRequired(133, std::string(ListType::kElementName), int64())),
"Splittable offsets");

static constexpr int32_t kEqualityIdsFieldId = 135;
inline static const SchemaField kEqualityIds = SchemaField::MakeOptional(
135, "equality_ids",
std::make_shared<ListType>(SchemaField::MakeRequired(
136, std::string(ListType::kElementName), iceberg::int32())),
kEqualityIdsFieldId, "equality_ids",
list(SchemaField::MakeRequired(136, std::string(ListType::kElementName), int32())),
"Equality comparison field IDs");
inline static const SchemaField kSortOrderId =
SchemaField::MakeOptional(140, "sort_order_id", iceberg::int32(), "Sort order ID");
inline static const SchemaField kFirstRowId = SchemaField::MakeOptional(
142, "first_row_id", iceberg::int64(), "Starting row ID to assign to new rows");

static constexpr int32_t kSortOrderIdFieldId = 140;
inline static const SchemaField kSortOrderId = SchemaField::MakeOptional(
kSortOrderIdFieldId, "sort_order_id", int32(), "Sort order ID");

static constexpr int32_t kFirstRowIdFieldId = 142;
inline static const SchemaField kFirstRowId =
SchemaField::MakeOptional(kFirstRowIdFieldId, "first_row_id", int64(),
"Starting row ID to assign to new rows");

static constexpr int32_t kReferencedDataFileFieldId = 143;
inline static const SchemaField kReferencedDataFile = SchemaField::MakeOptional(
143, "referenced_data_file", iceberg::string(),
kReferencedDataFileFieldId, "referenced_data_file", string(),
"Fully qualified location (URI with FS scheme) of a data file that all deletes "
"reference");

static constexpr int32_t kContentOffsetFieldId = 144;
inline static const SchemaField kContentOffset =
SchemaField::MakeOptional(144, "content_offset", iceberg::int64(),
SchemaField::MakeOptional(kContentOffsetFieldId, "content_offset", int64(),
"The offset in the file where the content starts");

static constexpr int32_t kContentSizeFieldId = 145;
inline static const SchemaField kContentSize =
SchemaField::MakeOptional(145, "content_size_in_bytes", iceberg::int64(),
SchemaField::MakeOptional(kContentSizeFieldId, "content_size_in_bytes", int64(),
"The length of referenced content stored in the file");

bool operator==(const DataFile& other) const = default;
Expand Down Expand Up @@ -298,16 +318,24 @@ struct ICEBERG_EXPORT ManifestEntry {
/// File path, partition tuple, metrics, ...
std::shared_ptr<DataFile> data_file;

static constexpr int32_t kStatusFieldId = 0;
inline static const SchemaField kStatus =
SchemaField::MakeRequired(0, "status", iceberg::int32());
SchemaField::MakeRequired(kStatusFieldId, "status", int32());

static constexpr int32_t kSnapshotIdFieldId = 1;
inline static const SchemaField kSnapshotId =
SchemaField::MakeOptional(1, "snapshot_id", iceberg::int64());
SchemaField::MakeOptional(kSnapshotIdFieldId, "snapshot_id", int64());

/// Field ID that accompanies kDataFileField ("data_file"). Declared
/// `static constexpr` like the other field-ID constants in this struct so the
/// value is a compile-time constant (constexpr static members are implicitly
/// inline since C++17, so linkage is unchanged).
static constexpr int32_t kDataFileFieldId = 2;
inline static const std::string kDataFileField = "data_file";

static constexpr int32_t kSequenceNumberFieldId = 3;
inline static const SchemaField kSequenceNumber =
SchemaField::MakeOptional(3, "sequence_number", iceberg::int64());
inline static const SchemaField kFileSequenceNumber =
SchemaField::MakeOptional(4, "file_sequence_number", iceberg::int64());
SchemaField::MakeOptional(kSequenceNumberFieldId, "sequence_number", int64());

static constexpr int32_t kFileSequenceNumberFieldId = 4;
inline static const SchemaField kFileSequenceNumber = SchemaField::MakeOptional(
kFileSequenceNumberFieldId, "file_sequence_number", int64());

/// \brief Check if this manifest entry is alive (i.e. not deleted).
constexpr bool IsAlive() const {
Expand Down
14 changes: 8 additions & 6 deletions src/iceberg/manifest/manifest_list.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,22 +19,24 @@

#include "iceberg/manifest/manifest_list.h"

#include "iceberg/schema.h"
#include <memory>

#include "iceberg/type.h"

namespace iceberg {

const StructType& PartitionFieldSummary::Type() {
static const StructType kInstance{{
const std::shared_ptr<StructType>& PartitionFieldSummary::Type() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need to change the return type from `const T&` to `const std::shared_ptr<T>&`?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now it is a singleton that can be reused.

static const auto kInstance = std::make_shared<StructType>(std::vector<SchemaField>{
PartitionFieldSummary::kContainsNull,
PartitionFieldSummary::kContainsNaN,
PartitionFieldSummary::kLowerBound,
PartitionFieldSummary::kUpperBound,
}};
});
return kInstance;
}

const std::shared_ptr<Schema>& ManifestFile::Type() {
static const auto kInstance = std::make_shared<Schema>(std::vector<SchemaField>{
const std::shared_ptr<StructType>& ManifestFile::Type() {
static const auto kInstance = std::make_shared<StructType>(std::vector<SchemaField>{
kManifestPath,
kManifestLength,
kPartitionSpecId,
Expand Down
85 changes: 60 additions & 25 deletions src/iceberg/manifest/manifest_list.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ struct ICEBERG_EXPORT PartitionFieldSummary {

bool operator==(const PartitionFieldSummary& other) const = default;

static const StructType& Type();
static const std::shared_ptr<StructType>& Type();
};

/// \brief The type of files tracked by the manifest, either data or delete files; 0 for
Expand Down Expand Up @@ -153,51 +153,86 @@ struct ICEBERG_EXPORT ManifestFile {
/// \brief Checks if this manifest file contains entries with DELETED status
bool has_deleted_files() const { return deleted_files_count.value_or(1) > 0; }

inline static const int32_t kManifestPathFieldId = 500;
inline static const SchemaField kManifestPath = SchemaField::MakeRequired(
500, "manifest_path", iceberg::string(), "Location URI with FS scheme");
kManifestPathFieldId, "manifest_path", string(), "Location URI with FS scheme");

inline static const int32_t kManifestLengthFieldId = 501;
inline static const SchemaField kManifestLength = SchemaField::MakeRequired(
501, "manifest_length", iceberg::int64(), "Total file size in bytes");
kManifestLengthFieldId, "manifest_length", int64(), "Total file size in bytes");

inline static const int32_t kPartitionSpecIdFieldId = 502;
inline static const SchemaField kPartitionSpecId = SchemaField::MakeRequired(
502, "partition_spec_id", iceberg::int32(), "Spec ID used to write");
kPartitionSpecIdFieldId, "partition_spec_id", int32(), "Spec ID used to write");

inline static const int32_t kContentFieldId = 517;
inline static const SchemaField kContent = SchemaField::MakeOptional(
517, "content", iceberg::int32(), "Contents of the manifest: 0=data, 1=deletes");
kContentFieldId, "content", int32(), "Contents of the manifest: 0=data, 1=deletes");

inline static const int32_t kSequenceNumberFieldId = 515;
inline static const SchemaField kSequenceNumber =
SchemaField::MakeOptional(515, "sequence_number", iceberg::int64(),
SchemaField::MakeOptional(kSequenceNumberFieldId, "sequence_number", int64(),
"Sequence number when the manifest was added");

inline static const int32_t kMinSequenceNumberFieldId = 516;
inline static const SchemaField kMinSequenceNumber =
SchemaField::MakeOptional(516, "min_sequence_number", iceberg::int64(),
SchemaField::MakeOptional(kMinSequenceNumberFieldId, "min_sequence_number", int64(),
"Lowest sequence number in the manifest");
inline static const SchemaField kAddedSnapshotId = SchemaField::MakeRequired(
503, "added_snapshot_id", iceberg::int64(), "Snapshot ID that added the manifest");

inline static const int32_t kAddedSnapshotIdFieldId = 503;
inline static const SchemaField kAddedSnapshotId =
SchemaField::MakeRequired(kAddedSnapshotIdFieldId, "added_snapshot_id", int64(),
"Snapshot ID that added the manifest");

inline static const int32_t kAddedFilesCountFieldId = 504;
inline static const SchemaField kAddedFilesCount = SchemaField::MakeOptional(
504, "added_files_count", iceberg::int32(), "Added entry count");
inline static const SchemaField kExistingFilesCount = SchemaField::MakeOptional(
505, "existing_files_count", iceberg::int32(), "Existing entry count");
kAddedFilesCountFieldId, "added_files_count", int32(), "Added entry count");

inline static const int32_t kExistingFilesCountFieldId = 505;
inline static const SchemaField kExistingFilesCount =
SchemaField::MakeOptional(kExistingFilesCountFieldId, "existing_files_count",
int32(), "Existing entry count");

inline static const int32_t kDeletedFilesCountFieldId = 506;
inline static const SchemaField kDeletedFilesCount = SchemaField::MakeOptional(
506, "deleted_files_count", iceberg::int32(), "Deleted entry count");
kDeletedFilesCountFieldId, "deleted_files_count", int32(), "Deleted entry count");

inline static const int32_t kAddedRowsCountFieldId = 512;
inline static const SchemaField kAddedRowsCount = SchemaField::MakeOptional(
512, "added_rows_count", iceberg::int64(), "Added rows count");
kAddedRowsCountFieldId, "added_rows_count", int64(), "Added rows count");

inline static const int32_t kExistingRowsCountFieldId = 513;
inline static const SchemaField kExistingRowsCount = SchemaField::MakeOptional(
513, "existing_rows_count", iceberg::int64(), "Existing rows count");
kExistingRowsCountFieldId, "existing_rows_count", int64(), "Existing rows count");

inline static const int32_t kDeletedRowsCountFieldId = 514;
inline static const SchemaField kDeletedRowsCount = SchemaField::MakeOptional(
514, "deleted_rows_count", iceberg::int64(), "Deleted rows count");
kDeletedRowsCountFieldId, "deleted_rows_count", int64(), "Deleted rows count");

inline static const int32_t kPartitionSummaryFieldId = 507;
inline static const SchemaField kPartitions = SchemaField::MakeOptional(
507, "partitions",
std::make_shared<ListType>(SchemaField::MakeRequired(
508, std::string(ListType::kElementName),
struct_(
{PartitionFieldSummary::kContainsNull, PartitionFieldSummary::kContainsNaN,
PartitionFieldSummary::kLowerBound, PartitionFieldSummary::kUpperBound}))),
kPartitionSummaryFieldId, "partitions",
list(SchemaField::MakeRequired(508, std::string(ListType::kElementName),
struct_({
PartitionFieldSummary::kContainsNull,
PartitionFieldSummary::kContainsNaN,
PartitionFieldSummary::kLowerBound,
PartitionFieldSummary::kUpperBound,
}))),
"Summary for each partition");

inline static const int32_t kKeyMetadataFieldId = 519;
inline static const SchemaField kKeyMetadata = SchemaField::MakeOptional(
519, "key_metadata", iceberg::binary(), "Encryption key metadata blob");
kKeyMetadataFieldId, "key_metadata", binary(), "Encryption key metadata blob");

inline static const int32_t kFirstRowIdFieldId = 520;
inline static const SchemaField kFirstRowId = SchemaField::MakeOptional(
520, "first_row_id", iceberg::int64(),
kFirstRowIdFieldId, "first_row_id", int64(),
"Starting row ID to assign to new rows in ADDED data files");

bool operator==(const ManifestFile& other) const = default;

static const std::shared_ptr<Schema>& Type();
static const std::shared_ptr<StructType>& Type();
};

/// Snapshots are embedded in table metadata, but the list of manifests for a snapshot is
Expand Down
Loading
Loading