Skip to content

Commit dd3c2f7

Browse files
committed
feat: assign fresh field ids for new schema
1 parent 7b95952 commit dd3c2f7

File tree

8 files changed

+452
-21
lines changed

8 files changed

+452
-21
lines changed

src/iceberg/schema.cc

Lines changed: 61 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,28 @@
3333

3434
namespace iceberg {
3535

36-
Schema::Schema(std::vector<SchemaField> fields, std::optional<int32_t> schema_id)
37-
: StructType(std::move(fields)), schema_id_(schema_id) {}
36+
Schema::Schema(std::vector<SchemaField> fields, std::optional<int32_t> schema_id,
37+
std::vector<int32_t> identifier_field_ids)
38+
: StructType(std::move(fields)),
39+
schema_id_(schema_id),
40+
identifier_field_ids_(std::move(identifier_field_ids)) {}
41+
42+
Result<std::unique_ptr<Schema>> Schema::Make(
43+
std::vector<SchemaField> fields, std::optional<int32_t> schema_id,
44+
const std::vector<std::string>& identifier_field_names) {
45+
auto schema = std::make_unique<Schema>(std::move(fields), schema_id);
46+
47+
std::vector<int32_t> fresh_identifier_ids;
48+
for (const auto& name : identifier_field_names) {
49+
ICEBERG_ASSIGN_OR_RAISE(auto field, schema->FindFieldByName(name));
50+
if (!field) {
51+
return InvalidSchema("Cannot find identifier field: {}", name);
52+
}
53+
fresh_identifier_ids.push_back(field.value().get().field_id());
54+
}
55+
schema->identifier_field_ids_ = std::move(fresh_identifier_ids);
56+
return schema;
57+
}
3858

3959
std::optional<int32_t> Schema::schema_id() const { return schema_id_; }
4060

@@ -48,15 +68,16 @@ std::string Schema::ToString() const {
4868
}
4969

5070
bool Schema::Equals(const Schema& other) const {
51-
return schema_id_ == other.schema_id_ && fields_ == other.fields_;
71+
return schema_id_ == other.schema_id_ && fields_ == other.fields_ &&
72+
identifier_field_ids_ == other.identifier_field_ids_;
5273
}
5374

5475
Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFieldByName(
5576
std::string_view name, bool case_sensitive) const {
5677
if (case_sensitive) {
57-
ICEBERG_ASSIGN_OR_RAISE(auto name_to_id, name_to_id_.Get(*this));
58-
auto it = name_to_id.get().find(name);
59-
if (it == name_to_id.get().end()) {
78+
ICEBERG_ASSIGN_OR_RAISE(auto name_id_map, name_id_map_.Get(*this));
79+
auto it = name_id_map.get().name_to_id.find(name);
80+
if (it == name_id_map.get().name_to_id.end()) {
6081
return std::nullopt;
6182
};
6283
return FindFieldById(it->second);
@@ -77,21 +98,22 @@ Schema::InitIdToFieldMap(const Schema& self) {
7798
return id_to_field;
7899
}
79100

80-
Result<std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>>
81-
Schema::InitNameToIdMap(const Schema& self) {
82-
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>> name_to_id;
83-
NameToIdVisitor visitor(name_to_id, /*case_sensitive=*/true);
101+
Result<Schema::NameIdMap> Schema::InitNameIdMap(const Schema& self) {
102+
NameIdMap name_id_map;
103+
NameToIdVisitor visitor(name_id_map.name_to_id, &name_id_map.id_to_name,
104+
/*case_sensitive=*/true);
84105
ICEBERG_RETURN_UNEXPECTED(
85106
VisitTypeInline(self, &visitor, /*path=*/"", /*short_path=*/""));
86107
visitor.Finish();
87-
return name_to_id;
108+
return name_id_map;
88109
}
89110

90111
Result<std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>>
91112
Schema::InitLowerCaseNameToIdMap(const Schema& self) {
92113
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>
93114
lowercase_name_to_id;
94-
NameToIdVisitor visitor(lowercase_name_to_id, /*case_sensitive=*/false);
115+
NameToIdVisitor visitor(lowercase_name_to_id, /*id_to_name=*/nullptr,
116+
/*case_sensitive=*/false);
95117
ICEBERG_RETURN_UNEXPECTED(
96118
VisitTypeInline(self, &visitor, /*path=*/"", /*short_path=*/""));
97119
visitor.Finish();
@@ -108,6 +130,16 @@ Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFie
108130
return it->second;
109131
}
110132

133+
Result<std::optional<std::string_view>> Schema::FindColumnNameById(
134+
int32_t field_id) const {
135+
ICEBERG_ASSIGN_OR_RAISE(auto name_id_map, name_id_map_.Get(*this));
136+
auto it = name_id_map.get().id_to_name.find(field_id);
137+
if (it == name_id_map.get().id_to_name.end()) {
138+
return std::nullopt;
139+
}
140+
return it->second;
141+
}
142+
111143
Result<std::unordered_map<int32_t, std::vector<size_t>>> Schema::InitIdToPositionPath(
112144
const Schema& self) {
113145
PositionPathVisitor visitor;
@@ -179,4 +211,21 @@ Result<std::unique_ptr<Schema>> Schema::Project(
179211
std::nullopt);
180212
}
181213

214+
const std::vector<int32_t>& Schema::IdentifierFieldIds() const {
215+
return identifier_field_ids_;
216+
}
217+
218+
Result<std::vector<std::string>> Schema::IdentifierFieldNames() const {
219+
std::vector<std::string> names;
220+
names.reserve(identifier_field_ids_.size());
221+
for (auto id : identifier_field_ids_) {
222+
ICEBERG_ASSIGN_OR_RAISE(auto name, FindColumnNameById(id));
223+
if (!name.has_value()) {
224+
return InvalidSchema("Cannot find the field of the specified field id: {}", id);
225+
}
226+
names.emplace_back(name.value());
227+
}
228+
return names;
229+
}
230+
182231
} // namespace iceberg

src/iceberg/schema.h

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,18 @@ class ICEBERG_EXPORT Schema : public StructType {
4949
static constexpr int32_t kInvalidColumnId = -1;
5050

5151
explicit Schema(std::vector<SchemaField> fields,
52-
std::optional<int32_t> schema_id = std::nullopt);
52+
std::optional<int32_t> schema_id = std::nullopt,
53+
std::vector<int32_t> identifier_field_ids = {});
54+
55+
/// \brief Create a schema.
56+
///
57+
/// \param fields The fields that make up the schema.
58+
/// \param schema_id The unique identifier for this schema (default: kInitialSchemaId).
59+
/// \param identifier_field_names Canonical names of fields that uniquely identify rows
60+
/// in the table (default: empty). \return A new Schema instance or Status if failed.
61+
static Result<std::unique_ptr<Schema>> Make(
62+
std::vector<SchemaField> fields, std::optional<int32_t> schema_id = std::nullopt,
63+
const std::vector<std::string>& identifier_field_names = {});
5364

5465
/// \brief Get the schema ID.
5566
///
@@ -78,6 +89,13 @@ class ICEBERG_EXPORT Schema : public StructType {
7889
Result<std::optional<std::reference_wrapper<const SchemaField>>> FindFieldById(
7990
int32_t field_id) const;
8091

92+
/// \brief Returns the canonical field name for the given id.
93+
///
94+
/// \param field_id The id of the field to get the canonical name for.
95+
/// \return The canocinal column name of the field with the given id, or std::nullopt if
96+
/// not found.
97+
Result<std::optional<std::string_view>> FindColumnNameById(int32_t field_id) const;
98+
8199
/// \brief Get the accessor to access the field by field id.
82100
///
83101
/// \param field_id The id of the field to get the accessor for.
@@ -103,26 +121,48 @@ class ICEBERG_EXPORT Schema : public StructType {
103121
Result<std::unique_ptr<Schema>> Project(
104122
const std::unordered_set<int32_t>& field_ids) const;
105123

124+
/// \brief Return the field IDs of the identifier fields.
125+
const std::vector<int32_t>& IdentifierFieldIds() const;
126+
127+
/// \brief Return the canonical field names of the identifier fields.
128+
Result<std::vector<std::string>> IdentifierFieldNames() const;
129+
106130
friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); }
107131

108132
private:
109133
/// \brief Compare two schemas for equality.
110134
bool Equals(const Schema& other) const;
111135

136+
struct NameIdMap {
137+
/// \brief Mapping from canonical field name to ID
138+
///
139+
/// \note Short names for maps and lists are included for any name that does not
140+
/// conflict with a canonical name. For example, a list, 'l', of structs with field
141+
/// 'x' will produce short name 'l.x' in addition to canonical name 'l.element.x'.
142+
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>> name_to_id;
143+
144+
/// \brief Mapping from field ID to canonical name
145+
///
146+
/// \note Canonical names, but not short names are set, for example
147+
/// 'list.element.field' instead of 'list.field'.
148+
std::unordered_map<int32_t, std::string> id_to_name;
149+
};
150+
112151
static Result<std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>>
113152
InitIdToFieldMap(const Schema&);
114-
static Result<std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>>
115-
InitNameToIdMap(const Schema&);
153+
static Result<NameIdMap> InitNameIdMap(const Schema&);
116154
static Result<std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>>
117155
InitLowerCaseNameToIdMap(const Schema&);
118156
static Result<std::unordered_map<int32_t, std::vector<size_t>>> InitIdToPositionPath(
119157
const Schema&);
120158

121159
const std::optional<int32_t> schema_id_;
160+
/// Field IDs that uniquely identify rows in the table.
161+
std::vector<int32_t> identifier_field_ids_;
122162
/// Mapping from field id to field.
123163
Lazy<InitIdToFieldMap> id_to_field_;
124164
/// Mapping from field name to field id.
125-
Lazy<InitNameToIdMap> name_to_id_;
165+
Lazy<InitNameIdMap> name_id_map_;
126166
/// Mapping from lowercased field name to field id
127167
Lazy<InitLowerCaseNameToIdMap> lowercase_name_to_id_;
128168
/// Mapping from field id to (nested) position path to access the field.

src/iceberg/test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ endfunction()
5454

5555
add_iceberg_test(schema_test
5656
SOURCES
57+
assign_id_visitor_test.cc
5758
name_mapping_test.cc
5859
partition_field_test.cc
5960
partition_spec_test.cc

0 commit comments

Comments
 (0)