Skip to content

Commit 1006ae4

Browse files
committed
feat: assign fresh field ids for new schema
1 parent 7b95952 commit 1006ae4

File tree

8 files changed

+445
-20
lines changed

8 files changed

+445
-20
lines changed

src/iceberg/schema.cc

Lines changed: 46 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,11 @@
3333

3434
namespace iceberg {
3535

36-
Schema::Schema(std::vector<SchemaField> fields, std::optional<int32_t> schema_id)
37-
: StructType(std::move(fields)), schema_id_(schema_id) {}
36+
Schema::Schema(std::vector<SchemaField> fields, std::optional<int32_t> schema_id,
37+
std::vector<int32_t> identifier_field_ids)
38+
: StructType(std::move(fields)),
39+
schema_id_(schema_id),
40+
identifier_field_ids_(std::move(identifier_field_ids)) {}
3841

3942
std::optional<int32_t> Schema::schema_id() const { return schema_id_; }
4043

@@ -48,15 +51,16 @@ std::string Schema::ToString() const {
4851
}
4952

5053
bool Schema::Equals(const Schema& other) const {
51-
return schema_id_ == other.schema_id_ && fields_ == other.fields_;
54+
return schema_id_ == other.schema_id_ && fields_ == other.fields_ &&
55+
identifier_field_ids_ == other.identifier_field_ids_;
5256
}
5357

5458
Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFieldByName(
5559
std::string_view name, bool case_sensitive) const {
5660
if (case_sensitive) {
57-
ICEBERG_ASSIGN_OR_RAISE(auto name_to_id, name_to_id_.Get(*this));
58-
auto it = name_to_id.get().find(name);
59-
if (it == name_to_id.get().end()) {
61+
ICEBERG_ASSIGN_OR_RAISE(auto name_id_map, name_id_map_.Get(*this));
62+
auto it = name_id_map.get().name_to_id.find(name);
63+
if (it == name_id_map.get().name_to_id.end()) {
6064
return std::nullopt;
6165
};
6266
return FindFieldById(it->second);
@@ -77,21 +81,21 @@ Schema::InitIdToFieldMap(const Schema& self) {
7781
return id_to_field;
7882
}
7983

80-
Result<std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>>
81-
Schema::InitNameToIdMap(const Schema& self) {
82-
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>> name_to_id;
83-
NameToIdVisitor visitor(name_to_id, /*case_sensitive=*/true);
84+
Result<Schema::NameIdMap> Schema::InitNameIdMap(const Schema& self) {
85+
NameIdMap name_id_map;
86+
NameToIdVisitor visitor(name_id_map.name_to_id, &name_id_map.id_to_name,
87+
/*case_sensitive=*/true);
8488
ICEBERG_RETURN_UNEXPECTED(
8589
VisitTypeInline(self, &visitor, /*path=*/"", /*short_path=*/""));
8690
visitor.Finish();
87-
return name_to_id;
91+
return name_id_map;
8892
}
8993

9094
Result<std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>>
9195
Schema::InitLowerCaseNameToIdMap(const Schema& self) {
9296
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>
9397
lowercase_name_to_id;
94-
NameToIdVisitor visitor(lowercase_name_to_id, /*case_sensitive=*/false);
98+
NameToIdVisitor visitor(lowercase_name_to_id, nullptr, /*case_sensitive=*/false);
9599
ICEBERG_RETURN_UNEXPECTED(
96100
VisitTypeInline(self, &visitor, /*path=*/"", /*short_path=*/""));
97101
visitor.Finish();
@@ -108,6 +112,15 @@ Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFie
108112
return it->second;
109113
}
110114

115+
Result<std::optional<std::string>> Schema::FindColumnNameById(int32_t field_id) const {
116+
ICEBERG_ASSIGN_OR_RAISE(auto name_id_map, name_id_map_.Get(*this));
117+
auto it = name_id_map.get().id_to_name.find(field_id);
118+
if (it == name_id_map.get().id_to_name.end()) {
119+
return std::nullopt;
120+
}
121+
return it->second;
122+
}
123+
111124
Result<std::unordered_map<int32_t, std::vector<size_t>>> Schema::InitIdToPositionPath(
112125
const Schema& self) {
113126
PositionPathVisitor visitor;
@@ -179,4 +192,25 @@ Result<std::unique_ptr<Schema>> Schema::Project(
179192
std::nullopt);
180193
}
181194

195+
std::vector<int32_t> Schema::IdentifierFieldIds() const { return identifier_field_ids_; }
196+
197+
void Schema::SetIdentifierFieldIds(std::vector<int32_t> identifier_field_ids) {
198+
identifier_field_ids_ = std::move(identifier_field_ids);
199+
}
200+
201+
Result<std::vector<std::string>> Schema::IdentifierFieldNames() const {
202+
using std::ranges::to;
203+
using std::views::transform;
204+
std::vector<std::string> names;
205+
names.reserve(identifier_field_ids_.size());
206+
for (auto id : identifier_field_ids_) {
207+
ICEBERG_ASSIGN_OR_RAISE(auto name, FindColumnNameById(id));
208+
if (!name.has_value()) {
209+
return InvalidSchema("Can not find the field of the specified field id: {}", id);
210+
}
211+
names.push_back(name.value());
212+
}
213+
return names;
214+
}
215+
182216
} // namespace iceberg

src/iceberg/schema.h

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,8 @@ class ICEBERG_EXPORT Schema : public StructType {
4949
static constexpr int32_t kInvalidColumnId = -1;
5050

5151
explicit Schema(std::vector<SchemaField> fields,
52-
std::optional<int32_t> schema_id = std::nullopt);
52+
std::optional<int32_t> schema_id = std::nullopt,
53+
std::vector<int32_t> identifier_field_ids = {});
5354

5455
/// \brief Get the schema ID.
5556
///
@@ -78,6 +79,12 @@ class ICEBERG_EXPORT Schema : public StructType {
7879
Result<std::optional<std::reference_wrapper<const SchemaField>>> FindFieldById(
7980
int32_t field_id) const;
8081

82+
/// \brief Returns the full column name for the given id.
83+
///
84+
/// \param field_id The id of the field to get the full name for.
85+
/// \return The full name of the field with the given id, or std::nullopt if not found.
86+
Result<std::optional<std::string>> FindColumnNameById(int32_t field_id) const;
87+
8188
/// \brief Get the accessor to access the field by field id.
8289
///
8390
/// \param field_id The id of the field to get the accessor for.
@@ -103,26 +110,46 @@ class ICEBERG_EXPORT Schema : public StructType {
103110
Result<std::unique_ptr<Schema>> Project(
104111
const std::unordered_set<int32_t>& field_ids) const;
105112

113+
std::vector<int32_t> IdentifierFieldIds() const;
114+
void SetIdentifierFieldIds(std::vector<int32_t> identifier_field_ids);
115+
Result<std::vector<std::string>> IdentifierFieldNames() const;
116+
106117
friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); }
107118

108119
private:
109120
/// \brief Compare two schemas for equality.
110121
bool Equals(const Schema& other) const;
111122

123+
struct NameIdMap {
124+
/// \brief Mapping from full field name to ID
125+
///
126+
/// \note Short names for maps and lists are included for any name that does not
127+
/// conflict with a canonical name. For example, a list, 'l', of structs with field
128+
/// 'x' will produce short name 'l.x' in addition to canonical name 'l.element.x'.
129+
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>> name_to_id;
130+
131+
/// \brief Mapping from field ID to full name
132+
///
133+
/// \note Canonical names, but not short names are set, for example
134+
/// 'list.element.field' instead of 'list.field'.
135+
std::unordered_map<int32_t, std::string> id_to_name;
136+
};
137+
112138
static Result<std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>>
113139
InitIdToFieldMap(const Schema&);
114-
static Result<std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>>
115-
InitNameToIdMap(const Schema&);
140+
static Result<NameIdMap> InitNameIdMap(const Schema&);
116141
static Result<std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>>
117142
InitLowerCaseNameToIdMap(const Schema&);
118143
static Result<std::unordered_map<int32_t, std::vector<size_t>>> InitIdToPositionPath(
119144
const Schema&);
120145

121146
const std::optional<int32_t> schema_id_;
147+
/// Field IDs that uniquely identify rows in the table.
148+
std::vector<int32_t> identifier_field_ids_;
122149
/// Mapping from field id to field.
123150
Lazy<InitIdToFieldMap> id_to_field_;
124151
/// Mapping from field name to field id.
125-
Lazy<InitNameToIdMap> name_to_id_;
152+
Lazy<InitNameIdMap> name_id_map_;
126153
/// Mapping from lowercased field name to field id
127154
Lazy<InitLowerCaseNameToIdMap> lowercase_name_to_id_;
128155
/// Mapping from field id to (nested) position path to access the field.

src/iceberg/test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ endfunction()
5454

5555
add_iceberg_test(schema_test
5656
SOURCES
57+
assign_id_visitor_test.cc
5758
name_mapping_test.cc
5859
partition_field_test.cc
5960
partition_spec_test.cc
Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include <memory>
21+
22+
#include <gmock/gmock.h>
23+
#include <gtest/gtest.h>
24+
25+
#include "iceberg/schema.h"
26+
#include "iceberg/schema_field.h"
27+
#include "iceberg/test/matchers.h"
28+
#include "iceberg/type.h"
29+
#include "iceberg/util/type_util.h"
30+
31+
namespace iceberg {
32+
33+
namespace {
34+
35+
Schema CreateFlatSchema() {
36+
return Schema({
37+
SchemaField::MakeRequired(/*field_id=*/10, "id", iceberg::int64()),
38+
SchemaField::MakeOptional(/*field_id=*/20, "name", iceberg::string()),
39+
SchemaField::MakeOptional(/*field_id=*/30, "age", iceberg::int32()),
40+
SchemaField::MakeRequired(/*field_id=*/40, "data", iceberg::float64()),
41+
});
42+
}
43+
44+
std::shared_ptr<Type> CreateListOfStruct() {
45+
return std::make_shared<ListType>(SchemaField::MakeOptional(
46+
/*field_id=*/101, "element",
47+
std::make_shared<StructType>(std::vector<SchemaField>{
48+
SchemaField::MakeOptional(/*field_id=*/102, "x", iceberg::int32()),
49+
SchemaField::MakeRequired(/*field_id=*/103, "y", iceberg::string()),
50+
})));
51+
}
52+
53+
std::shared_ptr<Type> CreateMapWithStructValue() {
54+
return std::make_shared<MapType>(
55+
SchemaField::MakeRequired(/*field_id=*/201, "key", iceberg::string()),
56+
SchemaField::MakeRequired(
57+
/*field_id=*/202, "value",
58+
std::make_shared<StructType>(std::vector<SchemaField>{
59+
SchemaField::MakeRequired(/*field_id=*/203, "id", iceberg::int64()),
60+
SchemaField::MakeOptional(/*field_id=*/204, "name", iceberg::string()),
61+
})));
62+
}
63+
64+
std::shared_ptr<Type> CreateNestedStruct() {
65+
return std::make_shared<StructType>(std::vector<SchemaField>{
66+
SchemaField::MakeRequired(/*field_id=*/301, "outer_id", iceberg::int64()),
67+
SchemaField::MakeRequired(
68+
/*field_id=*/302, "nested",
69+
std::make_shared<StructType>(std::vector<SchemaField>{
70+
SchemaField::MakeOptional(/*field_id=*/303, "inner_id", iceberg::int32()),
71+
SchemaField::MakeRequired(/*field_id=*/304, "inner_name",
72+
iceberg::string()),
73+
})),
74+
});
75+
}
76+
77+
Schema CreateNestedSchema(std::vector<int32_t> identifier_field_ids = {}) {
78+
return Schema(
79+
{
80+
SchemaField::MakeRequired(/*field_id=*/10, "id", iceberg::int64()),
81+
SchemaField::MakeOptional(/*field_id=*/20, "list", CreateListOfStruct()),
82+
SchemaField::MakeOptional(/*field_id=*/30, "map", CreateMapWithStructValue()),
83+
SchemaField::MakeRequired(/*field_id=*/40, "struct", CreateNestedStruct()),
84+
},
85+
Schema::kInitialSchemaId, std::move(identifier_field_ids));
86+
}
87+
88+
void ExpectStructEQ(const StructType& expect, const StructType& actual) {
89+
EXPECT_EQ(expect.fields().size(), actual.fields().size());
90+
for (int32_t i = 0; i < expect.fields().size(); ++i) {
91+
EXPECT_EQ(expect.fields()[i].field_id(), actual.fields()[i].field_id());
92+
EXPECT_EQ(expect.fields()[i].name(), actual.fields()[i].name());
93+
EXPECT_EQ(*expect.fields()[i].type(), *actual.fields()[i].type());
94+
EXPECT_EQ(expect.fields()[i].optional(), actual.fields()[i].optional());
95+
}
96+
}
97+
98+
} // namespace
99+
100+
TEST(AssignFreshIdVisitorTest, FlatSchema) {
101+
Schema schema = CreateFlatSchema();
102+
103+
std::atomic<int32_t> id = 0;
104+
auto next_id = [&id]() { return ++id; };
105+
ICEBERG_UNWRAP_OR_FAIL(auto fresh_schema,
106+
AssignFreshIds(Schema::kInitialSchemaId, schema, next_id));
107+
108+
ASSERT_EQ(fresh_schema->fields().size(), schema.fields().size());
109+
ExpectStructEQ(
110+
Schema({
111+
SchemaField::MakeRequired(/*field_id=*/1, "id", iceberg::int64()),
112+
SchemaField::MakeOptional(/*field_id=*/2, "name", iceberg::string()),
113+
SchemaField::MakeOptional(/*field_id=*/3, "age", iceberg::int32()),
114+
SchemaField::MakeRequired(/*field_id=*/4, "data", iceberg::float64()),
115+
}),
116+
*fresh_schema);
117+
}
118+
119+
TEST(AssignFreshIdVisitorTest, NestedSchema) {
120+
Schema schema = CreateNestedSchema();
121+
std::atomic<int32_t> id = 0;
122+
auto next_id = [&id]() { return ++id; };
123+
ICEBERG_UNWRAP_OR_FAIL(auto fresh_schema,
124+
AssignFreshIds(Schema::kInitialSchemaId, schema, next_id));
125+
126+
ASSERT_EQ(4, fresh_schema->fields().size());
127+
for (int32_t i = 0; i < fresh_schema->fields().size(); ++i) {
128+
EXPECT_EQ(i + 1, fresh_schema->fields()[i].field_id());
129+
}
130+
131+
auto list_field = fresh_schema->fields()[1];
132+
auto list_type = std::dynamic_pointer_cast<ListType>(list_field.type());
133+
ASSERT_TRUE(list_type);
134+
auto list_element_field = list_type->fields()[0];
135+
EXPECT_EQ(5, list_element_field.field_id());
136+
auto list_element_type =
137+
std::dynamic_pointer_cast<StructType>(list_element_field.type());
138+
ASSERT_TRUE(list_element_type);
139+
ExpectStructEQ(StructType(std::vector<SchemaField>{
140+
SchemaField::MakeOptional(/*field_id=*/6, "x", iceberg::int32()),
141+
SchemaField::MakeRequired(/*field_id=*/7, "y", iceberg::string()),
142+
}),
143+
*list_element_type);
144+
145+
auto map_field = fresh_schema->fields()[2];
146+
auto map_type = std::dynamic_pointer_cast<MapType>(map_field.type());
147+
ASSERT_TRUE(map_type);
148+
EXPECT_EQ(8, map_type->fields()[0].field_id());
149+
auto map_value_field = map_type->fields()[1];
150+
EXPECT_EQ(9, map_value_field.field_id());
151+
auto map_value_type = std::dynamic_pointer_cast<StructType>(map_value_field.type());
152+
ASSERT_TRUE(map_value_type);
153+
ExpectStructEQ(
154+
StructType(std::vector<SchemaField>{
155+
SchemaField::MakeRequired(/*field_id=*/10, "id", iceberg::int64()),
156+
SchemaField::MakeOptional(/*field_id=*/11, "name", iceberg::string()),
157+
}),
158+
*map_value_type);
159+
160+
auto struct_field = fresh_schema->fields()[3];
161+
auto struct_type = std::dynamic_pointer_cast<StructType>(struct_field.type());
162+
ASSERT_TRUE(struct_type);
163+
164+
auto expect_nested_struct_type = std::make_shared<StructType>(std::vector<SchemaField>{
165+
SchemaField::MakeOptional(/*field_id=*/14, "inner_id", iceberg::int32()),
166+
SchemaField::MakeRequired(/*field_id=*/15, "inner_name", iceberg::string()),
167+
});
168+
ExpectStructEQ(
169+
StructType(std::vector<SchemaField>{
170+
SchemaField::MakeRequired(/*field_id=*/12, "outer_id", iceberg::int64()),
171+
SchemaField::MakeRequired(
172+
/*field_id=*/13, "nested", expect_nested_struct_type)}),
173+
*struct_type);
174+
175+
auto nested_struct_field = struct_type->fields()[1];
176+
auto nested_struct_type =
177+
std::dynamic_pointer_cast<StructType>(nested_struct_field.type());
178+
ASSERT_TRUE(nested_struct_type);
179+
ExpectStructEQ(*expect_nested_struct_type, *nested_struct_type);
180+
}
181+
182+
TEST(AssignFreshIdVisitorTest, RefreshIdentifierId) {
183+
std::atomic<int32_t> id = 0;
184+
auto next_id = [&id]() { return ++id; };
185+
186+
Schema invalid_schema = CreateNestedSchema({10, 400});
187+
// Invalid identified field id
188+
auto result = AssignFreshIds(Schema::kInitialSchemaId, invalid_schema, next_id);
189+
EXPECT_THAT(result, IsError(ErrorKind::kInvalidSchema));
190+
EXPECT_THAT(result, HasErrorMessage("Can not find"));
191+
192+
id = 0;
193+
Schema schema = CreateNestedSchema({10, 301});
194+
ICEBERG_UNWRAP_OR_FAIL(auto fresh_schema,
195+
AssignFreshIds(Schema::kInitialSchemaId, schema, next_id));
196+
EXPECT_THAT(fresh_schema->IdentifierFieldIds(), testing::ElementsAre(1, 12));
197+
}
198+
199+
} // namespace iceberg

src/iceberg/test/meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ configure_file(
3030
iceberg_tests = {
3131
'schema_test': {
3232
'sources': files(
33+
'assign_id_visitor_test.cc',
3334
'name_mapping_test.cc',
3435
'partition_field_test.cc',
3536
'partition_spec_test.cc',

0 commit comments

Comments
 (0)