-
Notifications
You must be signed in to change notification settings - Fork 76
Normalize write schema casing to table casing at catalog layer (Make OH reads/writes case-insensitive) #558
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
cbb330
merged 4 commits into
linkedin:main
from
pandaamit91:ampanda/oh-case-insensitive-schema-writes
May 2, 2026
Merged
Changes from all commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
76e0f08
feat: normalize write schema casing to table casing at catalog layer
pandaamit91 609aaf0
refactor: extend casing normalization to nested fields via TypeUtil.v…
pandaamit91 197983c
refactor: promote SchemaValidationUtil upstream from li-openhouse
pandaamit91 910bd70
test: add missing write-path coverage for case-insensitive normalization
pandaamit91 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
95 changes: 95 additions & 0 deletions
95
...les/src/main/java/com/linkedin/openhouse/tables/repository/impl/SchemaValidationUtil.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,95 @@ | ||
| package com.linkedin.openhouse.tables.repository.impl; | ||
|
|
||
| import java.util.ArrayList; | ||
| import java.util.HashMap; | ||
| import java.util.List; | ||
| import java.util.Locale; | ||
| import java.util.Map; | ||
| import org.apache.iceberg.Schema; | ||
| import org.apache.iceberg.types.Type; | ||
| import org.apache.iceberg.types.Types; | ||
|
|
||
| /** | ||
| * Utility for validating Iceberg schemas with respect to case-insensitive column naming. | ||
| * | ||
| * <p>Promoted from li-openhouse's {@code SchemaValidationUtil} to the upstream open-source repo so | ||
| * that both sides share a single implementation. The two callers want opposite outcomes from the | ||
| * same predicate: | ||
| * | ||
| * <ul> | ||
| * <li><b>li-openhouse write rejection</b>: if duplicates exist → reject the write. | ||
| * <li><b>{@link BaseIcebergSchemaValidator} normalization guard</b>: if duplicates exist → skip | ||
| * normalization (write may still succeed for exact-casing). | ||
| * </ul> | ||
| * | ||
| * <p>Coverage: struct fields at any nesting depth, list element types, and map key/value types. | ||
| * Fields with the same name in <em>different</em> structs (e.g. {@code user.id} and {@code | ||
| * session.id}) are correctly treated as independent — only <em>siblings</em> within the same struct | ||
| * are compared. | ||
| */ | ||
| public final class SchemaValidationUtil { | ||
|
|
||
| private SchemaValidationUtil() {} | ||
|
|
||
| /** | ||
| * Finds all sibling-field pairs at any nesting depth whose names are equal case-insensitively but | ||
| * not case-sensitively. | ||
| * | ||
| * @param schema the Iceberg schema to validate | ||
| * @return list of conflict descriptions, e.g. {@code "[userid, userId]"} or {@code "[col1.userid, | ||
| * col1.userId]"}; empty if no conflicts | ||
| */ | ||
| public static List<String> findDuplicateCaseInsensitiveColumnNames(Schema schema) { | ||
| List<String> conflicts = new ArrayList<>(); | ||
| checkFieldsForDuplicates(schema.columns(), "", conflicts); | ||
| return conflicts; | ||
| } | ||
|
|
||
| /** | ||
| * Returns {@code true} if the schema has any case-insensitive duplicate field names at any | ||
| * nesting depth. | ||
| */ | ||
| public static boolean hasDuplicateCaseInsensitiveColumnNames(Schema schema) { | ||
| return !findDuplicateCaseInsensitiveColumnNames(schema).isEmpty(); | ||
| } | ||
|
|
||
| /** | ||
| * Checks {@code fields} for sibling-level case-insensitive duplicates, records any conflicts, and | ||
| * recurses into each field's type. | ||
| */ | ||
| private static void checkFieldsForDuplicates( | ||
| List<Types.NestedField> fields, String pathPrefix, List<String> conflicts) { | ||
| Map<String, String> seenLowerToName = new HashMap<>(); | ||
| for (Types.NestedField field : fields) { | ||
| String name = field.name(); | ||
| String lower = name.toLowerCase(Locale.ROOT); | ||
| if (seenLowerToName.containsKey(lower)) { | ||
| String first = seenLowerToName.get(lower); | ||
| if (!first.equals(name)) { | ||
| String qualifier = pathPrefix.isEmpty() ? "" : pathPrefix + "."; | ||
| conflicts.add(String.format("[%s%s, %s%s]", qualifier, first, qualifier, name)); | ||
| } | ||
| } else { | ||
| seenLowerToName.put(lower, name); | ||
| } | ||
| } | ||
| for (Types.NestedField field : fields) { | ||
| String childPath = pathPrefix.isEmpty() ? field.name() : pathPrefix + "." + field.name(); | ||
| checkTypeForDuplicates(field.type(), childPath, conflicts); | ||
| } | ||
| } | ||
|
|
||
| /** Recurses into compound types (struct, list element, map key/value). Primitives are a no-op. */ | ||
| private static void checkTypeForDuplicates(Type type, String path, List<String> conflicts) { | ||
| if (type instanceof Types.StructType) { | ||
| checkFieldsForDuplicates(((Types.StructType) type).fields(), path, conflicts); | ||
| } else if (type instanceof Types.ListType) { | ||
| checkTypeForDuplicates(((Types.ListType) type).elementType(), path, conflicts); | ||
| } else if (type instanceof Types.MapType) { | ||
| Types.MapType mapType = (Types.MapType) type; | ||
| checkTypeForDuplicates(mapType.keyType(), path, conflicts); | ||
| checkTypeForDuplicates(mapType.valueType(), path, conflicts); | ||
| } | ||
| // Primitive types have no nested fields; nothing to recurse into. | ||
| } | ||
| } |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.