From 25319724b04622d6f00c5f73d2dfca5b20ac47c7 Mon Sep 17 00:00:00 2001 From: Uttam Kumar Date: Thu, 26 Feb 2026 21:09:03 +0000 Subject: [PATCH] Add member ID 32-bit int type validation for Iceberg table schemas MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds validateMemberIdColumnTypes() to OpenHouseTablesApiValidator that rejects Iceberg table schemas using INTEGER type for member identity columns (memberId, actorId, profileId, customerId, sourceId, destId, etc.). Member IDs will overflow 32-bit int — all must use LONG. Integrated into both validateCreateTable() and validateUpdateTable(). See go/project-2b for details. --- .../impl/OpenHouseTablesApiValidator.java | 63 +++++++ .../tables/mock/api/TablesValidatorTest.java | 162 ++++++++++++++++++ 2 files changed, 225 insertions(+) diff --git a/services/tables/src/main/java/com/linkedin/openhouse/tables/api/validator/impl/OpenHouseTablesApiValidator.java b/services/tables/src/main/java/com/linkedin/openhouse/tables/api/validator/impl/OpenHouseTablesApiValidator.java index 3614e9c62..bba903340 100644 --- a/services/tables/src/main/java/com/linkedin/openhouse/tables/api/validator/impl/OpenHouseTablesApiValidator.java +++ b/services/tables/src/main/java/com/linkedin/openhouse/tables/api/validator/impl/OpenHouseTablesApiValidator.java @@ -14,8 +14,11 @@ import com.linkedin.openhouse.tables.common.TableType; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.UUID; import javax.validation.ConstraintViolation; import javax.validation.Validator; @@ -29,6 +32,25 @@ @Component public class OpenHouseTablesApiValidator implements TablesApiValidator { + private static final Set MEMBER_ID_CONTAINS_PATTERNS = + Collections.unmodifiableSet( + new HashSet<>( + Arrays.asList( + "actorid", + "customerid", + "destid", + "entityid", + "memberid", + "profileid", + "recipientid", + "recommenderid", + "reporterid", + "senderid", + "sourceid", + "viewerid"))); + + private static final Set MEMBER_ID_EXACT_PATTERNS = Collections.singleton("mid"); + @Autowired private Validator validator; @Autowired private RetentionPolicySpecValidator retentionPolicySpecValidator; @@ -106,6 +128,9 @@ && getSchemaFromSchemaJson(createUpdateTableRequestBody.getSchema()).columns().i validationFailures.addAll(validateUUIDForReplicaTable(createUpdateTableRequestBody)); validationFailures.addAll( validateUpdateTimestampForReplicatedTable(createUpdateTableRequestBody)); + if (createUpdateTableRequestBody.getSchema() != null) { + validateMemberIdColumnTypes(createUpdateTableRequestBody.getSchema(), validationFailures); + } if (!validationFailures.isEmpty()) { throw new RequestValidationFailureException(validationFailures); @@ -257,6 +282,9 @@ && getSchemaFromSchemaJson(createUpdateTableRequestBody.getSchema()).columns().i createUpdateTableRequestBody.getSortOrder(), createUpdateTableRequestBody.getSchema(), validationFailures); + if (createUpdateTableRequestBody.getSchema() != null) { + validateMemberIdColumnTypes(createUpdateTableRequestBody.getSchema(), validationFailures); + } if (!validationFailures.isEmpty()) { throw new RequestValidationFailureException(validationFailures); } @@ -404,4 +432,39 @@ private void validateSortOrder(String sortOrder, String schema, List val } } } + + /** + * Validates that Iceberg table schemas do not use INTEGER type for member identity columns. + * Member IDs will exceed 32-bit int max — all must use LONG. See go/project-2b. + */ + private void validateMemberIdColumnTypes(String schemaJson, List validationFailures) { + try { + Schema icebergSchema = getSchemaFromSchemaJson(schemaJson); + for (org.apache.iceberg.types.Types.NestedField column : icebergSchema.columns()) { + if (column.type().typeId() == org.apache.iceberg.types.Type.TypeID.INTEGER) { + String normalized = column.name().toLowerCase().replace("_", ""); + if (MEMBER_ID_EXACT_PATTERNS.contains(normalized)) { + validationFailures.add( + String.format( + "schema : column '%s' uses INTEGER type for a member identity field. " + + "Use LONG instead to avoid overflow. See go/project-2b for details.", + column.name())); + } else { + for (String pattern : MEMBER_ID_CONTAINS_PATTERNS) { + if (normalized.contains(pattern)) { + validationFailures.add( + String.format( + "schema : column '%s' uses INTEGER type for a member identity field. " + + "Use LONG instead to avoid overflow. See go/project-2b for details.", + column.name())); + break; + } + } + } + } + } + } catch (Exception e) { + // Schema parsing is already validated elsewhere; skip member ID check if unparseable + } + } } diff --git a/services/tables/src/test/java/com/linkedin/openhouse/tables/mock/api/TablesValidatorTest.java b/services/tables/src/test/java/com/linkedin/openhouse/tables/mock/api/TablesValidatorTest.java index 2aaf2d69b..1da7dffff 100644 --- a/services/tables/src/test/java/com/linkedin/openhouse/tables/mock/api/TablesValidatorTest.java +++ b/services/tables/src/test/java/com/linkedin/openhouse/tables/mock/api/TablesValidatorTest.java @@ -1345,4 +1345,166 @@ public void validateCreateTableWithPrimaryTableType() { .tableType(TableType.PRIMARY_TABLE) .build())); } + + @Test + public void validateCreateTable_memberIdLongColumnSucceeds() { + String schemaJson = + "{\"type\":\"struct\",\"fields\":[" + + "{\"id\":1,\"required\":true,\"name\":\"memberId\",\"type\":\"long\"}," + + "{\"id\":2,\"required\":true,\"name\":\"name\",\"type\":\"string\"}" + + "]}"; + assertDoesNotThrow( + () -> + tablesApiValidator.validateCreateTable( + "c", + "d", + CreateUpdateTableRequestBody.builder() + .databaseId("d") + .tableId("t") + .clusterId("c") + .schema(schemaJson) + .tableProperties(ImmutableMap.of()) + .baseTableVersion("base") + .build())); + } + + @Test + public void validateCreateTable_memberIdIntColumnFails() { + String schemaJson = + "{\"type\":\"struct\",\"fields\":[" + + "{\"id\":1,\"required\":true,\"name\":\"memberId\",\"type\":\"int\"}," + + "{\"id\":2,\"required\":true,\"name\":\"name\",\"type\":\"string\"}" + + "]}"; + assertThrows( + RequestValidationFailureException.class, + () -> + tablesApiValidator.validateCreateTable( + "c", + "d", + CreateUpdateTableRequestBody.builder() + .databaseId("d") + .tableId("t") + .clusterId("c") + .schema(schemaJson) + .tableProperties(ImmutableMap.of()) + .baseTableVersion("base") + .build())); + } + + @Test + public void validateCreateTable_nonMemberIdIntColumnSucceeds() { + String schemaJson = + "{\"type\":\"struct\",\"fields\":[" + + "{\"id\":1,\"required\":true,\"name\":\"retryCount\",\"type\":\"int\"}," + + "{\"id\":2,\"required\":true,\"name\":\"name\",\"type\":\"string\"}" + + "]}"; + assertDoesNotThrow( + () -> + tablesApiValidator.validateCreateTable( + "c", + "d", + CreateUpdateTableRequestBody.builder() + .databaseId("d") + .tableId("t") + .clusterId("c") + .schema(schemaJson) + .tableProperties(ImmutableMap.of()) + .baseTableVersion("base") + .build())); + } + + @Test + public void validateCreateTable_memberIdNewPatternsFail() { + String schemaJson = + "{\"type\":\"struct\",\"fields\":[" + + "{\"id\":1,\"required\":true,\"name\":\"customerId\",\"type\":\"int\"}," + + "{\"id\":2,\"required\":true,\"name\":\"sourceId\",\"type\":\"int\"}," + + "{\"id\":3,\"required\":true,\"name\":\"destId\",\"type\":\"int\"}," + + "{\"id\":4,\"required\":true,\"name\":\"name\",\"type\":\"string\"}" + + "]}"; + assertThrows( + RequestValidationFailureException.class, + () -> + tablesApiValidator.validateCreateTable( + "c", + "d", + CreateUpdateTableRequestBody.builder() + .databaseId("d") + .tableId("t") + .clusterId("c") + .schema(schemaJson) + .tableProperties(ImmutableMap.of()) + .baseTableVersion("base") + .build())); + } + + @Test + public void validateCreateTable_exactMatchMidFails() { + String schemaJson = + "{\"type\":\"struct\",\"fields\":[" + + "{\"id\":1,\"required\":true,\"name\":\"mid\",\"type\":\"int\"}," + + "{\"id\":2,\"required\":true,\"name\":\"name\",\"type\":\"string\"}" + + "]}"; + assertThrows( + RequestValidationFailureException.class, + () -> + tablesApiValidator.validateCreateTable( + "c", + "d", + CreateUpdateTableRequestBody.builder() + .databaseId("d") + .tableId("t") + .clusterId("c") + .schema(schemaJson) + .tableProperties(ImmutableMap.of()) + .baseTableVersion("base") + .build())); + } + + @Test + public void validateCreateTable_underscoreMemberIdFails() { + String schemaJson = + "{\"type\":\"struct\",\"fields\":[" + + "{\"id\":1,\"required\":true,\"name\":\"member_id\",\"type\":\"int\"}," + + "{\"id\":2,\"required\":true,\"name\":\"name\",\"type\":\"string\"}" + + "]}"; + assertThrows( + RequestValidationFailureException.class, + () -> + tablesApiValidator.validateCreateTable( + "c", + "d", + CreateUpdateTableRequestBody.builder() + .databaseId("d") + .tableId("t") + .clusterId("c") + .schema(schemaJson) + .tableProperties(ImmutableMap.of()) + .baseTableVersion("base") + .build())); + } + + @Test + public void validateUpdateTable_memberIdIntColumnFails() { + String schemaJson = + "{\"type\":\"struct\",\"fields\":[" + + "{\"id\":1,\"required\":true,\"name\":\"memberId\",\"type\":\"int\"}," + + "{\"id\":2,\"required\":true,\"name\":\"name\",\"type\":\"string\"}" + + "]}"; + assertThrows( + RequestValidationFailureException.class, + () -> + tablesApiValidator.validateUpdateTable( + "c", + "d", + "t", + CreateUpdateTableRequestBody.builder() + .databaseId("d") + .tableId("t") + .clusterId("c") + .schema(schemaJson) + .tableProperties(ImmutableMap.of()) + .baseTableVersion("base") + .build())); + } }