From 96554b71db41c03d8a74b79790c7d5bbbd30a4a7 Mon Sep 17 00:00:00 2001 From: Andrew Pikler Date: Thu, 4 Dec 2025 12:29:39 +0200 Subject: [PATCH 1/6] Add stub methods for SchemaElement conversion refactor scope #1835 --- .../schema/ParquetSchemaConverter.java | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 parquet-column/src/main/java/org/apache/parquet/schema/ParquetSchemaConverter.java diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/ParquetSchemaConverter.java b/parquet-column/src/main/java/org/apache/parquet/schema/ParquetSchemaConverter.java new file mode 100644 index 0000000000..0487d2ba05 --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/schema/ParquetSchemaConverter.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.schema; + +import org.apache.parquet.format.ColumnOrder; +import org.apache.parquet.format.SchemaElement; +import java.util.List; + +/** + * This class converts between the thrift based {@link org.apache.parquet.format.SchemaElement} class and the + * parquet-column {@link org.apache.parquet.column.ColumnDescriptor} classes. + */ +public class ParquetSchemaConverter +{ + public ParquetSchemaConverter() { + } + + /** + * Parse parquet-column MessageType and write the {@link ColumnOrder} objects for it. + * + * @param schema the {@link MessageType} schema + * @return the ordering defined for each of the columns + */ + public List getColumnOrders(MessageType schema) { + // todo + throw new UnsupportedOperationException(); + } + + /** + * Parse parquet-column MessageType and write the {@link SchemaElement} objects for it. + * + * @param schema the {@link MessageType} schema + * @return the {@link SchemaElement} objects for this parquet-column schema + */ + public List toParquetSchema(MessageType schema) { + // todo. + throw new UnsupportedOperationException(); + } + + /** + * Create parquet-column MessageType from {@link SchemaElement} and {@link ColumnOrder} thrift objects + * + * @param schema the {@link MessageType} schema + * @return the ordering defined for each of the columns + */ + public MessageType fromParquetSchema(List schema, List columnOrders) { + // todo + throw new UnsupportedOperationException(); + } +} From e5092bce18c47cce7d05c112864af8cc50302052 Mon Sep 17 00:00:00 2001 From: Andrew Pikler Date: Thu, 4 Dec 2025 12:49:43 +0200 Subject: [PATCH 2/6] move SchemaElement converter code to parquet-column project --- parquet-column/pom.xml | 5 + .../schema/ParquetSchemaConverter.java | 642 +++++++++++++++++- .../converter/ParquetMetadataConverter.java | 638 +---------------- 3 files changed, 639 insertions(+), 646 deletions(-) diff --git a/parquet-column/pom.xml b/parquet-column/pom.xml index 8bd11dcc26..64ddf6861b 100644 --- a/parquet-column/pom.xml +++ b/parquet-column/pom.xml @@ -48,6 +48,11 @@ test-jar test + + org.apache.parquet + parquet-format-structures + ${project.version} + org.apache.parquet parquet-encoding diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/ParquetSchemaConverter.java b/parquet-column/src/main/java/org/apache/parquet/schema/ParquetSchemaConverter.java index 0487d2ba05..bd1404893e 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/ParquetSchemaConverter.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/ParquetSchemaConverter.java @@ -18,17 +18,345 @@ */ package org.apache.parquet.schema; +import static java.util.Optional.empty; +import static java.util.Optional.of; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Optional; import org.apache.parquet.format.ColumnOrder; +import org.apache.parquet.format.ConvertedType; +import org.apache.parquet.format.DecimalType; +import org.apache.parquet.format.EdgeInterpolationAlgorithm; +import org.apache.parquet.format.FieldRepetitionType; +import org.apache.parquet.format.GeographyType; +import org.apache.parquet.format.GeometryType; +import org.apache.parquet.format.IntType; +import org.apache.parquet.format.LogicalType; +import org.apache.parquet.format.LogicalTypes; +import org.apache.parquet.format.MicroSeconds; +import org.apache.parquet.format.MilliSeconds; +import org.apache.parquet.format.NanoSeconds; import org.apache.parquet.format.SchemaElement; -import java.util.List; +import org.apache.parquet.format.TimeType; +import org.apache.parquet.format.TimeUnit; +import org.apache.parquet.format.TimestampType; +import org.apache.parquet.format.Type; +import org.apache.parquet.format.TypeDefinedOrder; +import org.apache.parquet.format.VariantType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * This class converts between the thrift based {@link org.apache.parquet.format.SchemaElement} class and the * parquet-column {@link org.apache.parquet.column.ColumnDescriptor} classes. */ -public class ParquetSchemaConverter -{ - public ParquetSchemaConverter() { +public class ParquetSchemaConverter { + private static final Logger LOG = LoggerFactory.getLogger(ParquetSchemaConverter.class); + private static final TypeDefinedOrder TYPE_DEFINED_ORDER = new TypeDefinedOrder(); + private static final LogicalTypeConverterVisitor LOGICAL_TYPE_ANNOTATION_VISITOR = + new LogicalTypeConverterVisitor(); + private static final ConvertedTypeConverterVisitor CONVERTED_TYPE_CONVERTER_VISITOR = + new ConvertedTypeConverterVisitor(); + + public ParquetSchemaConverter() {} + + /** + * Create parquet-column MessageType from {@link SchemaElement} and {@link ColumnOrder} thrift objects + * + * @param schema the {@link MessageType} schema + * @return the ordering defined for each of the columns + */ + public MessageType fromParquetSchema(List schema, List columnOrders) { + Iterator iterator = schema.iterator(); + SchemaElement root = iterator.next(); + Types.MessageTypeBuilder builder = Types.buildMessage(); + if (root.isSetField_id()) { + builder.id(root.field_id); + } + buildChildren(builder, iterator, root.getNum_children(), columnOrders, 0); + return builder.named(root.name); + } + + private void buildChildren( + Types.GroupBuilder builder, + Iterator schema, + int childrenCount, + List columnOrders, + int columnCount) { + for (int i = 0; i < childrenCount; i++) { + SchemaElement schemaElement = schema.next(); + + // Create Parquet Type. + Types.Builder childBuilder; + if (schemaElement.type != null) { + Types.PrimitiveBuilder primitiveBuilder = builder.primitive( + getPrimitive(schemaElement.type), fromParquetRepetition(schemaElement.repetition_type)); + if (schemaElement.isSetType_length()) { + primitiveBuilder.length(schemaElement.type_length); + } + if (schemaElement.isSetPrecision()) { + primitiveBuilder.precision(schemaElement.precision); + } + if (schemaElement.isSetScale()) { + primitiveBuilder.scale(schemaElement.scale); + } + if (columnOrders != null) { + org.apache.parquet.schema.ColumnOrder columnOrder = + fromParquetColumnOrder(columnOrders.get(columnCount)); + // As per parquet format 2.4.0 no UNDEFINED order is supported. So, set undefined column order for + // the types + // where ordering is not supported. + if (columnOrder.getColumnOrderName() + == org.apache.parquet.schema.ColumnOrder.ColumnOrderName.TYPE_DEFINED_ORDER + && (schemaElement.type == Type.INT96 + || schemaElement.converted_type == ConvertedType.INTERVAL)) { + columnOrder = org.apache.parquet.schema.ColumnOrder.undefined(); + } + primitiveBuilder.columnOrder(columnOrder); + } + childBuilder = primitiveBuilder; + } else { + childBuilder = builder.group(fromParquetRepetition(schemaElement.repetition_type)); + buildChildren( + (Types.GroupBuilder) childBuilder, + schema, + schemaElement.num_children, + columnOrders, + columnCount); + } + + if (schemaElement.isSetLogicalType()) { + childBuilder.as(getLogicalTypeAnnotation(schemaElement.logicalType)); + } + if (schemaElement.isSetConverted_type()) { + OriginalType originalType = getLogicalTypeAnnotation(schemaElement.converted_type, schemaElement) + .toOriginalType(); + OriginalType newOriginalType = (schemaElement.isSetLogicalType() + && getLogicalTypeAnnotation(schemaElement.logicalType) != null) + ? getLogicalTypeAnnotation(schemaElement.logicalType).toOriginalType() + : null; + if (!originalType.equals(newOriginalType)) { + if (newOriginalType != null) { + LOG.warn( + "Converted type and logical type metadata mismatch (convertedType: {}, logical type: {}). Using value in converted type.", + schemaElement.converted_type, + schemaElement.logicalType); + } + childBuilder.as(originalType); + } + } + if (schemaElement.isSetField_id()) { + childBuilder.id(schemaElement.field_id); + } + + childBuilder.named(schemaElement.name); + ++columnCount; + } + } + + // Visible for testing + FieldRepetitionType toParquetRepetition(org.apache.parquet.schema.Type.Repetition repetition) { + return FieldRepetitionType.valueOf(repetition.name()); + } + + // Visible for testing + org.apache.parquet.schema.Type.Repetition fromParquetRepetition(FieldRepetitionType repetition) { + return org.apache.parquet.schema.Type.Repetition.valueOf(repetition.name()); + } + + private static org.apache.parquet.schema.ColumnOrder fromParquetColumnOrder(ColumnOrder columnOrder) { + if (columnOrder.isSetTYPE_ORDER()) { + return org.apache.parquet.schema.ColumnOrder.typeDefined(); + } + // The column order is not yet supported by this API + return org.apache.parquet.schema.ColumnOrder.undefined(); + } + + public PrimitiveType.PrimitiveTypeName getPrimitive(Type type) { + switch (type) { + case BYTE_ARRAY: // TODO: rename BINARY and remove this switch + return PrimitiveType.PrimitiveTypeName.BINARY; + case INT64: + return PrimitiveType.PrimitiveTypeName.INT64; + case INT32: + return PrimitiveType.PrimitiveTypeName.INT32; + case BOOLEAN: + return PrimitiveType.PrimitiveTypeName.BOOLEAN; + case FLOAT: + return PrimitiveType.PrimitiveTypeName.FLOAT; + case DOUBLE: + return PrimitiveType.PrimitiveTypeName.DOUBLE; + case INT96: + return PrimitiveType.PrimitiveTypeName.INT96; + case FIXED_LEN_BYTE_ARRAY: + return PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; + default: + throw new RuntimeException("Unknown type " + type); + } + } + + public Type getType(PrimitiveType.PrimitiveTypeName type) { + switch (type) { + case INT64: + return Type.INT64; + case INT32: + return Type.INT32; + case BOOLEAN: + return Type.BOOLEAN; + case BINARY: + return Type.BYTE_ARRAY; + case FLOAT: + return Type.FLOAT; + case DOUBLE: + return Type.DOUBLE; + case INT96: + return Type.INT96; + case FIXED_LEN_BYTE_ARRAY: + return Type.FIXED_LEN_BYTE_ARRAY; + default: + throw new RuntimeException("Unknown primitive type " + type); + } + } + + // Visible for testing + LogicalTypeAnnotation getLogicalTypeAnnotation(ConvertedType type, SchemaElement schemaElement) { + switch (type) { + case UTF8: + return LogicalTypeAnnotation.stringType(); + case MAP: + return LogicalTypeAnnotation.mapType(); + case MAP_KEY_VALUE: + return LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance(); + case LIST: + return LogicalTypeAnnotation.listType(); + case ENUM: + return LogicalTypeAnnotation.enumType(); + case DECIMAL: + int scale = (schemaElement == null ? 0 : schemaElement.scale); + int precision = (schemaElement == null ? 0 : schemaElement.precision); + return LogicalTypeAnnotation.decimalType(scale, precision); + case DATE: + return LogicalTypeAnnotation.dateType(); + case TIME_MILLIS: + return LogicalTypeAnnotation.timeType(true, LogicalTypeAnnotation.TimeUnit.MILLIS); + case TIME_MICROS: + return LogicalTypeAnnotation.timeType(true, LogicalTypeAnnotation.TimeUnit.MICROS); + case TIMESTAMP_MILLIS: + return LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MILLIS); + case TIMESTAMP_MICROS: + return LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MICROS); + case INTERVAL: + return LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance(); + case INT_8: + return LogicalTypeAnnotation.intType(8, true); + case INT_16: + return LogicalTypeAnnotation.intType(16, true); + case INT_32: + return LogicalTypeAnnotation.intType(32, true); + case INT_64: + return LogicalTypeAnnotation.intType(64, true); + case UINT_8: + return LogicalTypeAnnotation.intType(8, false); + case UINT_16: + return LogicalTypeAnnotation.intType(16, false); + case UINT_32: + return LogicalTypeAnnotation.intType(32, false); + case UINT_64: + return LogicalTypeAnnotation.intType(64, false); + case JSON: + return LogicalTypeAnnotation.jsonType(); + case BSON: + return LogicalTypeAnnotation.bsonType(); + default: + throw new RuntimeException( + "Can't convert converted type to logical type, unknown converted type " + type); + } + } + + LogicalTypeAnnotation getLogicalTypeAnnotation(LogicalType type) { + switch (type.getSetField()) { + case MAP: + return LogicalTypeAnnotation.mapType(); + case BSON: + return LogicalTypeAnnotation.bsonType(); + case DATE: + return LogicalTypeAnnotation.dateType(); + case ENUM: + return LogicalTypeAnnotation.enumType(); + case JSON: + return LogicalTypeAnnotation.jsonType(); + case LIST: + return LogicalTypeAnnotation.listType(); + case TIME: + TimeType time = type.getTIME(); + return LogicalTypeAnnotation.timeType(time.isAdjustedToUTC, convertTimeUnit(time.unit)); + case STRING: + return LogicalTypeAnnotation.stringType(); + case DECIMAL: + DecimalType decimal = type.getDECIMAL(); + return LogicalTypeAnnotation.decimalType(decimal.scale, decimal.precision); + case INTEGER: + IntType integer = type.getINTEGER(); + return LogicalTypeAnnotation.intType(integer.bitWidth, integer.isSigned); + case UNKNOWN: + return LogicalTypeAnnotation.unknownType(); + case TIMESTAMP: + TimestampType timestamp = type.getTIMESTAMP(); + return LogicalTypeAnnotation.timestampType(timestamp.isAdjustedToUTC, convertTimeUnit(timestamp.unit)); + case UUID: + return LogicalTypeAnnotation.uuidType(); + case FLOAT16: + return LogicalTypeAnnotation.float16Type(); + case GEOMETRY: + GeometryType geometry = type.getGEOMETRY(); + return LogicalTypeAnnotation.geometryType(geometry.getCrs()); + case GEOGRAPHY: + GeographyType geography = type.getGEOGRAPHY(); + return LogicalTypeAnnotation.geographyType( + geography.getCrs(), toParquetEdgeInterpolationAlgorithm(geography.getAlgorithm())); + case VARIANT: + VariantType variant = type.getVARIANT(); + return LogicalTypeAnnotation.variantType(variant.getSpecification_version()); + default: + throw new RuntimeException("Unknown logical type " + type); + } + } + + private LogicalTypeAnnotation.TimeUnit convertTimeUnit(TimeUnit unit) { + switch (unit.getSetField()) { + case MICROS: + return LogicalTypeAnnotation.TimeUnit.MICROS; + case MILLIS: + return LogicalTypeAnnotation.TimeUnit.MILLIS; + case NANOS: + return LogicalTypeAnnotation.TimeUnit.NANOS; + default: + throw new RuntimeException("Unknown time unit " + unit); + } + } + + /** Convert Parquet Algorithm enum to Thrift Algorithm enum */ + public static EdgeInterpolationAlgorithm fromParquetEdgeInterpolationAlgorithm( + org.apache.parquet.column.schema.EdgeInterpolationAlgorithm parquetAlgo) { + if (parquetAlgo == null) { + return null; + } + EdgeInterpolationAlgorithm thriftAlgo = EdgeInterpolationAlgorithm.findByValue(parquetAlgo.getValue()); + if (thriftAlgo == null) { + throw new IllegalArgumentException("Unrecognized Parquet EdgeInterpolationAlgorithm: " + parquetAlgo); + } + return thriftAlgo; + } + + /** Convert Thrift Algorithm enum to Parquet Algorithm enum */ + public static org.apache.parquet.column.schema.EdgeInterpolationAlgorithm toParquetEdgeInterpolationAlgorithm( + EdgeInterpolationAlgorithm thriftAlgo) { + if (thriftAlgo == null) { + return null; + } + return org.apache.parquet.column.schema.EdgeInterpolationAlgorithm.findByValue(thriftAlgo.getValue()); } /** @@ -38,8 +366,15 @@ public ParquetSchemaConverter() { * @return the ordering defined for each of the columns */ public List getColumnOrders(MessageType schema) { - // todo - throw new UnsupportedOperationException(); + List columnOrders = new ArrayList<>(); + // Currently, only TypeDefinedOrder is supported, so we create a column order for each columns with + // TypeDefinedOrder even if some types (e.g. INT96) have undefined column orders. + for (int i = 0, n = schema.getPaths().size(); i < n; ++i) { + ColumnOrder columnOrder = new ColumnOrder(); + columnOrder.setTYPE_ORDER(TYPE_DEFINED_ORDER); + columnOrders.add(columnOrder); + } + return columnOrders; } /** @@ -49,18 +384,291 @@ public List getColumnOrders(MessageType schema) { * @return the {@link SchemaElement} objects for this parquet-column schema */ public List toParquetSchema(MessageType schema) { - // todo. - throw new UnsupportedOperationException(); + List result = new ArrayList(); + addToList(result, schema); + return result; } - /** - * Create parquet-column MessageType from {@link SchemaElement} and {@link ColumnOrder} thrift objects - * - * @param schema the {@link MessageType} schema - * @return the ordering defined for each of the columns - */ - public MessageType fromParquetSchema(List schema, List columnOrders) { - // todo - throw new UnsupportedOperationException(); + private void addToList(final List result, org.apache.parquet.schema.Type field) { + field.accept(new TypeVisitor() { + @Override + public void visit(PrimitiveType primitiveType) { + SchemaElement element = new SchemaElement(primitiveType.getName()); + element.setRepetition_type(toParquetRepetition(primitiveType.getRepetition())); + element.setType(getType(primitiveType.getPrimitiveTypeName())); + if (primitiveType.getLogicalTypeAnnotation() != null) { + element.setConverted_type(convertToConvertedType(primitiveType.getLogicalTypeAnnotation())); + element.setLogicalType(convertToLogicalType(primitiveType.getLogicalTypeAnnotation())); + } + if (primitiveType.getDecimalMetadata() != null) { + element.setPrecision(primitiveType.getDecimalMetadata().getPrecision()); + element.setScale(primitiveType.getDecimalMetadata().getScale()); + } + if (primitiveType.getTypeLength() > 0) { + element.setType_length(primitiveType.getTypeLength()); + } + if (primitiveType.getId() != null) { + element.setField_id(primitiveType.getId().intValue()); + } + result.add(element); + } + + @Override + public void visit(MessageType messageType) { + SchemaElement element = new SchemaElement(messageType.getName()); + if (messageType.getId() != null) { + element.setField_id(messageType.getId().intValue()); + } + visitChildren(result, messageType.asGroupType(), element); + } + + @Override + public void visit(GroupType groupType) { + SchemaElement element = new SchemaElement(groupType.getName()); + element.setRepetition_type(toParquetRepetition(groupType.getRepetition())); + if (groupType.getLogicalTypeAnnotation() != null) { + element.setConverted_type(convertToConvertedType(groupType.getLogicalTypeAnnotation())); + element.setLogicalType(convertToLogicalType(groupType.getLogicalTypeAnnotation())); + } + if (groupType.getId() != null) { + element.setField_id(groupType.getId().intValue()); + } + visitChildren(result, groupType, element); + } + + private void visitChildren(final List result, GroupType groupType, SchemaElement element) { + element.setNum_children(groupType.getFieldCount()); + result.add(element); + for (org.apache.parquet.schema.Type field : groupType.getFields()) { + addToList(result, field); + } + } + }); + } + + LogicalType convertToLogicalType(LogicalTypeAnnotation logicalTypeAnnotation) { + return logicalTypeAnnotation.accept(LOGICAL_TYPE_ANNOTATION_VISITOR).orElse(null); + } + + ConvertedType convertToConvertedType(LogicalTypeAnnotation logicalTypeAnnotation) { + return logicalTypeAnnotation.accept(CONVERTED_TYPE_CONVERTER_VISITOR).orElse(null); + } + + static org.apache.parquet.format.TimeUnit convertUnit(LogicalTypeAnnotation.TimeUnit unit) { + switch (unit) { + case MICROS: + return org.apache.parquet.format.TimeUnit.MICROS(new MicroSeconds()); + case MILLIS: + return org.apache.parquet.format.TimeUnit.MILLIS(new MilliSeconds()); + case NANOS: + return TimeUnit.NANOS(new NanoSeconds()); + default: + throw new RuntimeException("Unknown time unit " + unit); + } + } + + private static class ConvertedTypeConverterVisitor + implements LogicalTypeAnnotation.LogicalTypeAnnotationVisitor { + @Override + public Optional visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) { + return of(ConvertedType.UTF8); + } + + @Override + public Optional visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) { + return of(ConvertedType.MAP); + } + + @Override + public Optional visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) { + return of(ConvertedType.LIST); + } + + @Override + public Optional visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) { + return of(ConvertedType.ENUM); + } + + @Override + public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) { + return of(ConvertedType.DECIMAL); + } + + @Override + public Optional visit(LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) { + return of(ConvertedType.DATE); + } + + @Override + public Optional visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) { + switch (timeLogicalType.getUnit()) { + case MILLIS: + return of(ConvertedType.TIME_MILLIS); + case MICROS: + return of(ConvertedType.TIME_MICROS); + case NANOS: + return empty(); + default: + throw new RuntimeException("Unknown converted type for " + timeLogicalType.toOriginalType()); + } + } + + @Override + public Optional visit( + LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { + switch (timestampLogicalType.getUnit()) { + case MICROS: + return of(ConvertedType.TIMESTAMP_MICROS); + case MILLIS: + return of(ConvertedType.TIMESTAMP_MILLIS); + case NANOS: + return empty(); + default: + throw new RuntimeException("Unknown converted type for " + timestampLogicalType.toOriginalType()); + } + } + + @Override + public Optional visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) { + boolean signed = intLogicalType.isSigned(); + switch (intLogicalType.getBitWidth()) { + case 8: + return of(signed ? ConvertedType.INT_8 : ConvertedType.UINT_8); + case 16: + return of(signed ? ConvertedType.INT_16 : ConvertedType.UINT_16); + case 32: + return of(signed ? ConvertedType.INT_32 : ConvertedType.UINT_32); + case 64: + return of(signed ? ConvertedType.INT_64 : ConvertedType.UINT_64); + default: + throw new RuntimeException("Unknown original type " + intLogicalType.toOriginalType()); + } + } + + @Override + public Optional visit(LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) { + return of(ConvertedType.JSON); + } + + @Override + public Optional visit(LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) { + return of(ConvertedType.BSON); + } + + @Override + public Optional visit(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) { + return of(ConvertedType.INTERVAL); + } + + @Override + public Optional visit(LogicalTypeAnnotation.MapKeyValueTypeAnnotation mapKeyValueLogicalType) { + return of(ConvertedType.MAP_KEY_VALUE); + } + } + + private static class LogicalTypeConverterVisitor + implements LogicalTypeAnnotation.LogicalTypeAnnotationVisitor { + @Override + public Optional visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) { + return of(LogicalTypes.UTF8); + } + + @Override + public Optional visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) { + return of(LogicalTypes.MAP); + } + + @Override + public Optional visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) { + return of(LogicalTypes.LIST); + } + + @Override + public Optional visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) { + return of(LogicalTypes.ENUM); + } + + @Override + public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) { + return of(LogicalTypes.DECIMAL(decimalLogicalType.getScale(), decimalLogicalType.getPrecision())); + } + + @Override + public Optional visit(LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) { + return of(LogicalTypes.DATE); + } + + @Override + public Optional visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) { + return of(LogicalType.TIME( + new TimeType(timeLogicalType.isAdjustedToUTC(), convertUnit(timeLogicalType.getUnit())))); + } + + @Override + public Optional visit(LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { + return of(LogicalType.TIMESTAMP(new TimestampType( + timestampLogicalType.isAdjustedToUTC(), convertUnit(timestampLogicalType.getUnit())))); + } + + @Override + public Optional visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) { + return of(LogicalType.INTEGER(new IntType((byte) intLogicalType.getBitWidth(), intLogicalType.isSigned()))); + } + + @Override + public Optional visit(LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) { + return of(LogicalTypes.JSON); + } + + @Override + public Optional visit(LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) { + return of(LogicalTypes.BSON); + } + + @Override + public Optional visit(LogicalTypeAnnotation.UUIDLogicalTypeAnnotation uuidLogicalType) { + return of(LogicalTypes.UUID); + } + + @Override + public Optional visit(LogicalTypeAnnotation.Float16LogicalTypeAnnotation float16LogicalType) { + return of(LogicalTypes.FLOAT16); + } + + @Override + public Optional visit(LogicalTypeAnnotation.UnknownLogicalTypeAnnotation unknownLogicalType) { + return of(LogicalTypes.UNKNOWN); + } + + @Override + public Optional visit(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) { + return of(LogicalTypes.UNKNOWN); + } + + @Override + public Optional visit(LogicalTypeAnnotation.VariantLogicalTypeAnnotation variantLogicalType) { + return of(LogicalTypes.VARIANT(variantLogicalType.getSpecVersion())); + } + + @Override + public Optional visit(LogicalTypeAnnotation.GeometryLogicalTypeAnnotation geometryLogicalType) { + GeometryType geometryType = new GeometryType(); + if (geometryLogicalType.getCrs() != null + && !geometryLogicalType.getCrs().isEmpty()) { + geometryType.setCrs(geometryLogicalType.getCrs()); + } + return of(LogicalType.GEOMETRY(geometryType)); + } + + @Override + public Optional visit(LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyLogicalType) { + GeographyType geographyType = new GeographyType(); + if (geographyLogicalType.getCrs() != null + && !geographyLogicalType.getCrs().isEmpty()) { + geographyType.setCrs(geographyLogicalType.getCrs()); + } + geographyType.setAlgorithm(fromParquetEdgeInterpolationAlgorithm(geographyLogicalType.getAlgorithm())); + return of(LogicalType.GEOGRAPHY(geographyType)); + } } } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 002028cdf5..f1288c955d 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -18,7 +18,6 @@ */ package org.apache.parquet.format.converter; -import static java.util.Optional.empty; import static java.util.Optional.of; import static org.apache.parquet.format.Util.readColumnMetaData; import static org.apache.parquet.format.Util.readFileMetaData; @@ -36,7 +35,6 @@ import java.util.Collections; import java.util.HashMap; import java.util.HashSet; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -71,45 +69,25 @@ import org.apache.parquet.format.ColumnCryptoMetaData; import org.apache.parquet.format.ColumnIndex; import org.apache.parquet.format.ColumnMetaData; -import org.apache.parquet.format.ColumnOrder; import org.apache.parquet.format.CompressionCodec; -import org.apache.parquet.format.ConvertedType; import org.apache.parquet.format.DataPageHeader; import org.apache.parquet.format.DataPageHeaderV2; -import org.apache.parquet.format.DecimalType; import org.apache.parquet.format.DictionaryPageHeader; -import org.apache.parquet.format.EdgeInterpolationAlgorithm; import org.apache.parquet.format.Encoding; import org.apache.parquet.format.EncryptionWithColumnKey; -import org.apache.parquet.format.FieldRepetitionType; import org.apache.parquet.format.FileMetaData; -import org.apache.parquet.format.GeographyType; -import org.apache.parquet.format.GeometryType; import org.apache.parquet.format.GeospatialStatistics; -import org.apache.parquet.format.IntType; import org.apache.parquet.format.KeyValue; -import org.apache.parquet.format.LogicalType; -import org.apache.parquet.format.LogicalTypes; -import org.apache.parquet.format.MicroSeconds; -import org.apache.parquet.format.MilliSeconds; -import org.apache.parquet.format.NanoSeconds; import org.apache.parquet.format.OffsetIndex; import org.apache.parquet.format.PageEncodingStats; import org.apache.parquet.format.PageHeader; import org.apache.parquet.format.PageLocation; import org.apache.parquet.format.PageType; import org.apache.parquet.format.RowGroup; -import org.apache.parquet.format.SchemaElement; import org.apache.parquet.format.SizeStatistics; import org.apache.parquet.format.SplitBlockAlgorithm; import org.apache.parquet.format.Statistics; -import org.apache.parquet.format.TimeType; -import org.apache.parquet.format.TimeUnit; -import org.apache.parquet.format.TimestampType; -import org.apache.parquet.format.Type; -import org.apache.parquet.format.TypeDefinedOrder; import org.apache.parquet.format.Uncompressed; -import org.apache.parquet.format.VariantType; import org.apache.parquet.format.XxHash; import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; @@ -125,16 +103,13 @@ import org.apache.parquet.io.ParquetDecodingException; import org.apache.parquet.io.api.Binary; import org.apache.parquet.schema.ColumnOrder.ColumnOrderName; -import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.LogicalTypeAnnotation.UUIDLogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.ParquetSchemaConverter; import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Type.Repetition; -import org.apache.parquet.schema.TypeVisitor; -import org.apache.parquet.schema.Types; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -142,7 +117,6 @@ // TODO: Lets split it up: https://github.com/apache/parquet-java/issues/1835 public class ParquetMetadataConverter { - private static final TypeDefinedOrder TYPE_DEFINED_ORDER = new TypeDefinedOrder(); public static final MetadataFilter NO_FILTER = new NoFilter(); public static final MetadataFilter SKIP_ROW_GROUPS = new SkipMetadataFilter(); public static final long MAX_STATS_SIZE = 4096; // limit stats to 4k @@ -157,10 +131,7 @@ public class ParquetMetadataConverter { private static final int DEFAULT_MAX_MESSAGE_SIZE = 104857600; // 100 MB private static final Logger LOG = LoggerFactory.getLogger(ParquetMetadataConverter.class); - private static final LogicalTypeConverterVisitor LOGICAL_TYPE_ANNOTATION_VISITOR = - new LogicalTypeConverterVisitor(); - private static final ConvertedTypeConverterVisitor CONVERTED_TYPE_CONVERTER_VISITOR = - new ConvertedTypeConverterVisitor(); + private final ParquetSchemaConverter parquetSchemaConverter = new ParquetSchemaConverter(); private final int statisticsTruncateLength; private final boolean useSignedStringMinMax; private final ParquetReadOptions options; @@ -258,7 +229,8 @@ public FileMetaData toParquetMetadata( } FileMetaData fileMetaData = new FileMetaData( currentVersion, - toParquetSchema(parquetMetadata.getFileMetaData().getSchema()), + parquetSchemaConverter.toParquetSchema( + parquetMetadata.getFileMetaData().getSchema()), numRows, rowGroups); @@ -270,314 +242,12 @@ public FileMetaData toParquetMetadata( fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy()); - fileMetaData.setColumn_orders( - getColumnOrders(parquetMetadata.getFileMetaData().getSchema())); + fileMetaData.setColumn_orders(parquetSchemaConverter.getColumnOrders( + parquetMetadata.getFileMetaData().getSchema())); return fileMetaData; } - private List getColumnOrders(MessageType schema) { - List columnOrders = new ArrayList<>(); - // Currently, only TypeDefinedOrder is supported, so we create a column order for each columns with - // TypeDefinedOrder even if some types (e.g. INT96) have undefined column orders. - for (int i = 0, n = schema.getPaths().size(); i < n; ++i) { - ColumnOrder columnOrder = new ColumnOrder(); - columnOrder.setTYPE_ORDER(TYPE_DEFINED_ORDER); - columnOrders.add(columnOrder); - } - return columnOrders; - } - - // Visible for testing - List toParquetSchema(MessageType schema) { - List result = new ArrayList(); - addToList(result, schema); - return result; - } - - private void addToList(final List result, org.apache.parquet.schema.Type field) { - field.accept(new TypeVisitor() { - @Override - public void visit(PrimitiveType primitiveType) { - SchemaElement element = new SchemaElement(primitiveType.getName()); - element.setRepetition_type(toParquetRepetition(primitiveType.getRepetition())); - element.setType(getType(primitiveType.getPrimitiveTypeName())); - if (primitiveType.getLogicalTypeAnnotation() != null) { - element.setConverted_type(convertToConvertedType(primitiveType.getLogicalTypeAnnotation())); - element.setLogicalType(convertToLogicalType(primitiveType.getLogicalTypeAnnotation())); - } - if (primitiveType.getDecimalMetadata() != null) { - element.setPrecision(primitiveType.getDecimalMetadata().getPrecision()); - element.setScale(primitiveType.getDecimalMetadata().getScale()); - } - if (primitiveType.getTypeLength() > 0) { - element.setType_length(primitiveType.getTypeLength()); - } - if (primitiveType.getId() != null) { - element.setField_id(primitiveType.getId().intValue()); - } - result.add(element); - } - - @Override - public void visit(MessageType messageType) { - SchemaElement element = new SchemaElement(messageType.getName()); - if (messageType.getId() != null) { - element.setField_id(messageType.getId().intValue()); - } - visitChildren(result, messageType.asGroupType(), element); - } - - @Override - public void visit(GroupType groupType) { - SchemaElement element = new SchemaElement(groupType.getName()); - element.setRepetition_type(toParquetRepetition(groupType.getRepetition())); - if (groupType.getLogicalTypeAnnotation() != null) { - element.setConverted_type(convertToConvertedType(groupType.getLogicalTypeAnnotation())); - element.setLogicalType(convertToLogicalType(groupType.getLogicalTypeAnnotation())); - } - if (groupType.getId() != null) { - element.setField_id(groupType.getId().intValue()); - } - visitChildren(result, groupType, element); - } - - private void visitChildren(final List result, GroupType groupType, SchemaElement element) { - element.setNum_children(groupType.getFieldCount()); - result.add(element); - for (org.apache.parquet.schema.Type field : groupType.getFields()) { - addToList(result, field); - } - } - }); - } - - LogicalType convertToLogicalType(LogicalTypeAnnotation logicalTypeAnnotation) { - return logicalTypeAnnotation.accept(LOGICAL_TYPE_ANNOTATION_VISITOR).orElse(null); - } - - ConvertedType convertToConvertedType(LogicalTypeAnnotation logicalTypeAnnotation) { - return logicalTypeAnnotation.accept(CONVERTED_TYPE_CONVERTER_VISITOR).orElse(null); - } - - static org.apache.parquet.format.TimeUnit convertUnit(LogicalTypeAnnotation.TimeUnit unit) { - switch (unit) { - case MICROS: - return org.apache.parquet.format.TimeUnit.MICROS(new MicroSeconds()); - case MILLIS: - return org.apache.parquet.format.TimeUnit.MILLIS(new MilliSeconds()); - case NANOS: - return TimeUnit.NANOS(new NanoSeconds()); - default: - throw new RuntimeException("Unknown time unit " + unit); - } - } - - private static class ConvertedTypeConverterVisitor - implements LogicalTypeAnnotation.LogicalTypeAnnotationVisitor { - @Override - public Optional visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) { - return of(ConvertedType.UTF8); - } - - @Override - public Optional visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) { - return of(ConvertedType.MAP); - } - - @Override - public Optional visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) { - return of(ConvertedType.LIST); - } - - @Override - public Optional visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) { - return of(ConvertedType.ENUM); - } - - @Override - public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) { - return of(ConvertedType.DECIMAL); - } - - @Override - public Optional visit(LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) { - return of(ConvertedType.DATE); - } - - @Override - public Optional visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) { - switch (timeLogicalType.getUnit()) { - case MILLIS: - return of(ConvertedType.TIME_MILLIS); - case MICROS: - return of(ConvertedType.TIME_MICROS); - case NANOS: - return empty(); - default: - throw new RuntimeException("Unknown converted type for " + timeLogicalType.toOriginalType()); - } - } - - @Override - public Optional visit( - LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { - switch (timestampLogicalType.getUnit()) { - case MICROS: - return of(ConvertedType.TIMESTAMP_MICROS); - case MILLIS: - return of(ConvertedType.TIMESTAMP_MILLIS); - case NANOS: - return empty(); - default: - throw new RuntimeException("Unknown converted type for " + timestampLogicalType.toOriginalType()); - } - } - - @Override - public Optional visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) { - boolean signed = intLogicalType.isSigned(); - switch (intLogicalType.getBitWidth()) { - case 8: - return of(signed ? ConvertedType.INT_8 : ConvertedType.UINT_8); - case 16: - return of(signed ? ConvertedType.INT_16 : ConvertedType.UINT_16); - case 32: - return of(signed ? ConvertedType.INT_32 : ConvertedType.UINT_32); - case 64: - return of(signed ? ConvertedType.INT_64 : ConvertedType.UINT_64); - default: - throw new RuntimeException("Unknown original type " + intLogicalType.toOriginalType()); - } - } - - @Override - public Optional visit(LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) { - return of(ConvertedType.JSON); - } - - @Override - public Optional visit(LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) { - return of(ConvertedType.BSON); - } - - @Override - public Optional visit(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) { - return of(ConvertedType.INTERVAL); - } - - @Override - public Optional visit(LogicalTypeAnnotation.MapKeyValueTypeAnnotation mapKeyValueLogicalType) { - return of(ConvertedType.MAP_KEY_VALUE); - } - } - - private static class LogicalTypeConverterVisitor - implements LogicalTypeAnnotation.LogicalTypeAnnotationVisitor { - @Override - public Optional visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) { - return of(LogicalTypes.UTF8); - } - - @Override - public Optional visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) { - return of(LogicalTypes.MAP); - } - - @Override - public Optional visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) { - return of(LogicalTypes.LIST); - } - - @Override - public Optional visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) { - return of(LogicalTypes.ENUM); - } - - @Override - public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) { - return of(LogicalTypes.DECIMAL(decimalLogicalType.getScale(), decimalLogicalType.getPrecision())); - } - - @Override - public Optional visit(LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) { - return of(LogicalTypes.DATE); - } - - @Override - public Optional visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) { - return of(LogicalType.TIME( - new TimeType(timeLogicalType.isAdjustedToUTC(), convertUnit(timeLogicalType.getUnit())))); - } - - @Override - public Optional visit(LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { - return of(LogicalType.TIMESTAMP(new TimestampType( - timestampLogicalType.isAdjustedToUTC(), convertUnit(timestampLogicalType.getUnit())))); - } - - @Override - public Optional visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) { - return of(LogicalType.INTEGER(new IntType((byte) intLogicalType.getBitWidth(), intLogicalType.isSigned()))); - } - - @Override - public Optional visit(LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) { - return of(LogicalTypes.JSON); - } - - @Override - public Optional visit(LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) { - return of(LogicalTypes.BSON); - } - - @Override - public Optional visit(UUIDLogicalTypeAnnotation uuidLogicalType) { - return of(LogicalTypes.UUID); - } - - @Override - public Optional visit(LogicalTypeAnnotation.Float16LogicalTypeAnnotation float16LogicalType) { - return of(LogicalTypes.FLOAT16); - } - - @Override - public Optional visit(LogicalTypeAnnotation.UnknownLogicalTypeAnnotation unknownLogicalType) { - return of(LogicalTypes.UNKNOWN); - } - - @Override - public Optional visit(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) { - return of(LogicalTypes.UNKNOWN); - } - - @Override - public Optional visit(LogicalTypeAnnotation.VariantLogicalTypeAnnotation variantLogicalType) { - return of(LogicalTypes.VARIANT(variantLogicalType.getSpecVersion())); - } - - @Override - public Optional visit(LogicalTypeAnnotation.GeometryLogicalTypeAnnotation geometryLogicalType) { - GeometryType geometryType = new GeometryType(); - if (geometryLogicalType.getCrs() != null - && !geometryLogicalType.getCrs().isEmpty()) { - geometryType.setCrs(geometryLogicalType.getCrs()); - } - return of(LogicalType.GEOMETRY(geometryType)); - } - - @Override - public Optional visit(LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyLogicalType) { - GeographyType geographyType = new GeographyType(); - if (geographyLogicalType.getCrs() != null - && !geographyLogicalType.getCrs().isEmpty()) { - geographyType.setCrs(geographyLogicalType.getCrs()); - } - geographyType.setAlgorithm(fromParquetEdgeInterpolationAlgorithm(geographyLogicalType.getAlgorithm())); - return of(LogicalType.GEOGRAPHY(geographyType)); - } - } - private void addRowGroup( ParquetMetadata parquetMetadata, List rowGroups, @@ -606,7 +276,7 @@ private void addRowGroup( encryptMetaData = fileEncryptor.encryptColumnMetaData(columnSetup); } ColumnMetaData metaData = new ColumnMetaData( - getType(columnMetaData.getType()), + parquetSchemaConverter.getType(columnMetaData.getType()), toFormatEncodings(columnMetaData.getEncodings()), columnMetaData.getPath().toList(), toFormatCodec(columnMetaData.getCodec()), @@ -1214,170 +884,6 @@ public Optional visit( return defaultSortOrder(primitive.getPrimitiveTypeName()); } - public PrimitiveTypeName getPrimitive(Type type) { - switch (type) { - case BYTE_ARRAY: // TODO: rename BINARY and remove this switch - return PrimitiveTypeName.BINARY; - case INT64: - return PrimitiveTypeName.INT64; - case INT32: - return PrimitiveTypeName.INT32; - case BOOLEAN: - return PrimitiveTypeName.BOOLEAN; - case FLOAT: - return PrimitiveTypeName.FLOAT; - case DOUBLE: - return PrimitiveTypeName.DOUBLE; - case INT96: - return PrimitiveTypeName.INT96; - case FIXED_LEN_BYTE_ARRAY: - return PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; - default: - throw new RuntimeException("Unknown type " + type); - } - } - - // Visible for testing - Type getType(PrimitiveTypeName type) { - switch (type) { - case INT64: - return Type.INT64; - case INT32: - return Type.INT32; - case BOOLEAN: - return Type.BOOLEAN; - case BINARY: - return Type.BYTE_ARRAY; - case FLOAT: - return Type.FLOAT; - case DOUBLE: - return Type.DOUBLE; - case INT96: - return Type.INT96; - case FIXED_LEN_BYTE_ARRAY: - return Type.FIXED_LEN_BYTE_ARRAY; - default: - throw new RuntimeException("Unknown primitive type " + type); - } - } - - // Visible for testing - LogicalTypeAnnotation getLogicalTypeAnnotation(ConvertedType type, SchemaElement schemaElement) { - switch (type) { - case UTF8: - return LogicalTypeAnnotation.stringType(); - case MAP: - return LogicalTypeAnnotation.mapType(); - case MAP_KEY_VALUE: - return LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance(); - case LIST: - return LogicalTypeAnnotation.listType(); - case ENUM: - return LogicalTypeAnnotation.enumType(); - case DECIMAL: - int scale = (schemaElement == null ? 0 : schemaElement.scale); - int precision = (schemaElement == null ? 0 : schemaElement.precision); - return LogicalTypeAnnotation.decimalType(scale, precision); - case DATE: - return LogicalTypeAnnotation.dateType(); - case TIME_MILLIS: - return LogicalTypeAnnotation.timeType(true, LogicalTypeAnnotation.TimeUnit.MILLIS); - case TIME_MICROS: - return LogicalTypeAnnotation.timeType(true, LogicalTypeAnnotation.TimeUnit.MICROS); - case TIMESTAMP_MILLIS: - return LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MILLIS); - case TIMESTAMP_MICROS: - return LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MICROS); - case INTERVAL: - return LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance(); - case INT_8: - return LogicalTypeAnnotation.intType(8, true); - case INT_16: - return LogicalTypeAnnotation.intType(16, true); - case INT_32: - return LogicalTypeAnnotation.intType(32, true); - case INT_64: - return LogicalTypeAnnotation.intType(64, true); - case UINT_8: - return LogicalTypeAnnotation.intType(8, false); - case UINT_16: - return LogicalTypeAnnotation.intType(16, false); - case UINT_32: - return LogicalTypeAnnotation.intType(32, false); - case UINT_64: - return LogicalTypeAnnotation.intType(64, false); - case JSON: - return LogicalTypeAnnotation.jsonType(); - case BSON: - return LogicalTypeAnnotation.bsonType(); - default: - throw new RuntimeException( - "Can't convert converted type to logical type, unknown converted type " + type); - } - } - - LogicalTypeAnnotation getLogicalTypeAnnotation(LogicalType type) { - switch (type.getSetField()) { - case MAP: - return LogicalTypeAnnotation.mapType(); - case BSON: - return LogicalTypeAnnotation.bsonType(); - case DATE: - return LogicalTypeAnnotation.dateType(); - case ENUM: - return LogicalTypeAnnotation.enumType(); - case JSON: - return LogicalTypeAnnotation.jsonType(); - case LIST: - return LogicalTypeAnnotation.listType(); - case TIME: - TimeType time = type.getTIME(); - return LogicalTypeAnnotation.timeType(time.isAdjustedToUTC, convertTimeUnit(time.unit)); - case STRING: - return LogicalTypeAnnotation.stringType(); - case DECIMAL: - DecimalType decimal = type.getDECIMAL(); - return LogicalTypeAnnotation.decimalType(decimal.scale, decimal.precision); - case INTEGER: - IntType integer = type.getINTEGER(); - return LogicalTypeAnnotation.intType(integer.bitWidth, integer.isSigned); - case UNKNOWN: - return LogicalTypeAnnotation.unknownType(); - case TIMESTAMP: - TimestampType timestamp = type.getTIMESTAMP(); - return LogicalTypeAnnotation.timestampType(timestamp.isAdjustedToUTC, convertTimeUnit(timestamp.unit)); - case UUID: - return LogicalTypeAnnotation.uuidType(); - case FLOAT16: - return LogicalTypeAnnotation.float16Type(); - case GEOMETRY: - GeometryType geometry = type.getGEOMETRY(); - return LogicalTypeAnnotation.geometryType(geometry.getCrs()); - case GEOGRAPHY: - GeographyType geography = type.getGEOGRAPHY(); - return LogicalTypeAnnotation.geographyType( - geography.getCrs(), toParquetEdgeInterpolationAlgorithm(geography.getAlgorithm())); - case VARIANT: - VariantType variant = type.getVARIANT(); - return LogicalTypeAnnotation.variantType(variant.getSpecification_version()); - default: - throw new RuntimeException("Unknown logical type " + type); - } - } - - private LogicalTypeAnnotation.TimeUnit convertTimeUnit(TimeUnit unit) { - switch (unit.getSetField()) { - case MICROS: - return LogicalTypeAnnotation.TimeUnit.MICROS; - case MILLIS: - return LogicalTypeAnnotation.TimeUnit.MILLIS; - case NANOS: - return LogicalTypeAnnotation.TimeUnit.NANOS; - default: - throw new RuntimeException("Unknown time unit " + unit); - } - } - private static void addKeyValue(FileMetaData fileMetaData, String key, String value) { KeyValue keyValue = new KeyValue(key); keyValue.value = value; @@ -1824,7 +1330,8 @@ public ParquetMetadata fromParquetMetadata( boolean encryptedFooter, Map rowGroupToRowIndexOffsetMap) throws IOException { - MessageType messageType = fromParquetSchema(parquetMetadata.getSchema(), parquetMetadata.getColumn_orders()); + MessageType messageType = parquetSchemaConverter.fromParquetSchema( + parquetMetadata.getSchema(), parquetMetadata.getColumn_orders()); List blocks = new ArrayList(); List row_groups = parquetMetadata.getRow_groups(); @@ -1987,111 +1494,6 @@ private static ColumnPath getPath(ColumnMetaData metaData) { } // Visible for testing - MessageType fromParquetSchema(List schema, List columnOrders) { - Iterator iterator = schema.iterator(); - SchemaElement root = iterator.next(); - Types.MessageTypeBuilder builder = Types.buildMessage(); - if (root.isSetField_id()) { - builder.id(root.field_id); - } - buildChildren(builder, iterator, root.getNum_children(), columnOrders, 0); - return builder.named(root.name); - } - - private void buildChildren( - Types.GroupBuilder builder, - Iterator schema, - int childrenCount, - List columnOrders, - int columnCount) { - for (int i = 0; i < childrenCount; i++) { - SchemaElement schemaElement = schema.next(); - - // Create Parquet Type. - Types.Builder childBuilder; - if (schemaElement.type != null) { - Types.PrimitiveBuilder primitiveBuilder = builder.primitive( - getPrimitive(schemaElement.type), fromParquetRepetition(schemaElement.repetition_type)); - if (schemaElement.isSetType_length()) { - primitiveBuilder.length(schemaElement.type_length); - } - if (schemaElement.isSetPrecision()) { - primitiveBuilder.precision(schemaElement.precision); - } - if (schemaElement.isSetScale()) { - primitiveBuilder.scale(schemaElement.scale); - } - if (columnOrders != null) { - org.apache.parquet.schema.ColumnOrder columnOrder = - fromParquetColumnOrder(columnOrders.get(columnCount)); - // As per parquet format 2.4.0 no UNDEFINED order is supported. So, set undefined column order for - // the types - // where ordering is not supported. - if (columnOrder.getColumnOrderName() == ColumnOrderName.TYPE_DEFINED_ORDER - && (schemaElement.type == Type.INT96 - || schemaElement.converted_type == ConvertedType.INTERVAL)) { - columnOrder = org.apache.parquet.schema.ColumnOrder.undefined(); - } - primitiveBuilder.columnOrder(columnOrder); - } - childBuilder = primitiveBuilder; - } else { - childBuilder = builder.group(fromParquetRepetition(schemaElement.repetition_type)); - buildChildren( - (Types.GroupBuilder) childBuilder, - schema, - schemaElement.num_children, - columnOrders, - columnCount); - } - - if (schemaElement.isSetLogicalType()) { - childBuilder.as(getLogicalTypeAnnotation(schemaElement.logicalType)); - } - if (schemaElement.isSetConverted_type()) { - OriginalType originalType = getLogicalTypeAnnotation(schemaElement.converted_type, schemaElement) - .toOriginalType(); - OriginalType newOriginalType = (schemaElement.isSetLogicalType() - && getLogicalTypeAnnotation(schemaElement.logicalType) != null) - ? getLogicalTypeAnnotation(schemaElement.logicalType).toOriginalType() - : null; - if (!originalType.equals(newOriginalType)) { - if (newOriginalType != null) { - LOG.warn( - "Converted type and logical type metadata mismatch (convertedType: {}, logical type: {}). Using value in converted type.", - schemaElement.converted_type, - schemaElement.logicalType); - } - childBuilder.as(originalType); - } - } - if (schemaElement.isSetField_id()) { - childBuilder.id(schemaElement.field_id); - } - - childBuilder.named(schemaElement.name); - ++columnCount; - } - } - - // Visible for testing - FieldRepetitionType toParquetRepetition(Repetition repetition) { - return FieldRepetitionType.valueOf(repetition.name()); - } - - // Visible for testing - Repetition fromParquetRepetition(FieldRepetitionType repetition) { - return Repetition.valueOf(repetition.name()); - } - - private static org.apache.parquet.schema.ColumnOrder fromParquetColumnOrder(ColumnOrder columnOrder) { - if (columnOrder.isSetTYPE_ORDER()) { - return org.apache.parquet.schema.ColumnOrder.typeDefined(); - } - // The column order is not yet supported by this API - return org.apache.parquet.schema.ColumnOrder.undefined(); - } - @Deprecated public void writeDataPageHeader( int uncompressedSize, @@ -2673,26 +2075,4 @@ public static SizeStatistics toParquetSizeStatistics(org.apache.parquet.column.s } return formatStats; } - - /** Convert Parquet Algorithm enum to Thrift Algorithm enum */ - public static EdgeInterpolationAlgorithm fromParquetEdgeInterpolationAlgorithm( - org.apache.parquet.column.schema.EdgeInterpolationAlgorithm parquetAlgo) { - if (parquetAlgo == null) { - return null; - } - EdgeInterpolationAlgorithm thriftAlgo = EdgeInterpolationAlgorithm.findByValue(parquetAlgo.getValue()); - if (thriftAlgo == null) { - throw new IllegalArgumentException("Unrecognized Parquet EdgeInterpolationAlgorithm: " + parquetAlgo); - } - return thriftAlgo; - } - - /** Convert Thrift Algorithm enum to Parquet Algorithm enum */ - public static org.apache.parquet.column.schema.EdgeInterpolationAlgorithm toParquetEdgeInterpolationAlgorithm( - EdgeInterpolationAlgorithm thriftAlgo) { - if (thriftAlgo == null) { - return null; - } - return org.apache.parquet.column.schema.EdgeInterpolationAlgorithm.findByValue(thriftAlgo.getValue()); - } } From 5bb1c9d7c727e116923a6ce568fce378b82732e1 Mon Sep 17 00:00:00 2001 From: Andrew Pikler Date: Thu, 4 Dec 2025 13:47:16 +0200 Subject: [PATCH 3/6] add delegates for moved public methods for compatibility --- .../converter/ParquetMetadataConverter.java | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index f1288c955d..883ce6d11a 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -73,6 +73,7 @@ import org.apache.parquet.format.DataPageHeader; import org.apache.parquet.format.DataPageHeaderV2; import org.apache.parquet.format.DictionaryPageHeader; +import org.apache.parquet.format.EdgeInterpolationAlgorithm; import org.apache.parquet.format.Encoding; import org.apache.parquet.format.EncryptionWithColumnKey; import org.apache.parquet.format.FileMetaData; @@ -87,6 +88,7 @@ import org.apache.parquet.format.SizeStatistics; import org.apache.parquet.format.SplitBlockAlgorithm; import org.apache.parquet.format.Statistics; +import org.apache.parquet.format.Type; import org.apache.parquet.format.Uncompressed; import org.apache.parquet.format.XxHash; import org.apache.parquet.hadoop.metadata.BlockMetaData; @@ -884,6 +886,11 @@ public Optional visit( return defaultSortOrder(primitive.getPrimitiveTypeName()); } + @Deprecated + public PrimitiveTypeName getPrimitive(Type type) { + return parquetSchemaConverter.getPrimitive(type); + } + private static void addKeyValue(FileMetaData fileMetaData, String key, String value) { KeyValue keyValue = new KeyValue(key); keyValue.value = value; @@ -2075,4 +2082,16 @@ public static SizeStatistics toParquetSizeStatistics(org.apache.parquet.column.s } return formatStats; } + + @Deprecated + public static EdgeInterpolationAlgorithm fromParquetEdgeInterpolationAlgorithm( + org.apache.parquet.column.schema.EdgeInterpolationAlgorithm parquetAlgo) { + return ParquetSchemaConverter.fromParquetEdgeInterpolationAlgorithm(parquetAlgo); + } + + @Deprecated + public static org.apache.parquet.column.schema.EdgeInterpolationAlgorithm toParquetEdgeInterpolationAlgorithm( + EdgeInterpolationAlgorithm thriftAlgo) { + return ParquetSchemaConverter.toParquetEdgeInterpolationAlgorithm(thriftAlgo); + } } From ad61a1976033c9620b5a22979e59e1e054df9d97 Mon Sep 17 00:00:00 2001 From: Andrew Pikler Date: Thu, 4 Dec 2025 13:21:00 +0200 Subject: [PATCH 4/6] Move schema tests to parquet-column project --- .../schema/TestParquetSchemaConverter.java | 531 ++++++++++++++++++ .../TestParquetMetadataConverter.java | 500 +---------------- 2 files changed, 540 insertions(+), 491 deletions(-) create mode 100644 parquet-column/src/test/java/org/apache/parquet/schema/TestParquetSchemaConverter.java diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestParquetSchemaConverter.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestParquetSchemaConverter.java new file mode 100644 index 0000000000..ced55c4f91 --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestParquetSchemaConverter.java @@ -0,0 +1,531 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.schema; + +import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MICROS; +import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MILLIS; +import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.NANOS; +import static org.apache.parquet.schema.LogicalTypeAnnotation.bsonType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.dateType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.decimalType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.enumType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.intType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.jsonType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.listType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.mapType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.timeType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.timestampType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.uuidType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.variantType; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +import java.util.List; +import org.apache.parquet.column.schema.EdgeInterpolationAlgorithm; +import org.apache.parquet.example.Paper; +import org.apache.parquet.format.ConvertedType; +import org.apache.parquet.format.DecimalType; +import org.apache.parquet.format.FieldRepetitionType; +import org.apache.parquet.format.GeographyType; +import org.apache.parquet.format.GeometryType; +import org.apache.parquet.format.LogicalType; +import org.apache.parquet.format.MapType; +import org.apache.parquet.format.SchemaElement; +import org.apache.parquet.format.StringType; +import org.apache.parquet.format.Type; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +public class TestParquetSchemaConverter { + @Rule + public TemporaryFolder temporaryFolder = new TemporaryFolder(); + + @Test + public void testSchemaConverter() { + ParquetSchemaConverter parquetMetadataConverter = new ParquetSchemaConverter(); + List parquetSchema = parquetMetadataConverter.toParquetSchema(Paper.schema); + MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null); + assertEquals(Paper.schema, schema); + } + + @Test + public void testSchemaConverterDecimal() { + ParquetSchemaConverter parquetMetadataConverter = new ParquetSchemaConverter(); + List schemaElements = parquetMetadataConverter.toParquetSchema(Types.buildMessage() + .required(PrimitiveType.PrimitiveTypeName.BINARY) + .as(OriginalType.DECIMAL) + .precision(9) + .scale(2) + .named("aBinaryDecimal") + .optional(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) + .length(4) + .as(OriginalType.DECIMAL) + .precision(9) + .scale(2) + .named("aFixedDecimal") + .named("Message")); + List expected = List.of( + new SchemaElement("Message").setNum_children(2), + new SchemaElement("aBinaryDecimal") + .setRepetition_type(FieldRepetitionType.REQUIRED) + .setType(org.apache.parquet.format.Type.BYTE_ARRAY) + .setConverted_type(ConvertedType.DECIMAL) + .setLogicalType(LogicalType.DECIMAL(new DecimalType(2, 9))) + .setPrecision(9) + .setScale(2), + new SchemaElement("aFixedDecimal") + .setRepetition_type(FieldRepetitionType.OPTIONAL) + .setType(Type.FIXED_LEN_BYTE_ARRAY) + .setType_length(4) + .setConverted_type(ConvertedType.DECIMAL) + .setLogicalType(LogicalType.DECIMAL(new DecimalType(2, 9))) + .setPrecision(9) + .setScale(2)); + Assert.assertEquals(expected, schemaElements); + } + + @Test + public void testLogicalTypesBackwardCompatibleWithConvertedTypes() { + ParquetSchemaConverter parquetMetadataConverter = new ParquetSchemaConverter(); + MessageType expected = Types.buildMessage() + .required(PrimitiveType.PrimitiveTypeName.BINARY) + .as(OriginalType.DECIMAL) + .precision(9) + .scale(2) + .named("aBinaryDecimal") + .named("Message"); + List parquetSchema = parquetMetadataConverter.toParquetSchema(expected); + // Set logical type field to null to test backward compatibility with files written by older API, + // where converted_types are written to the metadata, but logicalType is missing + parquetSchema.get(1).setLogicalType(null); + MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null); + assertEquals(expected, schema); + } + + @Test + public void testIncompatibleLogicalAndConvertedTypes() { + ParquetSchemaConverter parquetMetadataConverter = new ParquetSchemaConverter(); + MessageType schema = Types.buildMessage() + .required(PrimitiveType.PrimitiveTypeName.BINARY) + .as(OriginalType.DECIMAL) + .precision(9) + .scale(2) + .named("aBinary") + .named("Message"); + MessageType expected = Types.buildMessage() + .required(PrimitiveType.PrimitiveTypeName.BINARY) + .as(LogicalTypeAnnotation.jsonType()) + .named("aBinary") + .named("Message"); + + List parquetSchema = parquetMetadataConverter.toParquetSchema(schema); + // Set converted type field to a different type to verify that in case of mismatch, it overrides logical type + parquetSchema.get(1).setConverted_type(ConvertedType.JSON); + MessageType actual = parquetMetadataConverter.fromParquetSchema(parquetSchema, null); + assertEquals(expected, actual); + } + + @Test + public void testTimeLogicalTypes() { + ParquetSchemaConverter parquetMetadataConverter = new ParquetSchemaConverter(); + MessageType expected = Types.buildMessage() + .required(PrimitiveType.PrimitiveTypeName.INT64) + .as(timestampType(false, MILLIS)) + .named("aTimestampNonUtcMillis") + .required(PrimitiveType.PrimitiveTypeName.INT64) + .as(timestampType(true, MILLIS)) + .named("aTimestampUtcMillis") + .required(PrimitiveType.PrimitiveTypeName.INT64) + .as(timestampType(false, MICROS)) + .named("aTimestampNonUtcMicros") + .required(PrimitiveType.PrimitiveTypeName.INT64) + .as(timestampType(true, MICROS)) + .named("aTimestampUtcMicros") + .required(PrimitiveType.PrimitiveTypeName.INT64) + .as(timestampType(false, NANOS)) + .named("aTimestampNonUtcNanos") + .required(PrimitiveType.PrimitiveTypeName.INT64) + .as(timestampType(true, NANOS)) + .named("aTimestampUtcNanos") + .required(PrimitiveType.PrimitiveTypeName.INT32) + .as(timeType(false, MILLIS)) + .named("aTimeNonUtcMillis") + .required(PrimitiveType.PrimitiveTypeName.INT32) + .as(timeType(true, MILLIS)) + .named("aTimeUtcMillis") + .required(PrimitiveType.PrimitiveTypeName.INT64) + .as(timeType(false, MICROS)) + .named("aTimeNonUtcMicros") + .required(PrimitiveType.PrimitiveTypeName.INT64) + .as(timeType(true, MICROS)) + .named("aTimeUtcMicros") + .required(PrimitiveType.PrimitiveTypeName.INT64) + .as(timeType(false, NANOS)) + .named("aTimeNonUtcNanos") + .required(PrimitiveType.PrimitiveTypeName.INT64) + .as(timeType(true, NANOS)) + .named("aTimeUtcNanos") + .named("Message"); + List parquetSchema = parquetMetadataConverter.toParquetSchema(expected); + MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null); + assertEquals(expected, schema); + } + + @Test + public void testLogicalToConvertedTypeConversion() { + ParquetSchemaConverter parquetMetadataConverter = new ParquetSchemaConverter(); + + assertEquals(ConvertedType.UTF8, parquetMetadataConverter.convertToConvertedType(stringType())); + assertEquals(ConvertedType.ENUM, parquetMetadataConverter.convertToConvertedType(enumType())); + + assertEquals(ConvertedType.INT_8, parquetMetadataConverter.convertToConvertedType(intType(8, true))); + assertEquals(ConvertedType.INT_16, parquetMetadataConverter.convertToConvertedType(intType(16, true))); + assertEquals(ConvertedType.INT_32, parquetMetadataConverter.convertToConvertedType(intType(32, true))); + assertEquals(ConvertedType.INT_64, parquetMetadataConverter.convertToConvertedType(intType(64, true))); + assertEquals(ConvertedType.UINT_8, parquetMetadataConverter.convertToConvertedType(intType(8, false))); + assertEquals(ConvertedType.UINT_16, parquetMetadataConverter.convertToConvertedType(intType(16, false))); + assertEquals(ConvertedType.UINT_32, parquetMetadataConverter.convertToConvertedType(intType(32, false))); + assertEquals(ConvertedType.UINT_64, parquetMetadataConverter.convertToConvertedType(intType(64, false))); + assertEquals(ConvertedType.DECIMAL, parquetMetadataConverter.convertToConvertedType(decimalType(8, 16))); + + assertEquals( + ConvertedType.TIMESTAMP_MILLIS, + parquetMetadataConverter.convertToConvertedType(timestampType(true, MILLIS))); + assertEquals( + ConvertedType.TIMESTAMP_MICROS, + parquetMetadataConverter.convertToConvertedType(timestampType(true, MICROS))); + assertNull(parquetMetadataConverter.convertToConvertedType(timestampType(true, NANOS))); + assertEquals( + ConvertedType.TIMESTAMP_MILLIS, + parquetMetadataConverter.convertToConvertedType(timestampType(false, MILLIS))); + assertEquals( + ConvertedType.TIMESTAMP_MICROS, + parquetMetadataConverter.convertToConvertedType(timestampType(false, MICROS))); + assertNull(parquetMetadataConverter.convertToConvertedType(timestampType(false, NANOS))); + + assertEquals( + ConvertedType.TIME_MILLIS, parquetMetadataConverter.convertToConvertedType(timeType(true, MILLIS))); + assertEquals( + ConvertedType.TIME_MICROS, parquetMetadataConverter.convertToConvertedType(timeType(true, MICROS))); + assertNull(parquetMetadataConverter.convertToConvertedType(timeType(true, NANOS))); + assertEquals( + ConvertedType.TIME_MILLIS, parquetMetadataConverter.convertToConvertedType(timeType(false, MILLIS))); + assertEquals( + ConvertedType.TIME_MICROS, parquetMetadataConverter.convertToConvertedType(timeType(false, MICROS))); + assertNull(parquetMetadataConverter.convertToConvertedType(timeType(false, NANOS))); + + assertEquals(ConvertedType.DATE, parquetMetadataConverter.convertToConvertedType(dateType())); + + assertEquals( + ConvertedType.INTERVAL, + parquetMetadataConverter.convertToConvertedType( + LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance())); + assertEquals(ConvertedType.JSON, parquetMetadataConverter.convertToConvertedType(jsonType())); + assertEquals(ConvertedType.BSON, parquetMetadataConverter.convertToConvertedType(bsonType())); + + assertNull(parquetMetadataConverter.convertToConvertedType(uuidType())); + + assertEquals(ConvertedType.LIST, parquetMetadataConverter.convertToConvertedType(listType())); + assertEquals(ConvertedType.MAP, parquetMetadataConverter.convertToConvertedType(mapType())); + assertEquals( + ConvertedType.MAP_KEY_VALUE, + parquetMetadataConverter.convertToConvertedType( + LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance())); + } + + @Test + public void testEnumEquivalence() { + ParquetSchemaConverter parquetMetadataConverter = new ParquetSchemaConverter(); + for (org.apache.parquet.schema.Type.Repetition repetition : + org.apache.parquet.schema.Type.Repetition.values()) { + assertEquals( + repetition, + parquetMetadataConverter.fromParquetRepetition( + parquetMetadataConverter.toParquetRepetition(repetition))); + } + for (FieldRepetitionType repetition : FieldRepetitionType.values()) { + assertEquals( + repetition, + parquetMetadataConverter.toParquetRepetition( + parquetMetadataConverter.fromParquetRepetition(repetition))); + } + for (PrimitiveType.PrimitiveTypeName primitiveTypeName : PrimitiveType.PrimitiveTypeName.values()) { + assertEquals( + primitiveTypeName, + parquetMetadataConverter.getPrimitive(parquetMetadataConverter.getType(primitiveTypeName))); + } + for (Type type : Type.values()) { + assertEquals(type, parquetMetadataConverter.getType(parquetMetadataConverter.getPrimitive(type))); + } + for (OriginalType original : OriginalType.values()) { + assertEquals( + original, + parquetMetadataConverter + .getLogicalTypeAnnotation( + parquetMetadataConverter.convertToConvertedType( + LogicalTypeAnnotation.fromOriginalType(original, null)), + null) + .toOriginalType()); + } + for (ConvertedType converted : ConvertedType.values()) { + assertEquals( + converted, + parquetMetadataConverter.convertToConvertedType( + parquetMetadataConverter.getLogicalTypeAnnotation(converted, null))); + } + } + + @Test + public void testMapLogicalType() { + ParquetSchemaConverter parquetMetadataConverter = new ParquetSchemaConverter(); + MessageType expected = Types.buildMessage() + .requiredGroup() + .as(mapType()) + .repeatedGroup() + .as(LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance()) + .required(PrimitiveType.PrimitiveTypeName.BINARY) + .as(stringType()) + .named("key") + .required(PrimitiveType.PrimitiveTypeName.INT32) + .named("value") + .named("key_value") + .named("testMap") + .named("Message"); + + List parquetSchema = parquetMetadataConverter.toParquetSchema(expected); + assertEquals(5, parquetSchema.size()); + assertEquals(new SchemaElement("Message").setNum_children(1), parquetSchema.get(0)); + assertEquals( + new SchemaElement("testMap") + .setRepetition_type(FieldRepetitionType.REQUIRED) + .setNum_children(1) + .setConverted_type(ConvertedType.MAP) + .setLogicalType(LogicalType.MAP(new MapType())), + parquetSchema.get(1)); + // PARQUET-1879 ensure that LogicalType is not written (null) but ConvertedType is MAP_KEY_VALUE for + // backwards-compatibility + assertEquals( + new SchemaElement("key_value") + .setRepetition_type(FieldRepetitionType.REPEATED) + .setNum_children(2) + .setConverted_type(ConvertedType.MAP_KEY_VALUE) + .setLogicalType(null), + parquetSchema.get(2)); + assertEquals( + new SchemaElement("key") + .setType(Type.BYTE_ARRAY) + .setRepetition_type(FieldRepetitionType.REQUIRED) + .setConverted_type(ConvertedType.UTF8) + .setLogicalType(LogicalType.STRING(new StringType())), + parquetSchema.get(3)); + assertEquals( + new SchemaElement("value") + .setType(Type.INT32) + .setRepetition_type(FieldRepetitionType.REQUIRED) + .setConverted_type(null) + .setLogicalType(null), + parquetSchema.get(4)); + + MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null); + assertEquals(expected, schema); + } + + @Test + public void testVariantLogicalType() { + byte specVersion = 1; + MessageType expected = Types.buildMessage() + .requiredGroup() + .as(variantType(specVersion)) + .required(PrimitiveType.PrimitiveTypeName.BINARY) + .named("metadata") + .required(PrimitiveType.PrimitiveTypeName.BINARY) + .named("value") + .named("v") + .named("example"); + + ParquetSchemaConverter parquetMetadataConverter = new ParquetSchemaConverter(); + List parquetSchema = parquetMetadataConverter.toParquetSchema(expected); + MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null); + assertEquals(expected, schema); + LogicalTypeAnnotation logicalType = schema.getType("v").getLogicalTypeAnnotation(); + assertEquals(LogicalTypeAnnotation.variantType(specVersion), logicalType); + assertEquals(specVersion, ((LogicalTypeAnnotation.VariantLogicalTypeAnnotation) logicalType).getSpecVersion()); + } + + @Test + public void testGeometryLogicalType() { + ParquetSchemaConverter parquetMetadataConverter = new ParquetSchemaConverter(); + + // Create schema with geometry type + MessageType schema = Types.buildMessage() + .required(PrimitiveType.PrimitiveTypeName.BINARY) + .as(LogicalTypeAnnotation.geometryType("EPSG:4326")) + .named("geomField") + .named("Message"); + + // Convert to parquet schema and back + List parquetSchema = parquetMetadataConverter.toParquetSchema(schema); + MessageType actual = parquetMetadataConverter.fromParquetSchema(parquetSchema, null); + + // Verify the logical type is preserved + assertEquals(schema, actual); + + PrimitiveType primitiveType = actual.getType("geomField").asPrimitiveType(); + LogicalTypeAnnotation logicalType = primitiveType.getLogicalTypeAnnotation(); + assertTrue(logicalType instanceof LogicalTypeAnnotation.GeometryLogicalTypeAnnotation); + assertEquals("EPSG:4326", ((LogicalTypeAnnotation.GeometryLogicalTypeAnnotation) logicalType).getCrs()); + } + + @Test + public void testGeographyLogicalType() { + ParquetSchemaConverter parquetMetadataConverter = new ParquetSchemaConverter(); + + // Create schema with geography type + MessageType schema = Types.buildMessage() + .required(PrimitiveType.PrimitiveTypeName.BINARY) + .as(LogicalTypeAnnotation.geographyType("EPSG:4326", EdgeInterpolationAlgorithm.SPHERICAL)) + .named("geogField") + .named("Message"); + + // Convert to parquet schema and back + List parquetSchema = parquetMetadataConverter.toParquetSchema(schema); + MessageType actual = parquetMetadataConverter.fromParquetSchema(parquetSchema, null); + + // Verify the logical type is preserved + assertEquals(schema, actual); + + PrimitiveType primitiveType = actual.getType("geogField").asPrimitiveType(); + LogicalTypeAnnotation logicalType = primitiveType.getLogicalTypeAnnotation(); + assertTrue(logicalType instanceof LogicalTypeAnnotation.GeographyLogicalTypeAnnotation); + + LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyType = + (LogicalTypeAnnotation.GeographyLogicalTypeAnnotation) logicalType; + assertEquals("EPSG:4326", geographyType.getCrs()); + assertEquals(EdgeInterpolationAlgorithm.SPHERICAL, geographyType.getAlgorithm()); + } + + @Test + public void testGeometryLogicalTypeWithMissingCrs() { + // Create a Geometry logical type without specifying CRS + GeometryType geometryType = new GeometryType(); + LogicalType logicalType = new LogicalType(); + logicalType.setGEOMETRY(geometryType); + + // Convert to LogicalTypeAnnotation + ParquetSchemaConverter converter = new ParquetSchemaConverter(); + LogicalTypeAnnotation annotation = converter.getLogicalTypeAnnotation(logicalType); + + // Verify the annotation is created correctly + assertNotNull("Geometry annotation should not be null", annotation); + assertTrue( + "Should be a GeometryLogicalTypeAnnotation", + annotation instanceof LogicalTypeAnnotation.GeometryLogicalTypeAnnotation); + + LogicalTypeAnnotation.GeometryLogicalTypeAnnotation geometryAnnotation = + (LogicalTypeAnnotation.GeometryLogicalTypeAnnotation) annotation; + + // Default behavior should use null or empty CRS + assertNull("CRS should be null or empty when not specified", geometryAnnotation.getCrs()); + } + + @Test + public void testGeographyLogicalTypeWithMissingParameters() { + ParquetSchemaConverter converter = new ParquetSchemaConverter(); + + // Create a Geography logical type without CRS and algorithm + GeographyType geographyType = new GeographyType(); + LogicalType logicalType = new LogicalType(); + logicalType.setGEOGRAPHY(geographyType); + + // Convert to LogicalTypeAnnotation + LogicalTypeAnnotation annotation = converter.getLogicalTypeAnnotation(logicalType); + + // Verify the annotation is created correctly + assertNotNull("Geography annotation should not be null", annotation); + assertTrue( + "Should be a GeographyLogicalTypeAnnotation", + annotation instanceof LogicalTypeAnnotation.GeographyLogicalTypeAnnotation); + + // Check that optional parameters are handled correctly + LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyAnnotation = + (LogicalTypeAnnotation.GeographyLogicalTypeAnnotation) annotation; + assertNull("CRS should be null when not specified", geographyAnnotation.getCrs()); + // Most implementations default to LINEAR when algorithm is not specified + assertNull("Algorithm should be null when not specified", geographyAnnotation.getAlgorithm()); + + // Now test the round-trip conversion + LogicalType roundTripType = converter.convertToLogicalType(annotation); + assertEquals("setField should be GEOGRAPHY", LogicalType._Fields.GEOGRAPHY, roundTripType.getSetField()); + assertNull( + "Round trip CRS should still be null", + roundTripType.getGEOGRAPHY().getCrs()); + assertNull( + "Round trip Algorithm should be null", + roundTripType.getGEOGRAPHY().getAlgorithm()); + } + + @Test + public void testGeographyLogicalTypeWithAlgorithmButNoCrs() { + // Create a Geography logical type with algorithm but no CRS + GeographyType geographyType = new GeographyType(); + geographyType.setAlgorithm(org.apache.parquet.format.EdgeInterpolationAlgorithm.SPHERICAL); + LogicalType logicalType = new LogicalType(); + logicalType.setGEOGRAPHY(geographyType); + + // Convert to LogicalTypeAnnotation + ParquetSchemaConverter converter = new ParquetSchemaConverter(); + LogicalTypeAnnotation annotation = converter.getLogicalTypeAnnotation(logicalType); + + // Verify the annotation is created correctly + Assert.assertNotNull("Geography annotation should not be null", annotation); + LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyAnnotation = + (LogicalTypeAnnotation.GeographyLogicalTypeAnnotation) annotation; + + // CRS should be null/empty but algorithm should be set + assertNull("CRS should be null or empty", geographyAnnotation.getCrs()); + assertEquals( + "Algorithm should be SPHERICAL", + EdgeInterpolationAlgorithm.SPHERICAL, + geographyAnnotation.getAlgorithm()); + } + + @Test + public void testEdgeInterpolationAlgorithmConversion() { + // Test conversion from Parquet to Thrift enum + org.apache.parquet.column.schema.EdgeInterpolationAlgorithm parquetAlgo = EdgeInterpolationAlgorithm.SPHERICAL; + org.apache.parquet.format.EdgeInterpolationAlgorithm thriftAlgo = + ParquetSchemaConverter.fromParquetEdgeInterpolationAlgorithm(parquetAlgo); + + // convert the Thrift enum to the column schema enum + org.apache.parquet.column.schema.EdgeInterpolationAlgorithm expected = + org.apache.parquet.column.schema.EdgeInterpolationAlgorithm.SPHERICAL; + org.apache.parquet.column.schema.EdgeInterpolationAlgorithm actual = + ParquetSchemaConverter.toParquetEdgeInterpolationAlgorithm(thriftAlgo); + assertEquals(expected, actual); + + // Test with null + assertNull(ParquetSchemaConverter.fromParquetEdgeInterpolationAlgorithm(null)); + assertNull(ParquetSchemaConverter.toParquetEdgeInterpolationAlgorithm(null)); + } +} diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index 2529f06ada..ee18b741ea 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -26,22 +26,8 @@ import static org.apache.parquet.format.converter.ParquetMetadataConverter.filterFileMetaDataByMidpoint; import static org.apache.parquet.format.converter.ParquetMetadataConverter.filterFileMetaDataByStart; import static org.apache.parquet.format.converter.ParquetMetadataConverter.getOffset; -import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MICROS; -import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MILLIS; -import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.NANOS; -import static org.apache.parquet.schema.LogicalTypeAnnotation.bsonType; -import static org.apache.parquet.schema.LogicalTypeAnnotation.dateType; -import static org.apache.parquet.schema.LogicalTypeAnnotation.decimalType; -import static org.apache.parquet.schema.LogicalTypeAnnotation.enumType; -import static org.apache.parquet.schema.LogicalTypeAnnotation.intType; -import static org.apache.parquet.schema.LogicalTypeAnnotation.jsonType; -import static org.apache.parquet.schema.LogicalTypeAnnotation.listType; import static org.apache.parquet.schema.LogicalTypeAnnotation.mapType; import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; -import static org.apache.parquet.schema.LogicalTypeAnnotation.timeType; -import static org.apache.parquet.schema.LogicalTypeAnnotation.timestampType; -import static org.apache.parquet.schema.LogicalTypeAnnotation.uuidType; -import static org.apache.parquet.schema.LogicalTypeAnnotation.variantType; import static org.apache.parquet.schema.MessageTypeParser.parseMessageType; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; @@ -51,7 +37,6 @@ import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; -import com.google.common.collect.Lists; import com.google.common.collect.Sets; import it.unimi.dsi.fastutil.longs.LongArrayList; import java.io.ByteArrayInputStream; @@ -80,7 +65,6 @@ import org.apache.parquet.column.Encoding; import org.apache.parquet.column.EncodingStats; import org.apache.parquet.column.ParquetProperties; -import org.apache.parquet.column.schema.EdgeInterpolationAlgorithm; import org.apache.parquet.column.statistics.BinaryStatistics; import org.apache.parquet.column.statistics.BooleanStatistics; import org.apache.parquet.column.statistics.DoubleStatistics; @@ -94,26 +78,19 @@ import org.apache.parquet.crypto.EncryptionPropertiesFactory; import org.apache.parquet.crypto.FileDecryptionProperties; import org.apache.parquet.crypto.InternalFileDecryptor; -import org.apache.parquet.example.Paper; import org.apache.parquet.example.data.Group; import org.apache.parquet.example.data.simple.SimpleGroup; import org.apache.parquet.format.BoundingBox; import org.apache.parquet.format.ColumnChunk; import org.apache.parquet.format.ColumnMetaData; import org.apache.parquet.format.ConvertedType; -import org.apache.parquet.format.DecimalType; import org.apache.parquet.format.FieldRepetitionType; import org.apache.parquet.format.FileMetaData; -import org.apache.parquet.format.GeographyType; -import org.apache.parquet.format.GeometryType; import org.apache.parquet.format.GeospatialStatistics; -import org.apache.parquet.format.LogicalType; -import org.apache.parquet.format.MapType; import org.apache.parquet.format.PageHeader; import org.apache.parquet.format.PageType; import org.apache.parquet.format.RowGroup; import org.apache.parquet.format.SchemaElement; -import org.apache.parquet.format.StringType; import org.apache.parquet.format.Type; import org.apache.parquet.format.Util; import org.apache.parquet.hadoop.ParquetReader; @@ -135,6 +112,7 @@ import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.ParquetSchemaConverter; import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Type.Repetition; @@ -166,50 +144,6 @@ public void testPageHeader() throws IOException { assertEquals(pageHeader, readPageHeader); } - @Test - public void testSchemaConverter() { - ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter(); - List parquetSchema = parquetMetadataConverter.toParquetSchema(Paper.schema); - MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null); - assertEquals(Paper.schema, schema); - } - - @Test - public void testSchemaConverterDecimal() { - ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter(); - List schemaElements = parquetMetadataConverter.toParquetSchema(Types.buildMessage() - .required(PrimitiveTypeName.BINARY) - .as(OriginalType.DECIMAL) - .precision(9) - .scale(2) - .named("aBinaryDecimal") - .optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) - .length(4) - .as(OriginalType.DECIMAL) - .precision(9) - .scale(2) - .named("aFixedDecimal") - .named("Message")); - List expected = Lists.newArrayList( - new SchemaElement("Message").setNum_children(2), - new SchemaElement("aBinaryDecimal") - .setRepetition_type(FieldRepetitionType.REQUIRED) - .setType(Type.BYTE_ARRAY) - .setConverted_type(ConvertedType.DECIMAL) - .setLogicalType(LogicalType.DECIMAL(new DecimalType(2, 9))) - .setPrecision(9) - .setScale(2), - new SchemaElement("aFixedDecimal") - .setRepetition_type(FieldRepetitionType.OPTIONAL) - .setType(Type.FIXED_LEN_BYTE_ARRAY) - .setType_length(4) - .setConverted_type(ConvertedType.DECIMAL) - .setLogicalType(LogicalType.DECIMAL(new DecimalType(2, 9))) - .setPrecision(9) - .setScale(2)); - Assert.assertEquals(expected, schemaElements); - } - @Test public void testParquetMetadataConverterWithDictionary() throws IOException { ParquetMetadata parquetMetaData = createParquetMetaData(Encoding.PLAIN_DICTIONARY, Encoding.PLAIN); @@ -318,155 +252,6 @@ public void testBloomFilterLength() throws IOException { 1024, convertedMetaData.getBlocks().get(0).getColumns().get(0).getBloomFilterLength()); } - @Test - public void testLogicalTypesBackwardCompatibleWithConvertedTypes() { - ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter(); - MessageType expected = Types.buildMessage() - .required(PrimitiveTypeName.BINARY) - .as(OriginalType.DECIMAL) - .precision(9) - .scale(2) - .named("aBinaryDecimal") - .named("Message"); - List parquetSchema = parquetMetadataConverter.toParquetSchema(expected); - // Set logical type field to null to test backward compatibility with files written by older API, - // where converted_types are written to the metadata, but logicalType is missing - parquetSchema.get(1).setLogicalType(null); - MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null); - assertEquals(expected, schema); - } - - @Test - public void testIncompatibleLogicalAndConvertedTypes() { - ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter(); - MessageType schema = Types.buildMessage() - .required(PrimitiveTypeName.BINARY) - .as(OriginalType.DECIMAL) - .precision(9) - .scale(2) - .named("aBinary") - .named("Message"); - MessageType expected = Types.buildMessage() - .required(PrimitiveTypeName.BINARY) - .as(LogicalTypeAnnotation.jsonType()) - .named("aBinary") - .named("Message"); - - List parquetSchema = parquetMetadataConverter.toParquetSchema(schema); - // Set converted type field to a different type to verify that in case of mismatch, it overrides logical type - parquetSchema.get(1).setConverted_type(ConvertedType.JSON); - MessageType actual = parquetMetadataConverter.fromParquetSchema(parquetSchema, null); - assertEquals(expected, actual); - } - - @Test - public void testTimeLogicalTypes() { - ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter(); - MessageType expected = Types.buildMessage() - .required(PrimitiveTypeName.INT64) - .as(timestampType(false, MILLIS)) - .named("aTimestampNonUtcMillis") - .required(PrimitiveTypeName.INT64) - .as(timestampType(true, MILLIS)) - .named("aTimestampUtcMillis") - .required(PrimitiveTypeName.INT64) - .as(timestampType(false, MICROS)) - .named("aTimestampNonUtcMicros") - .required(PrimitiveTypeName.INT64) - .as(timestampType(true, MICROS)) - .named("aTimestampUtcMicros") - .required(PrimitiveTypeName.INT64) - .as(timestampType(false, NANOS)) - .named("aTimestampNonUtcNanos") - .required(PrimitiveTypeName.INT64) - .as(timestampType(true, NANOS)) - .named("aTimestampUtcNanos") - .required(PrimitiveTypeName.INT32) - .as(timeType(false, MILLIS)) - .named("aTimeNonUtcMillis") - .required(PrimitiveTypeName.INT32) - .as(timeType(true, MILLIS)) - .named("aTimeUtcMillis") - .required(PrimitiveTypeName.INT64) - .as(timeType(false, MICROS)) - .named("aTimeNonUtcMicros") - .required(PrimitiveTypeName.INT64) - .as(timeType(true, MICROS)) - .named("aTimeUtcMicros") - .required(PrimitiveTypeName.INT64) - .as(timeType(false, NANOS)) - .named("aTimeNonUtcNanos") - .required(PrimitiveTypeName.INT64) - .as(timeType(true, NANOS)) - .named("aTimeUtcNanos") - .named("Message"); - List parquetSchema = parquetMetadataConverter.toParquetSchema(expected); - MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null); - assertEquals(expected, schema); - } - - @Test - public void testLogicalToConvertedTypeConversion() { - ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter(); - - assertEquals(ConvertedType.UTF8, parquetMetadataConverter.convertToConvertedType(stringType())); - assertEquals(ConvertedType.ENUM, parquetMetadataConverter.convertToConvertedType(enumType())); - - assertEquals(ConvertedType.INT_8, parquetMetadataConverter.convertToConvertedType(intType(8, true))); - assertEquals(ConvertedType.INT_16, parquetMetadataConverter.convertToConvertedType(intType(16, true))); - assertEquals(ConvertedType.INT_32, parquetMetadataConverter.convertToConvertedType(intType(32, true))); - assertEquals(ConvertedType.INT_64, parquetMetadataConverter.convertToConvertedType(intType(64, true))); - assertEquals(ConvertedType.UINT_8, parquetMetadataConverter.convertToConvertedType(intType(8, false))); - assertEquals(ConvertedType.UINT_16, parquetMetadataConverter.convertToConvertedType(intType(16, false))); - assertEquals(ConvertedType.UINT_32, parquetMetadataConverter.convertToConvertedType(intType(32, false))); - assertEquals(ConvertedType.UINT_64, parquetMetadataConverter.convertToConvertedType(intType(64, false))); - assertEquals(ConvertedType.DECIMAL, parquetMetadataConverter.convertToConvertedType(decimalType(8, 16))); - - assertEquals( - ConvertedType.TIMESTAMP_MILLIS, - parquetMetadataConverter.convertToConvertedType(timestampType(true, MILLIS))); - assertEquals( - ConvertedType.TIMESTAMP_MICROS, - parquetMetadataConverter.convertToConvertedType(timestampType(true, MICROS))); - assertNull(parquetMetadataConverter.convertToConvertedType(timestampType(true, NANOS))); - assertEquals( - ConvertedType.TIMESTAMP_MILLIS, - parquetMetadataConverter.convertToConvertedType(timestampType(false, MILLIS))); - assertEquals( - ConvertedType.TIMESTAMP_MICROS, - parquetMetadataConverter.convertToConvertedType(timestampType(false, MICROS))); - assertNull(parquetMetadataConverter.convertToConvertedType(timestampType(false, NANOS))); - - assertEquals( - ConvertedType.TIME_MILLIS, parquetMetadataConverter.convertToConvertedType(timeType(true, MILLIS))); - assertEquals( - ConvertedType.TIME_MICROS, parquetMetadataConverter.convertToConvertedType(timeType(true, MICROS))); - assertNull(parquetMetadataConverter.convertToConvertedType(timeType(true, NANOS))); - assertEquals( - ConvertedType.TIME_MILLIS, parquetMetadataConverter.convertToConvertedType(timeType(false, MILLIS))); - assertEquals( - ConvertedType.TIME_MICROS, parquetMetadataConverter.convertToConvertedType(timeType(false, MICROS))); - assertNull(parquetMetadataConverter.convertToConvertedType(timeType(false, NANOS))); - - assertEquals(ConvertedType.DATE, parquetMetadataConverter.convertToConvertedType(dateType())); - - assertEquals( - ConvertedType.INTERVAL, - parquetMetadataConverter.convertToConvertedType( - LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance())); - assertEquals(ConvertedType.JSON, parquetMetadataConverter.convertToConvertedType(jsonType())); - assertEquals(ConvertedType.BSON, parquetMetadataConverter.convertToConvertedType(bsonType())); - - assertNull(parquetMetadataConverter.convertToConvertedType(uuidType())); - - assertEquals(ConvertedType.LIST, parquetMetadataConverter.convertToConvertedType(listType())); - assertEquals(ConvertedType.MAP, parquetMetadataConverter.convertToConvertedType(mapType())); - assertEquals( - ConvertedType.MAP_KEY_VALUE, - parquetMetadataConverter.convertToConvertedType( - LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance())); - } - @Test public void testEnumEquivalence() { ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter(); @@ -478,42 +263,6 @@ public void testEnumEquivalence() { assertEquals( encoding, parquetMetadataConverter.getEncoding(parquetMetadataConverter.getEncoding(encoding))); } - for (Repetition repetition : Repetition.values()) { - assertEquals( - repetition, - parquetMetadataConverter.fromParquetRepetition( - parquetMetadataConverter.toParquetRepetition(repetition))); - } - for (FieldRepetitionType repetition : FieldRepetitionType.values()) { - assertEquals( - repetition, - parquetMetadataConverter.toParquetRepetition( - parquetMetadataConverter.fromParquetRepetition(repetition))); - } - for (PrimitiveTypeName primitiveTypeName : PrimitiveTypeName.values()) { - assertEquals( - primitiveTypeName, - parquetMetadataConverter.getPrimitive(parquetMetadataConverter.getType(primitiveTypeName))); - } - for (Type type : Type.values()) { - assertEquals(type, parquetMetadataConverter.getType(parquetMetadataConverter.getPrimitive(type))); - } - for (OriginalType original : OriginalType.values()) { - assertEquals( - original, - parquetMetadataConverter - .getLogicalTypeAnnotation( - parquetMetadataConverter.convertToConvertedType( - LogicalTypeAnnotation.fromOriginalType(original, null)), - null) - .toOriginalType()); - } - for (ConvertedType converted : ConvertedType.values()) { - assertEquals( - converted, - parquetMetadataConverter.convertToConvertedType( - parquetMetadataConverter.getLogicalTypeAnnotation(converted, null))); - } } private FileMetaData metadata(long... sizes) { @@ -1491,61 +1240,6 @@ public void testColumnIndexConversion() { } } - @Test - public void testMapLogicalType() { - ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter(); - MessageType expected = Types.buildMessage() - .requiredGroup() - .as(mapType()) - .repeatedGroup() - .as(LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance()) - .required(PrimitiveTypeName.BINARY) - .as(stringType()) - .named("key") - .required(PrimitiveTypeName.INT32) - .named("value") - .named("key_value") - .named("testMap") - .named("Message"); - - List parquetSchema = parquetMetadataConverter.toParquetSchema(expected); - assertEquals(5, parquetSchema.size()); - assertEquals(new SchemaElement("Message").setNum_children(1), parquetSchema.get(0)); - assertEquals( - new SchemaElement("testMap") - .setRepetition_type(FieldRepetitionType.REQUIRED) - .setNum_children(1) - .setConverted_type(ConvertedType.MAP) - .setLogicalType(LogicalType.MAP(new MapType())), - parquetSchema.get(1)); - // PARQUET-1879 ensure that LogicalType is not written (null) but ConvertedType is MAP_KEY_VALUE for - // backwards-compatibility - assertEquals( - new SchemaElement("key_value") - .setRepetition_type(FieldRepetitionType.REPEATED) - .setNum_children(2) - .setConverted_type(ConvertedType.MAP_KEY_VALUE) - .setLogicalType(null), - parquetSchema.get(2)); - assertEquals( - new SchemaElement("key") - .setType(Type.BYTE_ARRAY) - .setRepetition_type(FieldRepetitionType.REQUIRED) - .setConverted_type(ConvertedType.UTF8) - .setLogicalType(LogicalType.STRING(new StringType())), - parquetSchema.get(3)); - assertEquals( - new SchemaElement("value") - .setType(Type.INT32) - .setRepetition_type(FieldRepetitionType.REQUIRED) - .setConverted_type(null) - .setLogicalType(null), - parquetSchema.get(4)); - - MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null); - assertEquals(expected, schema); - } - @Test public void testMapLogicalTypeReadWrite() throws Exception { MessageType messageType = Types.buildMessage() @@ -1590,34 +1284,12 @@ public void testMapConvertedTypeReadWrite() throws Exception { .setConverted_type(null) .setLogicalType(null)); - ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter(); - MessageType messageType = parquetMetadataConverter.fromParquetSchema(oldConvertedTypeSchemaElements, null); + ParquetSchemaConverter parquetSchemaConverter = new ParquetSchemaConverter(); + MessageType messageType = parquetSchemaConverter.fromParquetSchema(oldConvertedTypeSchemaElements, null); verifyMapMessageType(messageType, "map"); } - @Test - public void testVariantLogicalType() { - byte specVersion = 1; - MessageType expected = Types.buildMessage() - .requiredGroup() - .as(variantType(specVersion)) - .required(PrimitiveTypeName.BINARY) - .named("metadata") - .required(PrimitiveTypeName.BINARY) - .named("value") - .named("v") - .named("example"); - - ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter(); - List parquetSchema = parquetMetadataConverter.toParquetSchema(expected); - MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null); - assertEquals(expected, schema); - LogicalTypeAnnotation logicalType = schema.getType("v").getLogicalTypeAnnotation(); - assertEquals(LogicalTypeAnnotation.variantType(specVersion), logicalType); - assertEquals(specVersion, ((LogicalTypeAnnotation.VariantLogicalTypeAnnotation) logicalType).getSpecVersion()); - } - private void verifyMapMessageType(final MessageType messageType, final String keyValueName) throws IOException { Path file = new Path(temporaryFolder.newFolder("verifyMapMessageType").getPath(), keyValueName + ".parquet"); @@ -1668,143 +1340,6 @@ public void testSizeStatisticsConversion() { assertEquals(defLevelHistogram, sizeStatistics.getDefinitionLevelHistogram()); } - @Test - public void testGeometryLogicalType() { - ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter(); - - // Create schema with geometry type - MessageType schema = Types.buildMessage() - .required(PrimitiveTypeName.BINARY) - .as(LogicalTypeAnnotation.geometryType("EPSG:4326")) - .named("geomField") - .named("Message"); - - // Convert to parquet schema and back - List parquetSchema = parquetMetadataConverter.toParquetSchema(schema); - MessageType actual = parquetMetadataConverter.fromParquetSchema(parquetSchema, null); - - // Verify the logical type is preserved - assertEquals(schema, actual); - - PrimitiveType primitiveType = actual.getType("geomField").asPrimitiveType(); - LogicalTypeAnnotation logicalType = primitiveType.getLogicalTypeAnnotation(); - assertTrue(logicalType instanceof LogicalTypeAnnotation.GeometryLogicalTypeAnnotation); - assertEquals("EPSG:4326", ((LogicalTypeAnnotation.GeometryLogicalTypeAnnotation) logicalType).getCrs()); - } - - @Test - public void testGeographyLogicalType() { - ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter(); - - // Create schema with geography type - MessageType schema = Types.buildMessage() - .required(PrimitiveTypeName.BINARY) - .as(LogicalTypeAnnotation.geographyType("EPSG:4326", EdgeInterpolationAlgorithm.SPHERICAL)) - .named("geogField") - .named("Message"); - - // Convert to parquet schema and back - List parquetSchema = parquetMetadataConverter.toParquetSchema(schema); - MessageType actual = parquetMetadataConverter.fromParquetSchema(parquetSchema, null); - - // Verify the logical type is preserved - assertEquals(schema, actual); - - PrimitiveType primitiveType = actual.getType("geogField").asPrimitiveType(); - LogicalTypeAnnotation logicalType = primitiveType.getLogicalTypeAnnotation(); - assertTrue(logicalType instanceof LogicalTypeAnnotation.GeographyLogicalTypeAnnotation); - - LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyType = - (LogicalTypeAnnotation.GeographyLogicalTypeAnnotation) logicalType; - assertEquals("EPSG:4326", geographyType.getCrs()); - assertEquals(EdgeInterpolationAlgorithm.SPHERICAL, geographyType.getAlgorithm()); - } - - @Test - public void testGeometryLogicalTypeWithMissingCrs() { - // Create a Geometry logical type without specifying CRS - GeometryType geometryType = new GeometryType(); - LogicalType logicalType = new LogicalType(); - logicalType.setGEOMETRY(geometryType); - - // Convert to LogicalTypeAnnotation - ParquetMetadataConverter converter = new ParquetMetadataConverter(); - LogicalTypeAnnotation annotation = converter.getLogicalTypeAnnotation(logicalType); - - // Verify the annotation is created correctly - assertNotNull("Geometry annotation should not be null", annotation); - assertTrue( - "Should be a GeometryLogicalTypeAnnotation", - annotation instanceof LogicalTypeAnnotation.GeometryLogicalTypeAnnotation); - - LogicalTypeAnnotation.GeometryLogicalTypeAnnotation geometryAnnotation = - (LogicalTypeAnnotation.GeometryLogicalTypeAnnotation) annotation; - - // Default behavior should use null or empty CRS - assertNull("CRS should be null or empty when not specified", geometryAnnotation.getCrs()); - } - - @Test - public void testGeographyLogicalTypeWithMissingParameters() { - ParquetMetadataConverter converter = new ParquetMetadataConverter(); - - // Create a Geography logical type without CRS and algorithm - GeographyType geographyType = new GeographyType(); - LogicalType logicalType = new LogicalType(); - logicalType.setGEOGRAPHY(geographyType); - - // Convert to LogicalTypeAnnotation - LogicalTypeAnnotation annotation = converter.getLogicalTypeAnnotation(logicalType); - - // Verify the annotation is created correctly - assertNotNull("Geography annotation should not be null", annotation); - assertTrue( - "Should be a GeographyLogicalTypeAnnotation", - annotation instanceof LogicalTypeAnnotation.GeographyLogicalTypeAnnotation); - - // Check that optional parameters are handled correctly - LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyAnnotation = - (LogicalTypeAnnotation.GeographyLogicalTypeAnnotation) annotation; - assertNull("CRS should be null when not specified", geographyAnnotation.getCrs()); - // Most implementations default to LINEAR when algorithm is not specified - assertNull("Algorithm should be null when not specified", geographyAnnotation.getAlgorithm()); - - // Now test the round-trip conversion - LogicalType roundTripType = converter.convertToLogicalType(annotation); - assertEquals("setField should be GEOGRAPHY", LogicalType._Fields.GEOGRAPHY, roundTripType.getSetField()); - assertNull( - "Round trip CRS should still be null", - roundTripType.getGEOGRAPHY().getCrs()); - assertNull( - "Round trip Algorithm should be null", - roundTripType.getGEOGRAPHY().getAlgorithm()); - } - - @Test - public void testGeographyLogicalTypeWithAlgorithmButNoCrs() { - // Create a Geography logical type with algorithm but no CRS - GeographyType geographyType = new GeographyType(); - geographyType.setAlgorithm(org.apache.parquet.format.EdgeInterpolationAlgorithm.SPHERICAL); - LogicalType logicalType = new LogicalType(); - logicalType.setGEOGRAPHY(geographyType); - - // Convert to LogicalTypeAnnotation - ParquetMetadataConverter converter = new ParquetMetadataConverter(); - LogicalTypeAnnotation annotation = converter.getLogicalTypeAnnotation(logicalType); - - // Verify the annotation is created correctly - Assert.assertNotNull("Geography annotation should not be null", annotation); - LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyAnnotation = - (LogicalTypeAnnotation.GeographyLogicalTypeAnnotation) annotation; - - // CRS should be null/empty but algorithm should be set - assertNull("CRS should be null or empty", geographyAnnotation.getCrs()); - assertEquals( - "Algorithm should be SPHERICAL", - EdgeInterpolationAlgorithm.SPHERICAL, - geographyAnnotation.getAlgorithm()); - } - @Test public void testGeospatialStatisticsConversion() { // Create a ParquetMetadataConverter @@ -1855,8 +1390,9 @@ public void testGeospatialStatisticsConversion() { // Create primitive geometry type for conversion back LogicalTypeAnnotation geometryAnnotation = LogicalTypeAnnotation.geometryType("EPSG:4326"); - PrimitiveType geometryType = - Types.required(PrimitiveTypeName.BINARY).as(geometryAnnotation).named("geometry"); + PrimitiveType geometryType = Types.required(PrimitiveType.PrimitiveTypeName.BINARY) + .as(geometryAnnotation) + .named("geometry"); // Convert back from Thrift format org.apache.parquet.column.statistics.geospatial.GeospatialStatistics convertedStats = @@ -1906,8 +1442,9 @@ public void testGeospatialStatisticsWithNullBoundingBox() { // Create primitive geometry type for conversion back LogicalTypeAnnotation geometryAnnotation = LogicalTypeAnnotation.geometryType("EPSG:4326"); - PrimitiveType geometryType = - Types.required(PrimitiveTypeName.BINARY).as(geometryAnnotation).named("geometry"); + PrimitiveType geometryType = Types.required(PrimitiveType.PrimitiveTypeName.BINARY) + .as(geometryAnnotation) + .named("geometry"); // Convert back from Thrift format org.apache.parquet.column.statistics.geospatial.GeospatialStatistics convertedStats = @@ -1942,23 +1479,4 @@ public void testInvalidBoundingBox() { GeospatialStatistics thriftStats = converter.toParquetGeospatialStatistics(origStats); assertNull("Should return null for invalid BoundingBox", thriftStats); } - - @Test - public void testEdgeInterpolationAlgorithmConversion() { - // Test conversion from Parquet to Thrift enum - org.apache.parquet.column.schema.EdgeInterpolationAlgorithm parquetAlgo = EdgeInterpolationAlgorithm.SPHERICAL; - org.apache.parquet.format.EdgeInterpolationAlgorithm thriftAlgo = - ParquetMetadataConverter.fromParquetEdgeInterpolationAlgorithm(parquetAlgo); - - // convert the Thrift enum to the column schema enum - org.apache.parquet.column.schema.EdgeInterpolationAlgorithm expected = - org.apache.parquet.column.schema.EdgeInterpolationAlgorithm.SPHERICAL; - org.apache.parquet.column.schema.EdgeInterpolationAlgorithm actual = - ParquetMetadataConverter.toParquetEdgeInterpolationAlgorithm(thriftAlgo); - assertEquals(expected, actual); - - // Test with null - assertNull(ParquetMetadataConverter.fromParquetEdgeInterpolationAlgorithm(null)); - assertNull(ParquetMetadataConverter.toParquetEdgeInterpolationAlgorithm(null)); - } } From c85c96300d4f725d6543353b7f132a245a46f451 Mon Sep 17 00:00:00 2001 From: Andrew Pikler Date: Thu, 4 Dec 2025 14:03:03 +0200 Subject: [PATCH 5/6] refactor enum conversion logic into separate class --- .../converters/ParquetEnumConverter.java | 122 ++++++++++++++++++ .../ParquetSchemaConverter.java | 120 +++-------------- .../converters/TestParquetEnumConverter.java | 93 +++++++++++++ .../TestParquetSchemaConverter.java | 68 +--------- .../converter/ParquetMetadataConverter.java | 20 ++- .../TestParquetMetadataConverter.java | 2 +- 6 files changed, 258 insertions(+), 167 deletions(-) create mode 100644 parquet-column/src/main/java/org/apache/parquet/schema/converters/ParquetEnumConverter.java rename parquet-column/src/main/java/org/apache/parquet/schema/{ => converters}/ParquetSchemaConverter.java (85%) create mode 100644 parquet-column/src/test/java/org/apache/parquet/schema/converters/TestParquetEnumConverter.java rename parquet-column/src/test/java/org/apache/parquet/schema/{ => converters}/TestParquetSchemaConverter.java (88%) diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/converters/ParquetEnumConverter.java b/parquet-column/src/main/java/org/apache/parquet/schema/converters/ParquetEnumConverter.java new file mode 100644 index 0000000000..ed458c9d6a --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/schema/converters/ParquetEnumConverter.java @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.schema.converters; + +import org.apache.parquet.format.EdgeInterpolationAlgorithm; +import org.apache.parquet.format.FieldRepetitionType; +import org.apache.parquet.format.TimeUnit; +import org.apache.parquet.format.Type; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.PrimitiveType; + +/** + * This class contains logic to convert between thrift (parquet-format) generated enums and enums in parquet-column. + */ +public class ParquetEnumConverter { + public static PrimitiveType.PrimitiveTypeName getPrimitive(Type type) { + switch (type) { + case BYTE_ARRAY: // TODO: rename BINARY and remove this switch + return PrimitiveType.PrimitiveTypeName.BINARY; + case INT64: + return PrimitiveType.PrimitiveTypeName.INT64; + case INT32: + return PrimitiveType.PrimitiveTypeName.INT32; + case BOOLEAN: + return PrimitiveType.PrimitiveTypeName.BOOLEAN; + case FLOAT: + return PrimitiveType.PrimitiveTypeName.FLOAT; + case DOUBLE: + return PrimitiveType.PrimitiveTypeName.DOUBLE; + case INT96: + return PrimitiveType.PrimitiveTypeName.INT96; + case FIXED_LEN_BYTE_ARRAY: + return PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; + default: + throw new RuntimeException("Unknown type " + type); + } + } + + public static Type getType(PrimitiveType.PrimitiveTypeName type) { + switch (type) { + case INT64: + return Type.INT64; + case INT32: + return Type.INT32; + case BOOLEAN: + return Type.BOOLEAN; + case BINARY: + return Type.BYTE_ARRAY; + case FLOAT: + return Type.FLOAT; + case DOUBLE: + return Type.DOUBLE; + case INT96: + return Type.INT96; + case FIXED_LEN_BYTE_ARRAY: + return Type.FIXED_LEN_BYTE_ARRAY; + default: + throw new RuntimeException("Unknown primitive type " + type); + } + } + + /** Convert Parquet Algorithm enum to Thrift Algorithm enum */ + public static EdgeInterpolationAlgorithm fromParquetEdgeInterpolationAlgorithm( + org.apache.parquet.column.schema.EdgeInterpolationAlgorithm parquetAlgo) { + if (parquetAlgo == null) { + return null; + } + EdgeInterpolationAlgorithm thriftAlgo = EdgeInterpolationAlgorithm.findByValue(parquetAlgo.getValue()); + if (thriftAlgo == null) { + throw new IllegalArgumentException("Unrecognized Parquet EdgeInterpolationAlgorithm: " + parquetAlgo); + } + return thriftAlgo; + } + + /** Convert Thrift Algorithm enum to Parquet Algorithm enum */ + public static org.apache.parquet.column.schema.EdgeInterpolationAlgorithm toParquetEdgeInterpolationAlgorithm( + EdgeInterpolationAlgorithm thriftAlgo) { + if (thriftAlgo == null) { + return null; + } + return org.apache.parquet.column.schema.EdgeInterpolationAlgorithm.findByValue(thriftAlgo.getValue()); + } + + // Visible for testing + static FieldRepetitionType toParquetRepetition(org.apache.parquet.schema.Type.Repetition repetition) { + return FieldRepetitionType.valueOf(repetition.name()); + } + + // Visible for testing + static org.apache.parquet.schema.Type.Repetition fromParquetRepetition(FieldRepetitionType repetition) { + return org.apache.parquet.schema.Type.Repetition.valueOf(repetition.name()); + } + + static LogicalTypeAnnotation.TimeUnit convertTimeUnit(TimeUnit unit) { + switch (unit.getSetField()) { + case MICROS: + return LogicalTypeAnnotation.TimeUnit.MICROS; + case MILLIS: + return LogicalTypeAnnotation.TimeUnit.MILLIS; + case NANOS: + return LogicalTypeAnnotation.TimeUnit.NANOS; + default: + throw new RuntimeException("Unknown time unit " + unit); + } + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/ParquetSchemaConverter.java b/parquet-column/src/main/java/org/apache/parquet/schema/converters/ParquetSchemaConverter.java similarity index 85% rename from parquet-column/src/main/java/org/apache/parquet/schema/ParquetSchemaConverter.java rename to parquet-column/src/main/java/org/apache/parquet/schema/converters/ParquetSchemaConverter.java index bd1404893e..801edcec71 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/ParquetSchemaConverter.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/converters/ParquetSchemaConverter.java @@ -16,10 +16,14 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.parquet.schema; +package org.apache.parquet.schema.converters; import static java.util.Optional.empty; import static java.util.Optional.of; +import static org.apache.parquet.schema.converters.ParquetEnumConverter.fromParquetEdgeInterpolationAlgorithm; +import static org.apache.parquet.schema.converters.ParquetEnumConverter.getPrimitive; +import static org.apache.parquet.schema.converters.ParquetEnumConverter.getType; +import static org.apache.parquet.schema.converters.ParquetEnumConverter.toParquetRepetition; import java.util.ArrayList; import java.util.Iterator; @@ -28,8 +32,6 @@ import org.apache.parquet.format.ColumnOrder; import org.apache.parquet.format.ConvertedType; import org.apache.parquet.format.DecimalType; -import org.apache.parquet.format.EdgeInterpolationAlgorithm; -import org.apache.parquet.format.FieldRepetitionType; import org.apache.parquet.format.GeographyType; import org.apache.parquet.format.GeometryType; import org.apache.parquet.format.IntType; @@ -45,6 +47,13 @@ import org.apache.parquet.format.Type; import org.apache.parquet.format.TypeDefinedOrder; import org.apache.parquet.format.VariantType; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.TypeVisitor; +import org.apache.parquet.schema.Types; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -92,7 +101,8 @@ private void buildChildren( Types.Builder childBuilder; if (schemaElement.type != null) { Types.PrimitiveBuilder primitiveBuilder = builder.primitive( - getPrimitive(schemaElement.type), fromParquetRepetition(schemaElement.repetition_type)); + getPrimitive(schemaElement.type), + ParquetEnumConverter.fromParquetRepetition(schemaElement.repetition_type)); if (schemaElement.isSetType_length()) { primitiveBuilder.length(schemaElement.type_length); } @@ -118,7 +128,7 @@ private void buildChildren( } childBuilder = primitiveBuilder; } else { - childBuilder = builder.group(fromParquetRepetition(schemaElement.repetition_type)); + childBuilder = builder.group(ParquetEnumConverter.fromParquetRepetition(schemaElement.repetition_type)); buildChildren( (Types.GroupBuilder) childBuilder, schema, @@ -156,16 +166,6 @@ && getLogicalTypeAnnotation(schemaElement.logicalType) != null) } } - // Visible for testing - FieldRepetitionType toParquetRepetition(org.apache.parquet.schema.Type.Repetition repetition) { - return FieldRepetitionType.valueOf(repetition.name()); - } - - // Visible for testing - org.apache.parquet.schema.Type.Repetition fromParquetRepetition(FieldRepetitionType repetition) { - return org.apache.parquet.schema.Type.Repetition.valueOf(repetition.name()); - } - private static org.apache.parquet.schema.ColumnOrder fromParquetColumnOrder(ColumnOrder columnOrder) { if (columnOrder.isSetTYPE_ORDER()) { return org.apache.parquet.schema.ColumnOrder.typeDefined(); @@ -174,52 +174,6 @@ private static org.apache.parquet.schema.ColumnOrder fromParquetColumnOrder(Colu return org.apache.parquet.schema.ColumnOrder.undefined(); } - public PrimitiveType.PrimitiveTypeName getPrimitive(Type type) { - switch (type) { - case BYTE_ARRAY: // TODO: rename BINARY and remove this switch - return PrimitiveType.PrimitiveTypeName.BINARY; - case INT64: - return PrimitiveType.PrimitiveTypeName.INT64; - case INT32: - return PrimitiveType.PrimitiveTypeName.INT32; - case BOOLEAN: - return PrimitiveType.PrimitiveTypeName.BOOLEAN; - case FLOAT: - return PrimitiveType.PrimitiveTypeName.FLOAT; - case DOUBLE: - return PrimitiveType.PrimitiveTypeName.DOUBLE; - case INT96: - return PrimitiveType.PrimitiveTypeName.INT96; - case FIXED_LEN_BYTE_ARRAY: - return PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; - default: - throw new RuntimeException("Unknown type " + type); - } - } - - public Type getType(PrimitiveType.PrimitiveTypeName type) { - switch (type) { - case INT64: - return Type.INT64; - case INT32: - return Type.INT32; - case BOOLEAN: - return Type.BOOLEAN; - case BINARY: - return Type.BYTE_ARRAY; - case FLOAT: - return Type.FLOAT; - case DOUBLE: - return Type.DOUBLE; - case INT96: - return Type.INT96; - case FIXED_LEN_BYTE_ARRAY: - return Type.FIXED_LEN_BYTE_ARRAY; - default: - throw new RuntimeException("Unknown primitive type " + type); - } - } - // Visible for testing LogicalTypeAnnotation getLogicalTypeAnnotation(ConvertedType type, SchemaElement schemaElement) { switch (type) { @@ -291,7 +245,8 @@ LogicalTypeAnnotation getLogicalTypeAnnotation(LogicalType type) { return LogicalTypeAnnotation.listType(); case TIME: TimeType time = type.getTIME(); - return LogicalTypeAnnotation.timeType(time.isAdjustedToUTC, convertTimeUnit(time.unit)); + return LogicalTypeAnnotation.timeType( + time.isAdjustedToUTC, ParquetEnumConverter.convertTimeUnit(time.unit)); case STRING: return LogicalTypeAnnotation.stringType(); case DECIMAL: @@ -304,7 +259,8 @@ LogicalTypeAnnotation getLogicalTypeAnnotation(LogicalType type) { return LogicalTypeAnnotation.unknownType(); case TIMESTAMP: TimestampType timestamp = type.getTIMESTAMP(); - return LogicalTypeAnnotation.timestampType(timestamp.isAdjustedToUTC, convertTimeUnit(timestamp.unit)); + return LogicalTypeAnnotation.timestampType( + timestamp.isAdjustedToUTC, ParquetEnumConverter.convertTimeUnit(timestamp.unit)); case UUID: return LogicalTypeAnnotation.uuidType(); case FLOAT16: @@ -315,7 +271,8 @@ LogicalTypeAnnotation getLogicalTypeAnnotation(LogicalType type) { case GEOGRAPHY: GeographyType geography = type.getGEOGRAPHY(); return LogicalTypeAnnotation.geographyType( - geography.getCrs(), toParquetEdgeInterpolationAlgorithm(geography.getAlgorithm())); + geography.getCrs(), + ParquetEnumConverter.toParquetEdgeInterpolationAlgorithm(geography.getAlgorithm())); case VARIANT: VariantType variant = type.getVARIANT(); return LogicalTypeAnnotation.variantType(variant.getSpecification_version()); @@ -324,41 +281,6 @@ LogicalTypeAnnotation getLogicalTypeAnnotation(LogicalType type) { } } - private LogicalTypeAnnotation.TimeUnit convertTimeUnit(TimeUnit unit) { - switch (unit.getSetField()) { - case MICROS: - return LogicalTypeAnnotation.TimeUnit.MICROS; - case MILLIS: - return LogicalTypeAnnotation.TimeUnit.MILLIS; - case NANOS: - return LogicalTypeAnnotation.TimeUnit.NANOS; - default: - throw new RuntimeException("Unknown time unit " + unit); - } - } - - /** Convert Parquet Algorithm enum to Thrift Algorithm enum */ - public static EdgeInterpolationAlgorithm fromParquetEdgeInterpolationAlgorithm( - org.apache.parquet.column.schema.EdgeInterpolationAlgorithm parquetAlgo) { - if (parquetAlgo == null) { - return null; - } - EdgeInterpolationAlgorithm thriftAlgo = EdgeInterpolationAlgorithm.findByValue(parquetAlgo.getValue()); - if (thriftAlgo == null) { - throw new IllegalArgumentException("Unrecognized Parquet EdgeInterpolationAlgorithm: " + parquetAlgo); - } - return thriftAlgo; - } - - /** Convert Thrift Algorithm enum to Parquet Algorithm enum */ - public static org.apache.parquet.column.schema.EdgeInterpolationAlgorithm toParquetEdgeInterpolationAlgorithm( - EdgeInterpolationAlgorithm thriftAlgo) { - if (thriftAlgo == null) { - return null; - } - return org.apache.parquet.column.schema.EdgeInterpolationAlgorithm.findByValue(thriftAlgo.getValue()); - } - /** * Parse parquet-column MessageType and write the {@link ColumnOrder} objects for it. * diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/converters/TestParquetEnumConverter.java b/parquet-column/src/test/java/org/apache/parquet/schema/converters/TestParquetEnumConverter.java new file mode 100644 index 0000000000..c8f402a0af --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/schema/converters/TestParquetEnumConverter.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.schema.converters; + +import org.apache.parquet.column.schema.EdgeInterpolationAlgorithm; +import org.apache.parquet.format.ConvertedType; +import org.apache.parquet.format.FieldRepetitionType; +import org.apache.parquet.format.Type; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + +public class TestParquetEnumConverter +{ + @Test + public void testEnumEquivalence() { + ParquetSchemaConverter parquetMetadataConverter = new ParquetSchemaConverter(); + for (org.apache.parquet.schema.Type.Repetition repetition : + org.apache.parquet.schema.Type.Repetition.values()) { + assertEquals( + repetition, + ParquetEnumConverter.fromParquetRepetition(ParquetEnumConverter.toParquetRepetition(repetition))); + } + for (FieldRepetitionType repetition : FieldRepetitionType.values()) { + assertEquals( + repetition, + ParquetEnumConverter.toParquetRepetition(ParquetEnumConverter.fromParquetRepetition(repetition))); + } + for (PrimitiveType.PrimitiveTypeName primitiveTypeName : PrimitiveType.PrimitiveTypeName.values()) { + assertEquals( + primitiveTypeName, + ParquetEnumConverter.getPrimitive(ParquetEnumConverter.getType(primitiveTypeName))); + } + for (Type type : Type.values()) { + assertEquals(type, ParquetEnumConverter.getType(ParquetEnumConverter.getPrimitive(type))); + } + for (OriginalType original : OriginalType.values()) { + assertEquals( + original, + parquetMetadataConverter + .getLogicalTypeAnnotation( + parquetMetadataConverter.convertToConvertedType( + LogicalTypeAnnotation.fromOriginalType(original, null)), + null) + .toOriginalType()); + } + for (ConvertedType converted : ConvertedType.values()) { + assertEquals( + converted, + parquetMetadataConverter.convertToConvertedType( + parquetMetadataConverter.getLogicalTypeAnnotation(converted, null))); + } + } + + @Test + public void testEdgeInterpolationAlgorithmConversion() { + // Test conversion from Parquet to Thrift enum + org.apache.parquet.column.schema.EdgeInterpolationAlgorithm parquetAlgo = EdgeInterpolationAlgorithm.SPHERICAL; + org.apache.parquet.format.EdgeInterpolationAlgorithm thriftAlgo = + ParquetEnumConverter.fromParquetEdgeInterpolationAlgorithm(parquetAlgo); + + // convert the Thrift enum to the column schema enum + org.apache.parquet.column.schema.EdgeInterpolationAlgorithm expected = + org.apache.parquet.column.schema.EdgeInterpolationAlgorithm.SPHERICAL; + org.apache.parquet.column.schema.EdgeInterpolationAlgorithm actual = + ParquetEnumConverter.toParquetEdgeInterpolationAlgorithm(thriftAlgo); + assertEquals(expected, actual); + + // Test with null + assertNull(ParquetEnumConverter.fromParquetEdgeInterpolationAlgorithm(null)); + assertNull(ParquetEnumConverter.toParquetEdgeInterpolationAlgorithm(null)); + } +} diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestParquetSchemaConverter.java b/parquet-column/src/test/java/org/apache/parquet/schema/converters/TestParquetSchemaConverter.java similarity index 88% rename from parquet-column/src/test/java/org/apache/parquet/schema/TestParquetSchemaConverter.java rename to parquet-column/src/test/java/org/apache/parquet/schema/converters/TestParquetSchemaConverter.java index ced55c4f91..9b2f6edb27 100644 --- a/parquet-column/src/test/java/org/apache/parquet/schema/TestParquetSchemaConverter.java +++ b/parquet-column/src/test/java/org/apache/parquet/schema/converters/TestParquetSchemaConverter.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.parquet.schema; +package org.apache.parquet.schema.converters; import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MICROS; import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MILLIS; @@ -52,6 +52,11 @@ import org.apache.parquet.format.SchemaElement; import org.apache.parquet.format.StringType; import org.apache.parquet.format.Type; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Types; import org.junit.Assert; import org.junit.Rule; import org.junit.Test; @@ -254,48 +259,6 @@ public void testLogicalToConvertedTypeConversion() { LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance())); } - @Test - public void testEnumEquivalence() { - ParquetSchemaConverter parquetMetadataConverter = new ParquetSchemaConverter(); - for (org.apache.parquet.schema.Type.Repetition repetition : - org.apache.parquet.schema.Type.Repetition.values()) { - assertEquals( - repetition, - parquetMetadataConverter.fromParquetRepetition( - parquetMetadataConverter.toParquetRepetition(repetition))); - } - for (FieldRepetitionType repetition : FieldRepetitionType.values()) { - assertEquals( - repetition, - parquetMetadataConverter.toParquetRepetition( - parquetMetadataConverter.fromParquetRepetition(repetition))); - } - for (PrimitiveType.PrimitiveTypeName primitiveTypeName : PrimitiveType.PrimitiveTypeName.values()) { - assertEquals( - primitiveTypeName, - parquetMetadataConverter.getPrimitive(parquetMetadataConverter.getType(primitiveTypeName))); - } - for (Type type : Type.values()) { - assertEquals(type, parquetMetadataConverter.getType(parquetMetadataConverter.getPrimitive(type))); - } - for (OriginalType original : OriginalType.values()) { - assertEquals( - original, - parquetMetadataConverter - .getLogicalTypeAnnotation( - parquetMetadataConverter.convertToConvertedType( - LogicalTypeAnnotation.fromOriginalType(original, null)), - null) - .toOriginalType()); - } - for (ConvertedType converted : ConvertedType.values()) { - assertEquals( - converted, - parquetMetadataConverter.convertToConvertedType( - parquetMetadataConverter.getLogicalTypeAnnotation(converted, null))); - } - } - @Test public void testMapLogicalType() { ParquetSchemaConverter parquetMetadataConverter = new ParquetSchemaConverter(); @@ -509,23 +472,4 @@ public void testGeographyLogicalTypeWithAlgorithmButNoCrs() { EdgeInterpolationAlgorithm.SPHERICAL, geographyAnnotation.getAlgorithm()); } - - @Test - public void testEdgeInterpolationAlgorithmConversion() { - // Test conversion from Parquet to Thrift enum - org.apache.parquet.column.schema.EdgeInterpolationAlgorithm parquetAlgo = EdgeInterpolationAlgorithm.SPHERICAL; - org.apache.parquet.format.EdgeInterpolationAlgorithm thriftAlgo = - ParquetSchemaConverter.fromParquetEdgeInterpolationAlgorithm(parquetAlgo); - - // convert the Thrift enum to the column schema enum - org.apache.parquet.column.schema.EdgeInterpolationAlgorithm expected = - org.apache.parquet.column.schema.EdgeInterpolationAlgorithm.SPHERICAL; - org.apache.parquet.column.schema.EdgeInterpolationAlgorithm actual = - ParquetSchemaConverter.toParquetEdgeInterpolationAlgorithm(thriftAlgo); - assertEquals(expected, actual); - - // Test with null - assertNull(ParquetSchemaConverter.fromParquetEdgeInterpolationAlgorithm(null)); - assertNull(ParquetSchemaConverter.toParquetEdgeInterpolationAlgorithm(null)); - } } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 883ce6d11a..b3a62abcfa 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -108,10 +108,11 @@ import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.LogicalTypeAnnotation.UUIDLogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.ParquetSchemaConverter; import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Type.Repetition; +import org.apache.parquet.schema.converters.ParquetEnumConverter; +import org.apache.parquet.schema.converters.ParquetSchemaConverter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -278,7 +279,7 @@ private void addRowGroup( encryptMetaData = fileEncryptor.encryptColumnMetaData(columnSetup); } ColumnMetaData metaData = new ColumnMetaData( - parquetSchemaConverter.getType(columnMetaData.getType()), + ParquetEnumConverter.getType(columnMetaData.getType()), toFormatEncodings(columnMetaData.getEncodings()), columnMetaData.getPath().toList(), toFormatCodec(columnMetaData.getCodec()), @@ -886,9 +887,12 @@ public Optional visit( return defaultSortOrder(primitive.getPrimitiveTypeName()); } + /** + * @deprecated Please use {@link ParquetEnumConverter#getPrimitive(Type)} instead. + */ @Deprecated public PrimitiveTypeName getPrimitive(Type type) { - return parquetSchemaConverter.getPrimitive(type); + return ParquetEnumConverter.getPrimitive(type); } private static void addKeyValue(FileMetaData fileMetaData, String key, String value) { @@ -2083,15 +2087,21 @@ public static SizeStatistics toParquetSizeStatistics(org.apache.parquet.column.s return formatStats; } + /** + * @deprecated Use {@link ParquetEnumConverter#fromParquetEdgeInterpolationAlgorithm(org.apache.parquet.column.schema.EdgeInterpolationAlgorithm)} instead. + */ @Deprecated public static EdgeInterpolationAlgorithm fromParquetEdgeInterpolationAlgorithm( org.apache.parquet.column.schema.EdgeInterpolationAlgorithm parquetAlgo) { - return ParquetSchemaConverter.fromParquetEdgeInterpolationAlgorithm(parquetAlgo); + return ParquetEnumConverter.fromParquetEdgeInterpolationAlgorithm(parquetAlgo); } + /** + * @deprecated Use {@link ParquetEnumConverter#toParquetEdgeInterpolationAlgorithm(EdgeInterpolationAlgorithm)} instead. + */ @Deprecated public static org.apache.parquet.column.schema.EdgeInterpolationAlgorithm toParquetEdgeInterpolationAlgorithm( EdgeInterpolationAlgorithm thriftAlgo) { - return ParquetSchemaConverter.toParquetEdgeInterpolationAlgorithm(thriftAlgo); + return ParquetEnumConverter.toParquetEdgeInterpolationAlgorithm(thriftAlgo); } } diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index ee18b741ea..10e41219ee 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -112,11 +112,11 @@ import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.OriginalType; -import org.apache.parquet.schema.ParquetSchemaConverter; import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Type.Repetition; import org.apache.parquet.schema.Types; +import org.apache.parquet.schema.converters.ParquetSchemaConverter; import org.junit.Assert; import org.junit.Rule; import org.junit.Test; From 77094f8a9e446ed0ea3ac5a26393324cfd59ac4c Mon Sep 17 00:00:00 2001 From: Andrew Pikler Date: Thu, 4 Dec 2025 14:53:51 +0200 Subject: [PATCH 6/6] deduplicate OriginalType conversion code --- .../converters/ParquetEnumConverter.java | 6 ++ .../converters/ParquetSchemaConverter.java | 57 ++----------------- .../converters/TestParquetEnumConverter.java | 18 ++++-- 3 files changed, 25 insertions(+), 56 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/converters/ParquetEnumConverter.java b/parquet-column/src/main/java/org/apache/parquet/schema/converters/ParquetEnumConverter.java index ed458c9d6a..edf3b2ebea 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/converters/ParquetEnumConverter.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/converters/ParquetEnumConverter.java @@ -18,11 +18,13 @@ */ package org.apache.parquet.schema.converters; +import org.apache.parquet.format.ConvertedType; import org.apache.parquet.format.EdgeInterpolationAlgorithm; import org.apache.parquet.format.FieldRepetitionType; import org.apache.parquet.format.TimeUnit; import org.apache.parquet.format.Type; import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.PrimitiveType; /** @@ -119,4 +121,8 @@ static LogicalTypeAnnotation.TimeUnit convertTimeUnit(TimeUnit unit) { throw new RuntimeException("Unknown time unit " + unit); } } + + static OriginalType toParquetOriginalType(ConvertedType convertedType) { + return OriginalType.valueOf(convertedType.name()); + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/converters/ParquetSchemaConverter.java b/parquet-column/src/main/java/org/apache/parquet/schema/converters/ParquetSchemaConverter.java index 801edcec71..9ebf28f452 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/converters/ParquetSchemaConverter.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/converters/ParquetSchemaConverter.java @@ -23,6 +23,7 @@ import static org.apache.parquet.schema.converters.ParquetEnumConverter.fromParquetEdgeInterpolationAlgorithm; import static org.apache.parquet.schema.converters.ParquetEnumConverter.getPrimitive; import static org.apache.parquet.schema.converters.ParquetEnumConverter.getType; +import static org.apache.parquet.schema.converters.ParquetEnumConverter.toParquetOriginalType; import static org.apache.parquet.schema.converters.ParquetEnumConverter.toParquetRepetition; import java.util.ArrayList; @@ -47,6 +48,7 @@ import org.apache.parquet.format.Type; import org.apache.parquet.format.TypeDefinedOrder; import org.apache.parquet.format.VariantType; +import org.apache.parquet.schema.DecimalMetadata; import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; @@ -176,57 +178,10 @@ private static org.apache.parquet.schema.ColumnOrder fromParquetColumnOrder(Colu // Visible for testing LogicalTypeAnnotation getLogicalTypeAnnotation(ConvertedType type, SchemaElement schemaElement) { - switch (type) { - case UTF8: - return LogicalTypeAnnotation.stringType(); - case MAP: - return LogicalTypeAnnotation.mapType(); - case MAP_KEY_VALUE: - return LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance(); - case LIST: - return LogicalTypeAnnotation.listType(); - case ENUM: - return LogicalTypeAnnotation.enumType(); - case DECIMAL: - int scale = (schemaElement == null ? 0 : schemaElement.scale); - int precision = (schemaElement == null ? 0 : schemaElement.precision); - return LogicalTypeAnnotation.decimalType(scale, precision); - case DATE: - return LogicalTypeAnnotation.dateType(); - case TIME_MILLIS: - return LogicalTypeAnnotation.timeType(true, LogicalTypeAnnotation.TimeUnit.MILLIS); - case TIME_MICROS: - return LogicalTypeAnnotation.timeType(true, LogicalTypeAnnotation.TimeUnit.MICROS); - case TIMESTAMP_MILLIS: - return LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MILLIS); - case TIMESTAMP_MICROS: - return LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MICROS); - case INTERVAL: - return LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance(); - case INT_8: - return LogicalTypeAnnotation.intType(8, true); - case INT_16: - return LogicalTypeAnnotation.intType(16, true); - case INT_32: - return LogicalTypeAnnotation.intType(32, true); - case INT_64: - return LogicalTypeAnnotation.intType(64, true); - case UINT_8: - return LogicalTypeAnnotation.intType(8, false); - case UINT_16: - return LogicalTypeAnnotation.intType(16, false); - case UINT_32: - return LogicalTypeAnnotation.intType(32, false); - case UINT_64: - return LogicalTypeAnnotation.intType(64, false); - case JSON: - return LogicalTypeAnnotation.jsonType(); - case BSON: - return LogicalTypeAnnotation.bsonType(); - default: - throw new RuntimeException( - "Can't convert converted type to logical type, unknown converted type " + type); - } + int scale = (schemaElement == null ? 0 : schemaElement.scale); + int precision = (schemaElement == null ? 0 : schemaElement.precision); + return LogicalTypeAnnotation.fromOriginalType( + toParquetOriginalType(type), new DecimalMetadata(scale, precision)); } LogicalTypeAnnotation getLogicalTypeAnnotation(LogicalType type) { diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/converters/TestParquetEnumConverter.java b/parquet-column/src/test/java/org/apache/parquet/schema/converters/TestParquetEnumConverter.java index c8f402a0af..5fd0d831e0 100644 --- a/parquet-column/src/test/java/org/apache/parquet/schema/converters/TestParquetEnumConverter.java +++ b/parquet-column/src/test/java/org/apache/parquet/schema/converters/TestParquetEnumConverter.java @@ -18,6 +18,9 @@ */ package org.apache.parquet.schema.converters; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + import org.apache.parquet.column.schema.EdgeInterpolationAlgorithm; import org.apache.parquet.format.ConvertedType; import org.apache.parquet.format.FieldRepetitionType; @@ -27,11 +30,7 @@ import org.apache.parquet.schema.PrimitiveType; import org.junit.Test; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNull; - -public class TestParquetEnumConverter -{ +public class TestParquetEnumConverter { @Test public void testEnumEquivalence() { ParquetSchemaConverter parquetMetadataConverter = new ParquetSchemaConverter(); @@ -90,4 +89,13 @@ public void testEdgeInterpolationAlgorithmConversion() { assertNull(ParquetEnumConverter.fromParquetEdgeInterpolationAlgorithm(null)); assertNull(ParquetEnumConverter.toParquetEdgeInterpolationAlgorithm(null)); } + + @Test + public void testConvertedTypeToOriginalType() { + for (ConvertedType convertedType : ConvertedType.values()) { + assertEquals( + convertedType.name(), + ParquetEnumConverter.toParquetOriginalType(convertedType).name()); + } + } }