diff --git a/parquet-column/pom.xml b/parquet-column/pom.xml
index 8bd11dcc26..64ddf6861b 100644
--- a/parquet-column/pom.xml
+++ b/parquet-column/pom.xml
@@ -48,6 +48,11 @@
       <type>test-jar</type>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.apache.parquet</groupId>
+      <artifactId>parquet-format-structures</artifactId>
+      <version>${project.version}</version>
+    </dependency>
     <dependency>
       <groupId>org.apache.parquet</groupId>
       <artifactId>parquet-encoding</artifactId>
diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/converters/ParquetEnumConverter.java b/parquet-column/src/main/java/org/apache/parquet/schema/converters/ParquetEnumConverter.java
new file mode 100644
index 0000000000..edf3b2ebea
--- /dev/null
+++ b/parquet-column/src/main/java/org/apache/parquet/schema/converters/ParquetEnumConverter.java
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.schema.converters;
+
+import org.apache.parquet.format.ConvertedType;
+import org.apache.parquet.format.EdgeInterpolationAlgorithm;
+import org.apache.parquet.format.FieldRepetitionType;
+import org.apache.parquet.format.TimeUnit;
+import org.apache.parquet.format.Type;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
+import org.apache.parquet.schema.OriginalType;
+import org.apache.parquet.schema.PrimitiveType;
+
+/**
+ * This class contains logic to convert between thrift (parquet-format) generated enums and enums in parquet-column.
+ */
+public class ParquetEnumConverter {
+ public static PrimitiveType.PrimitiveTypeName getPrimitive(Type type) {
+ switch (type) {
+ case BYTE_ARRAY: // TODO: rename BINARY and remove this switch
+ return PrimitiveType.PrimitiveTypeName.BINARY;
+ case INT64:
+ return PrimitiveType.PrimitiveTypeName.INT64;
+ case INT32:
+ return PrimitiveType.PrimitiveTypeName.INT32;
+ case BOOLEAN:
+ return PrimitiveType.PrimitiveTypeName.BOOLEAN;
+ case FLOAT:
+ return PrimitiveType.PrimitiveTypeName.FLOAT;
+ case DOUBLE:
+ return PrimitiveType.PrimitiveTypeName.DOUBLE;
+ case INT96:
+ return PrimitiveType.PrimitiveTypeName.INT96;
+ case FIXED_LEN_BYTE_ARRAY:
+ return PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
+ default:
+ throw new RuntimeException("Unknown type " + type);
+ }
+ }
+
+ public static Type getType(PrimitiveType.PrimitiveTypeName type) {
+ switch (type) {
+ case INT64:
+ return Type.INT64;
+ case INT32:
+ return Type.INT32;
+ case BOOLEAN:
+ return Type.BOOLEAN;
+ case BINARY:
+ return Type.BYTE_ARRAY;
+ case FLOAT:
+ return Type.FLOAT;
+ case DOUBLE:
+ return Type.DOUBLE;
+ case INT96:
+ return Type.INT96;
+ case FIXED_LEN_BYTE_ARRAY:
+ return Type.FIXED_LEN_BYTE_ARRAY;
+ default:
+ throw new RuntimeException("Unknown primitive type " + type);
+ }
+ }
+
+ /** Convert Parquet Algorithm enum to Thrift Algorithm enum */
+ public static EdgeInterpolationAlgorithm fromParquetEdgeInterpolationAlgorithm(
+ org.apache.parquet.column.schema.EdgeInterpolationAlgorithm parquetAlgo) {
+ if (parquetAlgo == null) {
+ return null;
+ }
+ EdgeInterpolationAlgorithm thriftAlgo = EdgeInterpolationAlgorithm.findByValue(parquetAlgo.getValue());
+ if (thriftAlgo == null) {
+ throw new IllegalArgumentException("Unrecognized Parquet EdgeInterpolationAlgorithm: " + parquetAlgo);
+ }
+ return thriftAlgo;
+ }
+
+ /** Convert Thrift Algorithm enum to Parquet Algorithm enum */
+ public static org.apache.parquet.column.schema.EdgeInterpolationAlgorithm toParquetEdgeInterpolationAlgorithm(
+ EdgeInterpolationAlgorithm thriftAlgo) {
+ if (thriftAlgo == null) {
+ return null;
+ }
+ return org.apache.parquet.column.schema.EdgeInterpolationAlgorithm.findByValue(thriftAlgo.getValue());
+ }
+
+ // Visible for testing
+ static FieldRepetitionType toParquetRepetition(org.apache.parquet.schema.Type.Repetition repetition) {
+ return FieldRepetitionType.valueOf(repetition.name());
+ }
+
+ // Visible for testing
+ static org.apache.parquet.schema.Type.Repetition fromParquetRepetition(FieldRepetitionType repetition) {
+ return org.apache.parquet.schema.Type.Repetition.valueOf(repetition.name());
+ }
+
+ static LogicalTypeAnnotation.TimeUnit convertTimeUnit(TimeUnit unit) {
+ switch (unit.getSetField()) {
+ case MICROS:
+ return LogicalTypeAnnotation.TimeUnit.MICROS;
+ case MILLIS:
+ return LogicalTypeAnnotation.TimeUnit.MILLIS;
+ case NANOS:
+ return LogicalTypeAnnotation.TimeUnit.NANOS;
+ default:
+ throw new RuntimeException("Unknown time unit " + unit);
+ }
+ }
+
+ static OriginalType toParquetOriginalType(ConvertedType convertedType) {
+ return OriginalType.valueOf(convertedType.name());
+ }
+}
diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/converters/ParquetSchemaConverter.java b/parquet-column/src/main/java/org/apache/parquet/schema/converters/ParquetSchemaConverter.java
new file mode 100644
index 0000000000..9ebf28f452
--- /dev/null
+++ b/parquet-column/src/main/java/org/apache/parquet/schema/converters/ParquetSchemaConverter.java
@@ -0,0 +1,551 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.schema.converters;
+
+import static java.util.Optional.empty;
+import static java.util.Optional.of;
+import static org.apache.parquet.schema.converters.ParquetEnumConverter.fromParquetEdgeInterpolationAlgorithm;
+import static org.apache.parquet.schema.converters.ParquetEnumConverter.getPrimitive;
+import static org.apache.parquet.schema.converters.ParquetEnumConverter.getType;
+import static org.apache.parquet.schema.converters.ParquetEnumConverter.toParquetOriginalType;
+import static org.apache.parquet.schema.converters.ParquetEnumConverter.toParquetRepetition;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Optional;
+import org.apache.parquet.format.ColumnOrder;
+import org.apache.parquet.format.ConvertedType;
+import org.apache.parquet.format.DecimalType;
+import org.apache.parquet.format.GeographyType;
+import org.apache.parquet.format.GeometryType;
+import org.apache.parquet.format.IntType;
+import org.apache.parquet.format.LogicalType;
+import org.apache.parquet.format.LogicalTypes;
+import org.apache.parquet.format.MicroSeconds;
+import org.apache.parquet.format.MilliSeconds;
+import org.apache.parquet.format.NanoSeconds;
+import org.apache.parquet.format.SchemaElement;
+import org.apache.parquet.format.TimeType;
+import org.apache.parquet.format.TimeUnit;
+import org.apache.parquet.format.TimestampType;
+import org.apache.parquet.format.Type;
+import org.apache.parquet.format.TypeDefinedOrder;
+import org.apache.parquet.format.VariantType;
+import org.apache.parquet.schema.DecimalMetadata;
+import org.apache.parquet.schema.GroupType;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.OriginalType;
+import org.apache.parquet.schema.PrimitiveType;
+import org.apache.parquet.schema.TypeVisitor;
+import org.apache.parquet.schema.Types;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class converts between the thrift based {@link org.apache.parquet.format.SchemaElement} class and the
+ * parquet-column {@link org.apache.parquet.column.ColumnDescriptor} classes.
+ */
+public class ParquetSchemaConverter {
+ private static final Logger LOG = LoggerFactory.getLogger(ParquetSchemaConverter.class);
+ private static final TypeDefinedOrder TYPE_DEFINED_ORDER = new TypeDefinedOrder();
+ private static final LogicalTypeConverterVisitor LOGICAL_TYPE_ANNOTATION_VISITOR =
+ new LogicalTypeConverterVisitor();
+ private static final ConvertedTypeConverterVisitor CONVERTED_TYPE_CONVERTER_VISITOR =
+ new ConvertedTypeConverterVisitor();
+
+ public ParquetSchemaConverter() {}
+
+    /**
+     * Create a parquet-column MessageType from thrift {@link SchemaElement} and {@link ColumnOrder} objects.
+     *
+     * @param schema the thrift schema elements in depth-first order; columnOrders may be null
+     * @return the reconstructed {@link MessageType} schema
+     */
+    public MessageType fromParquetSchema(List<SchemaElement> schema, List<ColumnOrder> columnOrders) {
+        Iterator<SchemaElement> iterator = schema.iterator();
+ SchemaElement root = iterator.next();
+ Types.MessageTypeBuilder builder = Types.buildMessage();
+ if (root.isSetField_id()) {
+ builder.id(root.field_id);
+ }
+ buildChildren(builder, iterator, root.getNum_children(), columnOrders, 0);
+ return builder.named(root.name);
+ }
+
+    private void buildChildren(
+            Types.GroupBuilder builder,
+            Iterator<SchemaElement> schema,
+            int childrenCount,
+            List<ColumnOrder> columnOrders,
+            int columnCount) {
+ for (int i = 0; i < childrenCount; i++) {
+ SchemaElement schemaElement = schema.next();
+
+ // Create Parquet Type.
+ Types.Builder childBuilder;
+ if (schemaElement.type != null) {
+ Types.PrimitiveBuilder primitiveBuilder = builder.primitive(
+ getPrimitive(schemaElement.type),
+ ParquetEnumConverter.fromParquetRepetition(schemaElement.repetition_type));
+ if (schemaElement.isSetType_length()) {
+ primitiveBuilder.length(schemaElement.type_length);
+ }
+ if (schemaElement.isSetPrecision()) {
+ primitiveBuilder.precision(schemaElement.precision);
+ }
+ if (schemaElement.isSetScale()) {
+ primitiveBuilder.scale(schemaElement.scale);
+ }
+ if (columnOrders != null) {
+ org.apache.parquet.schema.ColumnOrder columnOrder =
+ fromParquetColumnOrder(columnOrders.get(columnCount));
+ // As per parquet format 2.4.0 no UNDEFINED order is supported. So, set undefined column order for
+ // the types
+ // where ordering is not supported.
+ if (columnOrder.getColumnOrderName()
+ == org.apache.parquet.schema.ColumnOrder.ColumnOrderName.TYPE_DEFINED_ORDER
+ && (schemaElement.type == Type.INT96
+ || schemaElement.converted_type == ConvertedType.INTERVAL)) {
+ columnOrder = org.apache.parquet.schema.ColumnOrder.undefined();
+ }
+ primitiveBuilder.columnOrder(columnOrder);
+ }
+ childBuilder = primitiveBuilder;
+ } else {
+ childBuilder = builder.group(ParquetEnumConverter.fromParquetRepetition(schemaElement.repetition_type));
+ buildChildren(
+ (Types.GroupBuilder) childBuilder,
+ schema,
+ schemaElement.num_children,
+ columnOrders,
+ columnCount);
+ }
+
+ if (schemaElement.isSetLogicalType()) {
+ childBuilder.as(getLogicalTypeAnnotation(schemaElement.logicalType));
+ }
+ if (schemaElement.isSetConverted_type()) {
+ OriginalType originalType = getLogicalTypeAnnotation(schemaElement.converted_type, schemaElement)
+ .toOriginalType();
+ OriginalType newOriginalType = (schemaElement.isSetLogicalType()
+ && getLogicalTypeAnnotation(schemaElement.logicalType) != null)
+ ? getLogicalTypeAnnotation(schemaElement.logicalType).toOriginalType()
+ : null;
+ if (!originalType.equals(newOriginalType)) {
+ if (newOriginalType != null) {
+ LOG.warn(
+ "Converted type and logical type metadata mismatch (convertedType: {}, logical type: {}). Using value in converted type.",
+ schemaElement.converted_type,
+ schemaElement.logicalType);
+ }
+ childBuilder.as(originalType);
+ }
+ }
+ if (schemaElement.isSetField_id()) {
+ childBuilder.id(schemaElement.field_id);
+ }
+
+ childBuilder.named(schemaElement.name);
+ ++columnCount;
+ }
+ }
+
+ private static org.apache.parquet.schema.ColumnOrder fromParquetColumnOrder(ColumnOrder columnOrder) {
+ if (columnOrder.isSetTYPE_ORDER()) {
+ return org.apache.parquet.schema.ColumnOrder.typeDefined();
+ }
+ // The column order is not yet supported by this API
+ return org.apache.parquet.schema.ColumnOrder.undefined();
+ }
+
+ // Visible for testing
+ LogicalTypeAnnotation getLogicalTypeAnnotation(ConvertedType type, SchemaElement schemaElement) {
+ int scale = (schemaElement == null ? 0 : schemaElement.scale);
+ int precision = (schemaElement == null ? 0 : schemaElement.precision);
+ return LogicalTypeAnnotation.fromOriginalType(
+ toParquetOriginalType(type), new DecimalMetadata(scale, precision));
+ }
+
+ LogicalTypeAnnotation getLogicalTypeAnnotation(LogicalType type) {
+ switch (type.getSetField()) {
+ case MAP:
+ return LogicalTypeAnnotation.mapType();
+ case BSON:
+ return LogicalTypeAnnotation.bsonType();
+ case DATE:
+ return LogicalTypeAnnotation.dateType();
+ case ENUM:
+ return LogicalTypeAnnotation.enumType();
+ case JSON:
+ return LogicalTypeAnnotation.jsonType();
+ case LIST:
+ return LogicalTypeAnnotation.listType();
+ case TIME:
+ TimeType time = type.getTIME();
+ return LogicalTypeAnnotation.timeType(
+ time.isAdjustedToUTC, ParquetEnumConverter.convertTimeUnit(time.unit));
+ case STRING:
+ return LogicalTypeAnnotation.stringType();
+ case DECIMAL:
+ DecimalType decimal = type.getDECIMAL();
+ return LogicalTypeAnnotation.decimalType(decimal.scale, decimal.precision);
+ case INTEGER:
+ IntType integer = type.getINTEGER();
+ return LogicalTypeAnnotation.intType(integer.bitWidth, integer.isSigned);
+ case UNKNOWN:
+ return LogicalTypeAnnotation.unknownType();
+ case TIMESTAMP:
+ TimestampType timestamp = type.getTIMESTAMP();
+ return LogicalTypeAnnotation.timestampType(
+ timestamp.isAdjustedToUTC, ParquetEnumConverter.convertTimeUnit(timestamp.unit));
+ case UUID:
+ return LogicalTypeAnnotation.uuidType();
+ case FLOAT16:
+ return LogicalTypeAnnotation.float16Type();
+ case GEOMETRY:
+ GeometryType geometry = type.getGEOMETRY();
+ return LogicalTypeAnnotation.geometryType(geometry.getCrs());
+ case GEOGRAPHY:
+ GeographyType geography = type.getGEOGRAPHY();
+ return LogicalTypeAnnotation.geographyType(
+ geography.getCrs(),
+ ParquetEnumConverter.toParquetEdgeInterpolationAlgorithm(geography.getAlgorithm()));
+ case VARIANT:
+ VariantType variant = type.getVARIANT();
+ return LogicalTypeAnnotation.variantType(variant.getSpecification_version());
+ default:
+ throw new RuntimeException("Unknown logical type " + type);
+ }
+ }
+
+ /**
+ * Parse parquet-column MessageType and write the {@link ColumnOrder} objects for it.
+ *
+ * @param schema the {@link MessageType} schema
+ * @return the ordering defined for each of the columns
+ */
+    public List<ColumnOrder> getColumnOrders(MessageType schema) {
+        List<ColumnOrder> columnOrders = new ArrayList<>();
+ // Currently, only TypeDefinedOrder is supported, so we create a column order for each columns with
+ // TypeDefinedOrder even if some types (e.g. INT96) have undefined column orders.
+ for (int i = 0, n = schema.getPaths().size(); i < n; ++i) {
+ ColumnOrder columnOrder = new ColumnOrder();
+ columnOrder.setTYPE_ORDER(TYPE_DEFINED_ORDER);
+ columnOrders.add(columnOrder);
+ }
+ return columnOrders;
+ }
+
+ /**
+ * Parse parquet-column MessageType and write the {@link SchemaElement} objects for it.
+ *
+ * @param schema the {@link MessageType} schema
+ * @return the {@link SchemaElement} objects for this parquet-column schema
+ */
+    public List<SchemaElement> toParquetSchema(MessageType schema) {
+        List<SchemaElement> result = new ArrayList<SchemaElement>();
+ addToList(result, schema);
+ return result;
+ }
+
+    private void addToList(final List<SchemaElement> result, org.apache.parquet.schema.Type field) {
+ field.accept(new TypeVisitor() {
+ @Override
+ public void visit(PrimitiveType primitiveType) {
+ SchemaElement element = new SchemaElement(primitiveType.getName());
+ element.setRepetition_type(toParquetRepetition(primitiveType.getRepetition()));
+ element.setType(getType(primitiveType.getPrimitiveTypeName()));
+ if (primitiveType.getLogicalTypeAnnotation() != null) {
+ element.setConverted_type(convertToConvertedType(primitiveType.getLogicalTypeAnnotation()));
+ element.setLogicalType(convertToLogicalType(primitiveType.getLogicalTypeAnnotation()));
+ }
+ if (primitiveType.getDecimalMetadata() != null) {
+ element.setPrecision(primitiveType.getDecimalMetadata().getPrecision());
+ element.setScale(primitiveType.getDecimalMetadata().getScale());
+ }
+ if (primitiveType.getTypeLength() > 0) {
+ element.setType_length(primitiveType.getTypeLength());
+ }
+ if (primitiveType.getId() != null) {
+ element.setField_id(primitiveType.getId().intValue());
+ }
+ result.add(element);
+ }
+
+ @Override
+ public void visit(MessageType messageType) {
+ SchemaElement element = new SchemaElement(messageType.getName());
+ if (messageType.getId() != null) {
+ element.setField_id(messageType.getId().intValue());
+ }
+ visitChildren(result, messageType.asGroupType(), element);
+ }
+
+ @Override
+ public void visit(GroupType groupType) {
+ SchemaElement element = new SchemaElement(groupType.getName());
+ element.setRepetition_type(toParquetRepetition(groupType.getRepetition()));
+ if (groupType.getLogicalTypeAnnotation() != null) {
+ element.setConverted_type(convertToConvertedType(groupType.getLogicalTypeAnnotation()));
+ element.setLogicalType(convertToLogicalType(groupType.getLogicalTypeAnnotation()));
+ }
+ if (groupType.getId() != null) {
+ element.setField_id(groupType.getId().intValue());
+ }
+ visitChildren(result, groupType, element);
+ }
+
+        private void visitChildren(final List<SchemaElement> result, GroupType groupType, SchemaElement element) {
+ element.setNum_children(groupType.getFieldCount());
+ result.add(element);
+ for (org.apache.parquet.schema.Type field : groupType.getFields()) {
+ addToList(result, field);
+ }
+ }
+ });
+ }
+
+ LogicalType convertToLogicalType(LogicalTypeAnnotation logicalTypeAnnotation) {
+ return logicalTypeAnnotation.accept(LOGICAL_TYPE_ANNOTATION_VISITOR).orElse(null);
+ }
+
+ ConvertedType convertToConvertedType(LogicalTypeAnnotation logicalTypeAnnotation) {
+ return logicalTypeAnnotation.accept(CONVERTED_TYPE_CONVERTER_VISITOR).orElse(null);
+ }
+
+ static org.apache.parquet.format.TimeUnit convertUnit(LogicalTypeAnnotation.TimeUnit unit) {
+ switch (unit) {
+ case MICROS:
+ return org.apache.parquet.format.TimeUnit.MICROS(new MicroSeconds());
+ case MILLIS:
+ return org.apache.parquet.format.TimeUnit.MILLIS(new MilliSeconds());
+ case NANOS:
+ return TimeUnit.NANOS(new NanoSeconds());
+ default:
+ throw new RuntimeException("Unknown time unit " + unit);
+ }
+ }
+
+    private static class ConvertedTypeConverterVisitor
+            implements LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<ConvertedType> {
+ @Override
+        public Optional<ConvertedType> visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) {
+ return of(ConvertedType.UTF8);
+ }
+
+ @Override
+        public Optional<ConvertedType> visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) {
+ return of(ConvertedType.MAP);
+ }
+
+ @Override
+        public Optional<ConvertedType> visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) {
+ return of(ConvertedType.LIST);
+ }
+
+ @Override
+        public Optional<ConvertedType> visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) {
+ return of(ConvertedType.ENUM);
+ }
+
+ @Override
+        public Optional<ConvertedType> visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) {
+ return of(ConvertedType.DECIMAL);
+ }
+
+ @Override
+        public Optional<ConvertedType> visit(LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) {
+ return of(ConvertedType.DATE);
+ }
+
+ @Override
+        public Optional<ConvertedType> visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) {
+ switch (timeLogicalType.getUnit()) {
+ case MILLIS:
+ return of(ConvertedType.TIME_MILLIS);
+ case MICROS:
+ return of(ConvertedType.TIME_MICROS);
+ case NANOS:
+ return empty();
+ default:
+ throw new RuntimeException("Unknown converted type for " + timeLogicalType.toOriginalType());
+ }
+ }
+
+ @Override
+        public Optional<ConvertedType> visit(
+                LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) {
+ switch (timestampLogicalType.getUnit()) {
+ case MICROS:
+ return of(ConvertedType.TIMESTAMP_MICROS);
+ case MILLIS:
+ return of(ConvertedType.TIMESTAMP_MILLIS);
+ case NANOS:
+ return empty();
+ default:
+ throw new RuntimeException("Unknown converted type for " + timestampLogicalType.toOriginalType());
+ }
+ }
+
+ @Override
+        public Optional<ConvertedType> visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) {
+ boolean signed = intLogicalType.isSigned();
+ switch (intLogicalType.getBitWidth()) {
+ case 8:
+ return of(signed ? ConvertedType.INT_8 : ConvertedType.UINT_8);
+ case 16:
+ return of(signed ? ConvertedType.INT_16 : ConvertedType.UINT_16);
+ case 32:
+ return of(signed ? ConvertedType.INT_32 : ConvertedType.UINT_32);
+ case 64:
+ return of(signed ? ConvertedType.INT_64 : ConvertedType.UINT_64);
+ default:
+ throw new RuntimeException("Unknown original type " + intLogicalType.toOriginalType());
+ }
+ }
+
+ @Override
+        public Optional<ConvertedType> visit(LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) {
+ return of(ConvertedType.JSON);
+ }
+
+ @Override
+        public Optional<ConvertedType> visit(LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) {
+ return of(ConvertedType.BSON);
+ }
+
+ @Override
+        public Optional<ConvertedType> visit(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) {
+ return of(ConvertedType.INTERVAL);
+ }
+
+ @Override
+        public Optional<ConvertedType> visit(LogicalTypeAnnotation.MapKeyValueTypeAnnotation mapKeyValueLogicalType) {
+ return of(ConvertedType.MAP_KEY_VALUE);
+ }
+ }
+
+    private static class LogicalTypeConverterVisitor
+            implements LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<LogicalType> {
+ @Override
+        public Optional<LogicalType> visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) {
+ return of(LogicalTypes.UTF8);
+ }
+
+ @Override
+        public Optional<LogicalType> visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) {
+ return of(LogicalTypes.MAP);
+ }
+
+ @Override
+        public Optional<LogicalType> visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) {
+ return of(LogicalTypes.LIST);
+ }
+
+ @Override
+        public Optional<LogicalType> visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) {
+ return of(LogicalTypes.ENUM);
+ }
+
+ @Override
+        public Optional<LogicalType> visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) {
+ return of(LogicalTypes.DECIMAL(decimalLogicalType.getScale(), decimalLogicalType.getPrecision()));
+ }
+
+ @Override
+        public Optional<LogicalType> visit(LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) {
+ return of(LogicalTypes.DATE);
+ }
+
+ @Override
+        public Optional<LogicalType> visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) {
+ return of(LogicalType.TIME(
+ new TimeType(timeLogicalType.isAdjustedToUTC(), convertUnit(timeLogicalType.getUnit()))));
+ }
+
+ @Override
+        public Optional<LogicalType> visit(LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) {
+ return of(LogicalType.TIMESTAMP(new TimestampType(
+ timestampLogicalType.isAdjustedToUTC(), convertUnit(timestampLogicalType.getUnit()))));
+ }
+
+ @Override
+        public Optional<LogicalType> visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) {
+ return of(LogicalType.INTEGER(new IntType((byte) intLogicalType.getBitWidth(), intLogicalType.isSigned())));
+ }
+
+ @Override
+        public Optional<LogicalType> visit(LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) {
+ return of(LogicalTypes.JSON);
+ }
+
+ @Override
+        public Optional<LogicalType> visit(LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) {
+ return of(LogicalTypes.BSON);
+ }
+
+ @Override
+        public Optional<LogicalType> visit(LogicalTypeAnnotation.UUIDLogicalTypeAnnotation uuidLogicalType) {
+ return of(LogicalTypes.UUID);
+ }
+
+ @Override
+        public Optional<LogicalType> visit(LogicalTypeAnnotation.Float16LogicalTypeAnnotation float16LogicalType) {
+ return of(LogicalTypes.FLOAT16);
+ }
+
+ @Override
+        public Optional<LogicalType> visit(LogicalTypeAnnotation.UnknownLogicalTypeAnnotation unknownLogicalType) {
+ return of(LogicalTypes.UNKNOWN);
+ }
+
+ @Override
+        public Optional<LogicalType> visit(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) {
+ return of(LogicalTypes.UNKNOWN);
+ }
+
+ @Override
+        public Optional<LogicalType> visit(LogicalTypeAnnotation.VariantLogicalTypeAnnotation variantLogicalType) {
+ return of(LogicalTypes.VARIANT(variantLogicalType.getSpecVersion()));
+ }
+
+ @Override
+        public Optional<LogicalType> visit(LogicalTypeAnnotation.GeometryLogicalTypeAnnotation geometryLogicalType) {
+ GeometryType geometryType = new GeometryType();
+ if (geometryLogicalType.getCrs() != null
+ && !geometryLogicalType.getCrs().isEmpty()) {
+ geometryType.setCrs(geometryLogicalType.getCrs());
+ }
+ return of(LogicalType.GEOMETRY(geometryType));
+ }
+
+ @Override
+        public Optional<LogicalType> visit(LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyLogicalType) {
+ GeographyType geographyType = new GeographyType();
+ if (geographyLogicalType.getCrs() != null
+ && !geographyLogicalType.getCrs().isEmpty()) {
+ geographyType.setCrs(geographyLogicalType.getCrs());
+ }
+ geographyType.setAlgorithm(fromParquetEdgeInterpolationAlgorithm(geographyLogicalType.getAlgorithm()));
+ return of(LogicalType.GEOGRAPHY(geographyType));
+ }
+ }
+}
diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/converters/TestParquetEnumConverter.java b/parquet-column/src/test/java/org/apache/parquet/schema/converters/TestParquetEnumConverter.java
new file mode 100644
index 0000000000..5fd0d831e0
--- /dev/null
+++ b/parquet-column/src/test/java/org/apache/parquet/schema/converters/TestParquetEnumConverter.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.schema.converters;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+import org.apache.parquet.column.schema.EdgeInterpolationAlgorithm;
+import org.apache.parquet.format.ConvertedType;
+import org.apache.parquet.format.FieldRepetitionType;
+import org.apache.parquet.format.Type;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
+import org.apache.parquet.schema.OriginalType;
+import org.apache.parquet.schema.PrimitiveType;
+import org.junit.Test;
+
+public class TestParquetEnumConverter {
+ @Test
+ public void testEnumEquivalence() {
+ ParquetSchemaConverter parquetMetadataConverter = new ParquetSchemaConverter();
+ for (org.apache.parquet.schema.Type.Repetition repetition :
+ org.apache.parquet.schema.Type.Repetition.values()) {
+ assertEquals(
+ repetition,
+ ParquetEnumConverter.fromParquetRepetition(ParquetEnumConverter.toParquetRepetition(repetition)));
+ }
+ for (FieldRepetitionType repetition : FieldRepetitionType.values()) {
+ assertEquals(
+ repetition,
+ ParquetEnumConverter.toParquetRepetition(ParquetEnumConverter.fromParquetRepetition(repetition)));
+ }
+ for (PrimitiveType.PrimitiveTypeName primitiveTypeName : PrimitiveType.PrimitiveTypeName.values()) {
+ assertEquals(
+ primitiveTypeName,
+ ParquetEnumConverter.getPrimitive(ParquetEnumConverter.getType(primitiveTypeName)));
+ }
+ for (Type type : Type.values()) {
+ assertEquals(type, ParquetEnumConverter.getType(ParquetEnumConverter.getPrimitive(type)));
+ }
+ for (OriginalType original : OriginalType.values()) {
+ assertEquals(
+ original,
+ parquetMetadataConverter
+ .getLogicalTypeAnnotation(
+ parquetMetadataConverter.convertToConvertedType(
+ LogicalTypeAnnotation.fromOriginalType(original, null)),
+ null)
+ .toOriginalType());
+ }
+ for (ConvertedType converted : ConvertedType.values()) {
+ assertEquals(
+ converted,
+ parquetMetadataConverter.convertToConvertedType(
+ parquetMetadataConverter.getLogicalTypeAnnotation(converted, null)));
+ }
+ }
+
+ @Test
+ public void testEdgeInterpolationAlgorithmConversion() {
+ // Test conversion from Parquet to Thrift enum
+ org.apache.parquet.column.schema.EdgeInterpolationAlgorithm parquetAlgo = EdgeInterpolationAlgorithm.SPHERICAL;
+ org.apache.parquet.format.EdgeInterpolationAlgorithm thriftAlgo =
+ ParquetEnumConverter.fromParquetEdgeInterpolationAlgorithm(parquetAlgo);
+
+ // convert the Thrift enum to the column schema enum
+ org.apache.parquet.column.schema.EdgeInterpolationAlgorithm expected =
+ org.apache.parquet.column.schema.EdgeInterpolationAlgorithm.SPHERICAL;
+ org.apache.parquet.column.schema.EdgeInterpolationAlgorithm actual =
+ ParquetEnumConverter.toParquetEdgeInterpolationAlgorithm(thriftAlgo);
+ assertEquals(expected, actual);
+
+ // Test with null
+ assertNull(ParquetEnumConverter.fromParquetEdgeInterpolationAlgorithm(null));
+ assertNull(ParquetEnumConverter.toParquetEdgeInterpolationAlgorithm(null));
+ }
+
+ @Test
+ public void testConvertedTypeToOriginalType() {
+ for (ConvertedType convertedType : ConvertedType.values()) {
+ assertEquals(
+ convertedType.name(),
+ ParquetEnumConverter.toParquetOriginalType(convertedType).name());
+ }
+ }
+}
diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/converters/TestParquetSchemaConverter.java b/parquet-column/src/test/java/org/apache/parquet/schema/converters/TestParquetSchemaConverter.java
new file mode 100644
index 0000000000..9b2f6edb27
--- /dev/null
+++ b/parquet-column/src/test/java/org/apache/parquet/schema/converters/TestParquetSchemaConverter.java
@@ -0,0 +1,475 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.schema.converters;
+
+import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MICROS;
+import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MILLIS;
+import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.NANOS;
+import static org.apache.parquet.schema.LogicalTypeAnnotation.bsonType;
+import static org.apache.parquet.schema.LogicalTypeAnnotation.dateType;
+import static org.apache.parquet.schema.LogicalTypeAnnotation.decimalType;
+import static org.apache.parquet.schema.LogicalTypeAnnotation.enumType;
+import static org.apache.parquet.schema.LogicalTypeAnnotation.intType;
+import static org.apache.parquet.schema.LogicalTypeAnnotation.jsonType;
+import static org.apache.parquet.schema.LogicalTypeAnnotation.listType;
+import static org.apache.parquet.schema.LogicalTypeAnnotation.mapType;
+import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType;
+import static org.apache.parquet.schema.LogicalTypeAnnotation.timeType;
+import static org.apache.parquet.schema.LogicalTypeAnnotation.timestampType;
+import static org.apache.parquet.schema.LogicalTypeAnnotation.uuidType;
+import static org.apache.parquet.schema.LogicalTypeAnnotation.variantType;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.util.List;
+import org.apache.parquet.column.schema.EdgeInterpolationAlgorithm;
+import org.apache.parquet.example.Paper;
+import org.apache.parquet.format.ConvertedType;
+import org.apache.parquet.format.DecimalType;
+import org.apache.parquet.format.FieldRepetitionType;
+import org.apache.parquet.format.GeographyType;
+import org.apache.parquet.format.GeometryType;
+import org.apache.parquet.format.LogicalType;
+import org.apache.parquet.format.MapType;
+import org.apache.parquet.format.SchemaElement;
+import org.apache.parquet.format.StringType;
+import org.apache.parquet.format.Type;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.OriginalType;
+import org.apache.parquet.schema.PrimitiveType;
+import org.apache.parquet.schema.Types;
+import org.junit.Assert;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+public class TestParquetSchemaConverter {
+ @Rule
+ public TemporaryFolder temporaryFolder = new TemporaryFolder();
+
+ @Test
+ public void testSchemaConverter() {
+ ParquetSchemaConverter parquetMetadataConverter = new ParquetSchemaConverter();
+ List parquetSchema = parquetMetadataConverter.toParquetSchema(Paper.schema);
+ MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null);
+ assertEquals(Paper.schema, schema);
+ }
+
+ @Test
+ public void testSchemaConverterDecimal() {
+ ParquetSchemaConverter parquetMetadataConverter = new ParquetSchemaConverter();
+ List schemaElements = parquetMetadataConverter.toParquetSchema(Types.buildMessage()
+ .required(PrimitiveType.PrimitiveTypeName.BINARY)
+ .as(OriginalType.DECIMAL)
+ .precision(9)
+ .scale(2)
+ .named("aBinaryDecimal")
+ .optional(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY)
+ .length(4)
+ .as(OriginalType.DECIMAL)
+ .precision(9)
+ .scale(2)
+ .named("aFixedDecimal")
+ .named("Message"));
+ List expected = List.of(
+ new SchemaElement("Message").setNum_children(2),
+ new SchemaElement("aBinaryDecimal")
+ .setRepetition_type(FieldRepetitionType.REQUIRED)
+ .setType(org.apache.parquet.format.Type.BYTE_ARRAY)
+ .setConverted_type(ConvertedType.DECIMAL)
+ .setLogicalType(LogicalType.DECIMAL(new DecimalType(2, 9)))
+ .setPrecision(9)
+ .setScale(2),
+ new SchemaElement("aFixedDecimal")
+ .setRepetition_type(FieldRepetitionType.OPTIONAL)
+ .setType(Type.FIXED_LEN_BYTE_ARRAY)
+ .setType_length(4)
+ .setConverted_type(ConvertedType.DECIMAL)
+ .setLogicalType(LogicalType.DECIMAL(new DecimalType(2, 9)))
+ .setPrecision(9)
+ .setScale(2));
+ Assert.assertEquals(expected, schemaElements);
+ }
+
+ @Test
+ public void testLogicalTypesBackwardCompatibleWithConvertedTypes() {
+ ParquetSchemaConverter parquetMetadataConverter = new ParquetSchemaConverter();
+ MessageType expected = Types.buildMessage()
+ .required(PrimitiveType.PrimitiveTypeName.BINARY)
+ .as(OriginalType.DECIMAL)
+ .precision(9)
+ .scale(2)
+ .named("aBinaryDecimal")
+ .named("Message");
+ List parquetSchema = parquetMetadataConverter.toParquetSchema(expected);
+ // Set logical type field to null to test backward compatibility with files written by older API,
+ // where converted_types are written to the metadata, but logicalType is missing
+ parquetSchema.get(1).setLogicalType(null);
+ MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null);
+ assertEquals(expected, schema);
+ }
+
+ @Test
+ public void testIncompatibleLogicalAndConvertedTypes() {
+ ParquetSchemaConverter parquetMetadataConverter = new ParquetSchemaConverter();
+ MessageType schema = Types.buildMessage()
+ .required(PrimitiveType.PrimitiveTypeName.BINARY)
+ .as(OriginalType.DECIMAL)
+ .precision(9)
+ .scale(2)
+ .named("aBinary")
+ .named("Message");
+ MessageType expected = Types.buildMessage()
+ .required(PrimitiveType.PrimitiveTypeName.BINARY)
+ .as(LogicalTypeAnnotation.jsonType())
+ .named("aBinary")
+ .named("Message");
+
+ List parquetSchema = parquetMetadataConverter.toParquetSchema(schema);
+ // Set converted type field to a different type to verify that in case of mismatch, it overrides logical type
+ parquetSchema.get(1).setConverted_type(ConvertedType.JSON);
+ MessageType actual = parquetMetadataConverter.fromParquetSchema(parquetSchema, null);
+ assertEquals(expected, actual);
+ }
+
+ @Test
+ public void testTimeLogicalTypes() {
+ ParquetSchemaConverter parquetMetadataConverter = new ParquetSchemaConverter();
+ MessageType expected = Types.buildMessage()
+ .required(PrimitiveType.PrimitiveTypeName.INT64)
+ .as(timestampType(false, MILLIS))
+ .named("aTimestampNonUtcMillis")
+ .required(PrimitiveType.PrimitiveTypeName.INT64)
+ .as(timestampType(true, MILLIS))
+ .named("aTimestampUtcMillis")
+ .required(PrimitiveType.PrimitiveTypeName.INT64)
+ .as(timestampType(false, MICROS))
+ .named("aTimestampNonUtcMicros")
+ .required(PrimitiveType.PrimitiveTypeName.INT64)
+ .as(timestampType(true, MICROS))
+ .named("aTimestampUtcMicros")
+ .required(PrimitiveType.PrimitiveTypeName.INT64)
+ .as(timestampType(false, NANOS))
+ .named("aTimestampNonUtcNanos")
+ .required(PrimitiveType.PrimitiveTypeName.INT64)
+ .as(timestampType(true, NANOS))
+ .named("aTimestampUtcNanos")
+ .required(PrimitiveType.PrimitiveTypeName.INT32)
+ .as(timeType(false, MILLIS))
+ .named("aTimeNonUtcMillis")
+ .required(PrimitiveType.PrimitiveTypeName.INT32)
+ .as(timeType(true, MILLIS))
+ .named("aTimeUtcMillis")
+ .required(PrimitiveType.PrimitiveTypeName.INT64)
+ .as(timeType(false, MICROS))
+ .named("aTimeNonUtcMicros")
+ .required(PrimitiveType.PrimitiveTypeName.INT64)
+ .as(timeType(true, MICROS))
+ .named("aTimeUtcMicros")
+ .required(PrimitiveType.PrimitiveTypeName.INT64)
+ .as(timeType(false, NANOS))
+ .named("aTimeNonUtcNanos")
+ .required(PrimitiveType.PrimitiveTypeName.INT64)
+ .as(timeType(true, NANOS))
+ .named("aTimeUtcNanos")
+ .named("Message");
+ List parquetSchema = parquetMetadataConverter.toParquetSchema(expected);
+ MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null);
+ assertEquals(expected, schema);
+ }
+
+ @Test
+ public void testLogicalToConvertedTypeConversion() {
+ ParquetSchemaConverter parquetMetadataConverter = new ParquetSchemaConverter();
+
+ assertEquals(ConvertedType.UTF8, parquetMetadataConverter.convertToConvertedType(stringType()));
+ assertEquals(ConvertedType.ENUM, parquetMetadataConverter.convertToConvertedType(enumType()));
+
+ assertEquals(ConvertedType.INT_8, parquetMetadataConverter.convertToConvertedType(intType(8, true)));
+ assertEquals(ConvertedType.INT_16, parquetMetadataConverter.convertToConvertedType(intType(16, true)));
+ assertEquals(ConvertedType.INT_32, parquetMetadataConverter.convertToConvertedType(intType(32, true)));
+ assertEquals(ConvertedType.INT_64, parquetMetadataConverter.convertToConvertedType(intType(64, true)));
+ assertEquals(ConvertedType.UINT_8, parquetMetadataConverter.convertToConvertedType(intType(8, false)));
+ assertEquals(ConvertedType.UINT_16, parquetMetadataConverter.convertToConvertedType(intType(16, false)));
+ assertEquals(ConvertedType.UINT_32, parquetMetadataConverter.convertToConvertedType(intType(32, false)));
+ assertEquals(ConvertedType.UINT_64, parquetMetadataConverter.convertToConvertedType(intType(64, false)));
+ assertEquals(ConvertedType.DECIMAL, parquetMetadataConverter.convertToConvertedType(decimalType(8, 16)));
+
+ assertEquals(
+ ConvertedType.TIMESTAMP_MILLIS,
+ parquetMetadataConverter.convertToConvertedType(timestampType(true, MILLIS)));
+ assertEquals(
+ ConvertedType.TIMESTAMP_MICROS,
+ parquetMetadataConverter.convertToConvertedType(timestampType(true, MICROS)));
+ assertNull(parquetMetadataConverter.convertToConvertedType(timestampType(true, NANOS)));
+ assertEquals(
+ ConvertedType.TIMESTAMP_MILLIS,
+ parquetMetadataConverter.convertToConvertedType(timestampType(false, MILLIS)));
+ assertEquals(
+ ConvertedType.TIMESTAMP_MICROS,
+ parquetMetadataConverter.convertToConvertedType(timestampType(false, MICROS)));
+ assertNull(parquetMetadataConverter.convertToConvertedType(timestampType(false, NANOS)));
+
+ assertEquals(
+ ConvertedType.TIME_MILLIS, parquetMetadataConverter.convertToConvertedType(timeType(true, MILLIS)));
+ assertEquals(
+ ConvertedType.TIME_MICROS, parquetMetadataConverter.convertToConvertedType(timeType(true, MICROS)));
+ assertNull(parquetMetadataConverter.convertToConvertedType(timeType(true, NANOS)));
+ assertEquals(
+ ConvertedType.TIME_MILLIS, parquetMetadataConverter.convertToConvertedType(timeType(false, MILLIS)));
+ assertEquals(
+ ConvertedType.TIME_MICROS, parquetMetadataConverter.convertToConvertedType(timeType(false, MICROS)));
+ assertNull(parquetMetadataConverter.convertToConvertedType(timeType(false, NANOS)));
+
+ assertEquals(ConvertedType.DATE, parquetMetadataConverter.convertToConvertedType(dateType()));
+
+ assertEquals(
+ ConvertedType.INTERVAL,
+ parquetMetadataConverter.convertToConvertedType(
+ LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance()));
+ assertEquals(ConvertedType.JSON, parquetMetadataConverter.convertToConvertedType(jsonType()));
+ assertEquals(ConvertedType.BSON, parquetMetadataConverter.convertToConvertedType(bsonType()));
+
+ assertNull(parquetMetadataConverter.convertToConvertedType(uuidType()));
+
+ assertEquals(ConvertedType.LIST, parquetMetadataConverter.convertToConvertedType(listType()));
+ assertEquals(ConvertedType.MAP, parquetMetadataConverter.convertToConvertedType(mapType()));
+ assertEquals(
+ ConvertedType.MAP_KEY_VALUE,
+ parquetMetadataConverter.convertToConvertedType(
+ LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance()));
+ }
+
+ @Test
+ public void testMapLogicalType() {
+ ParquetSchemaConverter parquetMetadataConverter = new ParquetSchemaConverter();
+ MessageType expected = Types.buildMessage()
+ .requiredGroup()
+ .as(mapType())
+ .repeatedGroup()
+ .as(LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance())
+ .required(PrimitiveType.PrimitiveTypeName.BINARY)
+ .as(stringType())
+ .named("key")
+ .required(PrimitiveType.PrimitiveTypeName.INT32)
+ .named("value")
+ .named("key_value")
+ .named("testMap")
+ .named("Message");
+
+ List parquetSchema = parquetMetadataConverter.toParquetSchema(expected);
+ assertEquals(5, parquetSchema.size());
+ assertEquals(new SchemaElement("Message").setNum_children(1), parquetSchema.get(0));
+ assertEquals(
+ new SchemaElement("testMap")
+ .setRepetition_type(FieldRepetitionType.REQUIRED)
+ .setNum_children(1)
+ .setConverted_type(ConvertedType.MAP)
+ .setLogicalType(LogicalType.MAP(new MapType())),
+ parquetSchema.get(1));
+ // PARQUET-1879 ensure that LogicalType is not written (null) but ConvertedType is MAP_KEY_VALUE for
+ // backwards-compatibility
+ assertEquals(
+ new SchemaElement("key_value")
+ .setRepetition_type(FieldRepetitionType.REPEATED)
+ .setNum_children(2)
+ .setConverted_type(ConvertedType.MAP_KEY_VALUE)
+ .setLogicalType(null),
+ parquetSchema.get(2));
+ assertEquals(
+ new SchemaElement("key")
+ .setType(Type.BYTE_ARRAY)
+ .setRepetition_type(FieldRepetitionType.REQUIRED)
+ .setConverted_type(ConvertedType.UTF8)
+ .setLogicalType(LogicalType.STRING(new StringType())),
+ parquetSchema.get(3));
+ assertEquals(
+ new SchemaElement("value")
+ .setType(Type.INT32)
+ .setRepetition_type(FieldRepetitionType.REQUIRED)
+ .setConverted_type(null)
+ .setLogicalType(null),
+ parquetSchema.get(4));
+
+ MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null);
+ assertEquals(expected, schema);
+ }
+
+ @Test
+ public void testVariantLogicalType() {
+ byte specVersion = 1;
+ MessageType expected = Types.buildMessage()
+ .requiredGroup()
+ .as(variantType(specVersion))
+ .required(PrimitiveType.PrimitiveTypeName.BINARY)
+ .named("metadata")
+ .required(PrimitiveType.PrimitiveTypeName.BINARY)
+ .named("value")
+ .named("v")
+ .named("example");
+
+ ParquetSchemaConverter parquetMetadataConverter = new ParquetSchemaConverter();
+ List parquetSchema = parquetMetadataConverter.toParquetSchema(expected);
+ MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null);
+ assertEquals(expected, schema);
+ LogicalTypeAnnotation logicalType = schema.getType("v").getLogicalTypeAnnotation();
+ assertEquals(LogicalTypeAnnotation.variantType(specVersion), logicalType);
+ assertEquals(specVersion, ((LogicalTypeAnnotation.VariantLogicalTypeAnnotation) logicalType).getSpecVersion());
+ }
+
+ @Test
+ public void testGeometryLogicalType() {
+ ParquetSchemaConverter parquetMetadataConverter = new ParquetSchemaConverter();
+
+ // Create schema with geometry type
+ MessageType schema = Types.buildMessage()
+ .required(PrimitiveType.PrimitiveTypeName.BINARY)
+ .as(LogicalTypeAnnotation.geometryType("EPSG:4326"))
+ .named("geomField")
+ .named("Message");
+
+ // Convert to parquet schema and back
+ List parquetSchema = parquetMetadataConverter.toParquetSchema(schema);
+ MessageType actual = parquetMetadataConverter.fromParquetSchema(parquetSchema, null);
+
+ // Verify the logical type is preserved
+ assertEquals(schema, actual);
+
+ PrimitiveType primitiveType = actual.getType("geomField").asPrimitiveType();
+ LogicalTypeAnnotation logicalType = primitiveType.getLogicalTypeAnnotation();
+ assertTrue(logicalType instanceof LogicalTypeAnnotation.GeometryLogicalTypeAnnotation);
+ assertEquals("EPSG:4326", ((LogicalTypeAnnotation.GeometryLogicalTypeAnnotation) logicalType).getCrs());
+ }
+
+ @Test
+ public void testGeographyLogicalType() {
+ ParquetSchemaConverter parquetMetadataConverter = new ParquetSchemaConverter();
+
+ // Create schema with geography type
+ MessageType schema = Types.buildMessage()
+ .required(PrimitiveType.PrimitiveTypeName.BINARY)
+ .as(LogicalTypeAnnotation.geographyType("EPSG:4326", EdgeInterpolationAlgorithm.SPHERICAL))
+ .named("geogField")
+ .named("Message");
+
+ // Convert to parquet schema and back
+ List parquetSchema = parquetMetadataConverter.toParquetSchema(schema);
+ MessageType actual = parquetMetadataConverter.fromParquetSchema(parquetSchema, null);
+
+ // Verify the logical type is preserved
+ assertEquals(schema, actual);
+
+ PrimitiveType primitiveType = actual.getType("geogField").asPrimitiveType();
+ LogicalTypeAnnotation logicalType = primitiveType.getLogicalTypeAnnotation();
+ assertTrue(logicalType instanceof LogicalTypeAnnotation.GeographyLogicalTypeAnnotation);
+
+ LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyType =
+ (LogicalTypeAnnotation.GeographyLogicalTypeAnnotation) logicalType;
+ assertEquals("EPSG:4326", geographyType.getCrs());
+ assertEquals(EdgeInterpolationAlgorithm.SPHERICAL, geographyType.getAlgorithm());
+ }
+
+ @Test
+ public void testGeometryLogicalTypeWithMissingCrs() {
+ // Create a Geometry logical type without specifying CRS
+ GeometryType geometryType = new GeometryType();
+ LogicalType logicalType = new LogicalType();
+ logicalType.setGEOMETRY(geometryType);
+
+ // Convert to LogicalTypeAnnotation
+ ParquetSchemaConverter converter = new ParquetSchemaConverter();
+ LogicalTypeAnnotation annotation = converter.getLogicalTypeAnnotation(logicalType);
+
+ // Verify the annotation is created correctly
+ assertNotNull("Geometry annotation should not be null", annotation);
+ assertTrue(
+ "Should be a GeometryLogicalTypeAnnotation",
+ annotation instanceof LogicalTypeAnnotation.GeometryLogicalTypeAnnotation);
+
+ LogicalTypeAnnotation.GeometryLogicalTypeAnnotation geometryAnnotation =
+ (LogicalTypeAnnotation.GeometryLogicalTypeAnnotation) annotation;
+
+ // When CRS is not specified, the converter applies no default: the annotation's CRS is null
+ assertNull("CRS should be null or empty when not specified", geometryAnnotation.getCrs());
+ }
+
+ @Test
+ public void testGeographyLogicalTypeWithMissingParameters() {
+ ParquetSchemaConverter converter = new ParquetSchemaConverter();
+
+ // Create a Geography logical type without CRS and algorithm
+ GeographyType geographyType = new GeographyType();
+ LogicalType logicalType = new LogicalType();
+ logicalType.setGEOGRAPHY(geographyType);
+
+ // Convert to LogicalTypeAnnotation
+ LogicalTypeAnnotation annotation = converter.getLogicalTypeAnnotation(logicalType);
+
+ // Verify the annotation is created correctly
+ assertNotNull("Geography annotation should not be null", annotation);
+ assertTrue(
+ "Should be a GeographyLogicalTypeAnnotation",
+ annotation instanceof LogicalTypeAnnotation.GeographyLogicalTypeAnnotation);
+
+ // Check that optional parameters are handled correctly
+ LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyAnnotation =
+ (LogicalTypeAnnotation.GeographyLogicalTypeAnnotation) annotation;
+ assertNull("CRS should be null when not specified", geographyAnnotation.getCrs());
+ // This converter applies no default algorithm: it remains null when not specified
+ assertNull("Algorithm should be null when not specified", geographyAnnotation.getAlgorithm());
+
+ // Now test the round-trip conversion
+ LogicalType roundTripType = converter.convertToLogicalType(annotation);
+ assertEquals("setField should be GEOGRAPHY", LogicalType._Fields.GEOGRAPHY, roundTripType.getSetField());
+ assertNull(
+ "Round trip CRS should still be null",
+ roundTripType.getGEOGRAPHY().getCrs());
+ assertNull(
+ "Round trip Algorithm should be null",
+ roundTripType.getGEOGRAPHY().getAlgorithm());
+ }
+
+ @Test
+ public void testGeographyLogicalTypeWithAlgorithmButNoCrs() {
+ // Create a Geography logical type with algorithm but no CRS
+ GeographyType geographyType = new GeographyType();
+ geographyType.setAlgorithm(org.apache.parquet.format.EdgeInterpolationAlgorithm.SPHERICAL);
+ LogicalType logicalType = new LogicalType();
+ logicalType.setGEOGRAPHY(geographyType);
+
+ // Convert to LogicalTypeAnnotation
+ ParquetSchemaConverter converter = new ParquetSchemaConverter();
+ LogicalTypeAnnotation annotation = converter.getLogicalTypeAnnotation(logicalType);
+
+ // Verify the annotation is created correctly
+ Assert.assertNotNull("Geography annotation should not be null", annotation);
+ LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyAnnotation =
+ (LogicalTypeAnnotation.GeographyLogicalTypeAnnotation) annotation;
+
+ // CRS should be null/empty but algorithm should be set
+ assertNull("CRS should be null or empty", geographyAnnotation.getCrs());
+ assertEquals(
+ "Algorithm should be SPHERICAL",
+ EdgeInterpolationAlgorithm.SPHERICAL,
+ geographyAnnotation.getAlgorithm());
+ }
+}
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
index 002028cdf5..b3a62abcfa 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
@@ -18,7 +18,6 @@
*/
package org.apache.parquet.format.converter;
-import static java.util.Optional.empty;
import static java.util.Optional.of;
import static org.apache.parquet.format.Util.readColumnMetaData;
import static org.apache.parquet.format.Util.readFileMetaData;
@@ -36,7 +35,6 @@
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
-import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
@@ -71,45 +69,27 @@
import org.apache.parquet.format.ColumnCryptoMetaData;
import org.apache.parquet.format.ColumnIndex;
import org.apache.parquet.format.ColumnMetaData;
-import org.apache.parquet.format.ColumnOrder;
import org.apache.parquet.format.CompressionCodec;
-import org.apache.parquet.format.ConvertedType;
import org.apache.parquet.format.DataPageHeader;
import org.apache.parquet.format.DataPageHeaderV2;
-import org.apache.parquet.format.DecimalType;
import org.apache.parquet.format.DictionaryPageHeader;
import org.apache.parquet.format.EdgeInterpolationAlgorithm;
import org.apache.parquet.format.Encoding;
import org.apache.parquet.format.EncryptionWithColumnKey;
-import org.apache.parquet.format.FieldRepetitionType;
import org.apache.parquet.format.FileMetaData;
-import org.apache.parquet.format.GeographyType;
-import org.apache.parquet.format.GeometryType;
import org.apache.parquet.format.GeospatialStatistics;
-import org.apache.parquet.format.IntType;
import org.apache.parquet.format.KeyValue;
-import org.apache.parquet.format.LogicalType;
-import org.apache.parquet.format.LogicalTypes;
-import org.apache.parquet.format.MicroSeconds;
-import org.apache.parquet.format.MilliSeconds;
-import org.apache.parquet.format.NanoSeconds;
import org.apache.parquet.format.OffsetIndex;
import org.apache.parquet.format.PageEncodingStats;
import org.apache.parquet.format.PageHeader;
import org.apache.parquet.format.PageLocation;
import org.apache.parquet.format.PageType;
import org.apache.parquet.format.RowGroup;
-import org.apache.parquet.format.SchemaElement;
import org.apache.parquet.format.SizeStatistics;
import org.apache.parquet.format.SplitBlockAlgorithm;
import org.apache.parquet.format.Statistics;
-import org.apache.parquet.format.TimeType;
-import org.apache.parquet.format.TimeUnit;
-import org.apache.parquet.format.TimestampType;
import org.apache.parquet.format.Type;
-import org.apache.parquet.format.TypeDefinedOrder;
import org.apache.parquet.format.Uncompressed;
-import org.apache.parquet.format.VariantType;
import org.apache.parquet.format.XxHash;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
@@ -125,16 +105,14 @@
import org.apache.parquet.io.ParquetDecodingException;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.ColumnOrder.ColumnOrderName;
-import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.LogicalTypeAnnotation.UUIDLogicalTypeAnnotation;
import org.apache.parquet.schema.MessageType;
-import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type.Repetition;
-import org.apache.parquet.schema.TypeVisitor;
-import org.apache.parquet.schema.Types;
+import org.apache.parquet.schema.converters.ParquetEnumConverter;
+import org.apache.parquet.schema.converters.ParquetSchemaConverter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -142,7 +120,6 @@
// TODO: Lets split it up: https://github.com/apache/parquet-java/issues/1835
public class ParquetMetadataConverter {
- private static final TypeDefinedOrder TYPE_DEFINED_ORDER = new TypeDefinedOrder();
public static final MetadataFilter NO_FILTER = new NoFilter();
public static final MetadataFilter SKIP_ROW_GROUPS = new SkipMetadataFilter();
public static final long MAX_STATS_SIZE = 4096; // limit stats to 4k
@@ -157,10 +134,7 @@ public class ParquetMetadataConverter {
private static final int DEFAULT_MAX_MESSAGE_SIZE = 104857600; // 100 MB
private static final Logger LOG = LoggerFactory.getLogger(ParquetMetadataConverter.class);
- private static final LogicalTypeConverterVisitor LOGICAL_TYPE_ANNOTATION_VISITOR =
- new LogicalTypeConverterVisitor();
- private static final ConvertedTypeConverterVisitor CONVERTED_TYPE_CONVERTER_VISITOR =
- new ConvertedTypeConverterVisitor();
+ private final ParquetSchemaConverter parquetSchemaConverter = new ParquetSchemaConverter();
private final int statisticsTruncateLength;
private final boolean useSignedStringMinMax;
private final ParquetReadOptions options;
@@ -258,7 +232,8 @@ public FileMetaData toParquetMetadata(
}
FileMetaData fileMetaData = new FileMetaData(
currentVersion,
- toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
+ parquetSchemaConverter.toParquetSchema(
+ parquetMetadata.getFileMetaData().getSchema()),
numRows,
rowGroups);
@@ -270,314 +245,12 @@ public FileMetaData toParquetMetadata(
fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());
- fileMetaData.setColumn_orders(
- getColumnOrders(parquetMetadata.getFileMetaData().getSchema()));
+ fileMetaData.setColumn_orders(parquetSchemaConverter.getColumnOrders(
+ parquetMetadata.getFileMetaData().getSchema()));
return fileMetaData;
}
- private List getColumnOrders(MessageType schema) {
- List columnOrders = new ArrayList<>();
- // Currently, only TypeDefinedOrder is supported, so we create a column order for each columns with
- // TypeDefinedOrder even if some types (e.g. INT96) have undefined column orders.
- for (int i = 0, n = schema.getPaths().size(); i < n; ++i) {
- ColumnOrder columnOrder = new ColumnOrder();
- columnOrder.setTYPE_ORDER(TYPE_DEFINED_ORDER);
- columnOrders.add(columnOrder);
- }
- return columnOrders;
- }
-
- // Visible for testing
- List toParquetSchema(MessageType schema) {
- List result = new ArrayList();
- addToList(result, schema);
- return result;
- }
-
- private void addToList(final List result, org.apache.parquet.schema.Type field) {
- field.accept(new TypeVisitor() {
- @Override
- public void visit(PrimitiveType primitiveType) {
- SchemaElement element = new SchemaElement(primitiveType.getName());
- element.setRepetition_type(toParquetRepetition(primitiveType.getRepetition()));
- element.setType(getType(primitiveType.getPrimitiveTypeName()));
- if (primitiveType.getLogicalTypeAnnotation() != null) {
- element.setConverted_type(convertToConvertedType(primitiveType.getLogicalTypeAnnotation()));
- element.setLogicalType(convertToLogicalType(primitiveType.getLogicalTypeAnnotation()));
- }
- if (primitiveType.getDecimalMetadata() != null) {
- element.setPrecision(primitiveType.getDecimalMetadata().getPrecision());
- element.setScale(primitiveType.getDecimalMetadata().getScale());
- }
- if (primitiveType.getTypeLength() > 0) {
- element.setType_length(primitiveType.getTypeLength());
- }
- if (primitiveType.getId() != null) {
- element.setField_id(primitiveType.getId().intValue());
- }
- result.add(element);
- }
-
- @Override
- public void visit(MessageType messageType) {
- SchemaElement element = new SchemaElement(messageType.getName());
- if (messageType.getId() != null) {
- element.setField_id(messageType.getId().intValue());
- }
- visitChildren(result, messageType.asGroupType(), element);
- }
-
- @Override
- public void visit(GroupType groupType) {
- SchemaElement element = new SchemaElement(groupType.getName());
- element.setRepetition_type(toParquetRepetition(groupType.getRepetition()));
- if (groupType.getLogicalTypeAnnotation() != null) {
- element.setConverted_type(convertToConvertedType(groupType.getLogicalTypeAnnotation()));
- element.setLogicalType(convertToLogicalType(groupType.getLogicalTypeAnnotation()));
- }
- if (groupType.getId() != null) {
- element.setField_id(groupType.getId().intValue());
- }
- visitChildren(result, groupType, element);
- }
-
- private void visitChildren(final List result, GroupType groupType, SchemaElement element) {
- element.setNum_children(groupType.getFieldCount());
- result.add(element);
- for (org.apache.parquet.schema.Type field : groupType.getFields()) {
- addToList(result, field);
- }
- }
- });
- }
-
- LogicalType convertToLogicalType(LogicalTypeAnnotation logicalTypeAnnotation) {
- return logicalTypeAnnotation.accept(LOGICAL_TYPE_ANNOTATION_VISITOR).orElse(null);
- }
-
- ConvertedType convertToConvertedType(LogicalTypeAnnotation logicalTypeAnnotation) {
- return logicalTypeAnnotation.accept(CONVERTED_TYPE_CONVERTER_VISITOR).orElse(null);
- }
-
- static org.apache.parquet.format.TimeUnit convertUnit(LogicalTypeAnnotation.TimeUnit unit) {
- switch (unit) {
- case MICROS:
- return org.apache.parquet.format.TimeUnit.MICROS(new MicroSeconds());
- case MILLIS:
- return org.apache.parquet.format.TimeUnit.MILLIS(new MilliSeconds());
- case NANOS:
- return TimeUnit.NANOS(new NanoSeconds());
- default:
- throw new RuntimeException("Unknown time unit " + unit);
- }
- }
-
- private static class ConvertedTypeConverterVisitor
- implements LogicalTypeAnnotation.LogicalTypeAnnotationVisitor {
- @Override
- public Optional visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) {
- return of(ConvertedType.UTF8);
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) {
- return of(ConvertedType.MAP);
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) {
- return of(ConvertedType.LIST);
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) {
- return of(ConvertedType.ENUM);
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) {
- return of(ConvertedType.DECIMAL);
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) {
- return of(ConvertedType.DATE);
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) {
- switch (timeLogicalType.getUnit()) {
- case MILLIS:
- return of(ConvertedType.TIME_MILLIS);
- case MICROS:
- return of(ConvertedType.TIME_MICROS);
- case NANOS:
- return empty();
- default:
- throw new RuntimeException("Unknown converted type for " + timeLogicalType.toOriginalType());
- }
- }
-
- @Override
- public Optional visit(
- LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) {
- switch (timestampLogicalType.getUnit()) {
- case MICROS:
- return of(ConvertedType.TIMESTAMP_MICROS);
- case MILLIS:
- return of(ConvertedType.TIMESTAMP_MILLIS);
- case NANOS:
- return empty();
- default:
- throw new RuntimeException("Unknown converted type for " + timestampLogicalType.toOriginalType());
- }
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) {
- boolean signed = intLogicalType.isSigned();
- switch (intLogicalType.getBitWidth()) {
- case 8:
- return of(signed ? ConvertedType.INT_8 : ConvertedType.UINT_8);
- case 16:
- return of(signed ? ConvertedType.INT_16 : ConvertedType.UINT_16);
- case 32:
- return of(signed ? ConvertedType.INT_32 : ConvertedType.UINT_32);
- case 64:
- return of(signed ? ConvertedType.INT_64 : ConvertedType.UINT_64);
- default:
- throw new RuntimeException("Unknown original type " + intLogicalType.toOriginalType());
- }
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) {
- return of(ConvertedType.JSON);
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) {
- return of(ConvertedType.BSON);
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) {
- return of(ConvertedType.INTERVAL);
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.MapKeyValueTypeAnnotation mapKeyValueLogicalType) {
- return of(ConvertedType.MAP_KEY_VALUE);
- }
- }
-
- private static class LogicalTypeConverterVisitor
- implements LogicalTypeAnnotation.LogicalTypeAnnotationVisitor {
- @Override
- public Optional visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) {
- return of(LogicalTypes.UTF8);
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) {
- return of(LogicalTypes.MAP);
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) {
- return of(LogicalTypes.LIST);
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) {
- return of(LogicalTypes.ENUM);
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) {
- return of(LogicalTypes.DECIMAL(decimalLogicalType.getScale(), decimalLogicalType.getPrecision()));
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) {
- return of(LogicalTypes.DATE);
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) {
- return of(LogicalType.TIME(
- new TimeType(timeLogicalType.isAdjustedToUTC(), convertUnit(timeLogicalType.getUnit()))));
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) {
- return of(LogicalType.TIMESTAMP(new TimestampType(
- timestampLogicalType.isAdjustedToUTC(), convertUnit(timestampLogicalType.getUnit()))));
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) {
- return of(LogicalType.INTEGER(new IntType((byte) intLogicalType.getBitWidth(), intLogicalType.isSigned())));
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) {
- return of(LogicalTypes.JSON);
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) {
- return of(LogicalTypes.BSON);
- }
-
- @Override
- public Optional visit(UUIDLogicalTypeAnnotation uuidLogicalType) {
- return of(LogicalTypes.UUID);
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.Float16LogicalTypeAnnotation float16LogicalType) {
- return of(LogicalTypes.FLOAT16);
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.UnknownLogicalTypeAnnotation unknownLogicalType) {
- return of(LogicalTypes.UNKNOWN);
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) {
- return of(LogicalTypes.UNKNOWN);
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.VariantLogicalTypeAnnotation variantLogicalType) {
- return of(LogicalTypes.VARIANT(variantLogicalType.getSpecVersion()));
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.GeometryLogicalTypeAnnotation geometryLogicalType) {
- GeometryType geometryType = new GeometryType();
- if (geometryLogicalType.getCrs() != null
- && !geometryLogicalType.getCrs().isEmpty()) {
- geometryType.setCrs(geometryLogicalType.getCrs());
- }
- return of(LogicalType.GEOMETRY(geometryType));
- }
-
- @Override
- public Optional visit(LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyLogicalType) {
- GeographyType geographyType = new GeographyType();
- if (geographyLogicalType.getCrs() != null
- && !geographyLogicalType.getCrs().isEmpty()) {
- geographyType.setCrs(geographyLogicalType.getCrs());
- }
- geographyType.setAlgorithm(fromParquetEdgeInterpolationAlgorithm(geographyLogicalType.getAlgorithm()));
- return of(LogicalType.GEOGRAPHY(geographyType));
- }
- }
-
private void addRowGroup(
ParquetMetadata parquetMetadata,
List rowGroups,
@@ -606,7 +279,7 @@ private void addRowGroup(
encryptMetaData = fileEncryptor.encryptColumnMetaData(columnSetup);
}
ColumnMetaData metaData = new ColumnMetaData(
- getType(columnMetaData.getType()),
+ ParquetEnumConverter.getType(columnMetaData.getType()),
toFormatEncodings(columnMetaData.getEncodings()),
columnMetaData.getPath().toList(),
toFormatCodec(columnMetaData.getCodec()),
@@ -1214,168 +887,12 @@ public Optional visit(
return defaultSortOrder(primitive.getPrimitiveTypeName());
}
+ /**
+ * @deprecated Please use {@link ParquetEnumConverter#getPrimitive(Type)} instead.
+ */
+ @Deprecated
public PrimitiveTypeName getPrimitive(Type type) {
- switch (type) {
- case BYTE_ARRAY: // TODO: rename BINARY and remove this switch
- return PrimitiveTypeName.BINARY;
- case INT64:
- return PrimitiveTypeName.INT64;
- case INT32:
- return PrimitiveTypeName.INT32;
- case BOOLEAN:
- return PrimitiveTypeName.BOOLEAN;
- case FLOAT:
- return PrimitiveTypeName.FLOAT;
- case DOUBLE:
- return PrimitiveTypeName.DOUBLE;
- case INT96:
- return PrimitiveTypeName.INT96;
- case FIXED_LEN_BYTE_ARRAY:
- return PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
- default:
- throw new RuntimeException("Unknown type " + type);
- }
- }
-
- // Visible for testing
- Type getType(PrimitiveTypeName type) {
- switch (type) {
- case INT64:
- return Type.INT64;
- case INT32:
- return Type.INT32;
- case BOOLEAN:
- return Type.BOOLEAN;
- case BINARY:
- return Type.BYTE_ARRAY;
- case FLOAT:
- return Type.FLOAT;
- case DOUBLE:
- return Type.DOUBLE;
- case INT96:
- return Type.INT96;
- case FIXED_LEN_BYTE_ARRAY:
- return Type.FIXED_LEN_BYTE_ARRAY;
- default:
- throw new RuntimeException("Unknown primitive type " + type);
- }
- }
-
- // Visible for testing
- LogicalTypeAnnotation getLogicalTypeAnnotation(ConvertedType type, SchemaElement schemaElement) {
- switch (type) {
- case UTF8:
- return LogicalTypeAnnotation.stringType();
- case MAP:
- return LogicalTypeAnnotation.mapType();
- case MAP_KEY_VALUE:
- return LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance();
- case LIST:
- return LogicalTypeAnnotation.listType();
- case ENUM:
- return LogicalTypeAnnotation.enumType();
- case DECIMAL:
- int scale = (schemaElement == null ? 0 : schemaElement.scale);
- int precision = (schemaElement == null ? 0 : schemaElement.precision);
- return LogicalTypeAnnotation.decimalType(scale, precision);
- case DATE:
- return LogicalTypeAnnotation.dateType();
- case TIME_MILLIS:
- return LogicalTypeAnnotation.timeType(true, LogicalTypeAnnotation.TimeUnit.MILLIS);
- case TIME_MICROS:
- return LogicalTypeAnnotation.timeType(true, LogicalTypeAnnotation.TimeUnit.MICROS);
- case TIMESTAMP_MILLIS:
- return LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MILLIS);
- case TIMESTAMP_MICROS:
- return LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MICROS);
- case INTERVAL:
- return LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance();
- case INT_8:
- return LogicalTypeAnnotation.intType(8, true);
- case INT_16:
- return LogicalTypeAnnotation.intType(16, true);
- case INT_32:
- return LogicalTypeAnnotation.intType(32, true);
- case INT_64:
- return LogicalTypeAnnotation.intType(64, true);
- case UINT_8:
- return LogicalTypeAnnotation.intType(8, false);
- case UINT_16:
- return LogicalTypeAnnotation.intType(16, false);
- case UINT_32:
- return LogicalTypeAnnotation.intType(32, false);
- case UINT_64:
- return LogicalTypeAnnotation.intType(64, false);
- case JSON:
- return LogicalTypeAnnotation.jsonType();
- case BSON:
- return LogicalTypeAnnotation.bsonType();
- default:
- throw new RuntimeException(
- "Can't convert converted type to logical type, unknown converted type " + type);
- }
- }
-
- LogicalTypeAnnotation getLogicalTypeAnnotation(LogicalType type) {
- switch (type.getSetField()) {
- case MAP:
- return LogicalTypeAnnotation.mapType();
- case BSON:
- return LogicalTypeAnnotation.bsonType();
- case DATE:
- return LogicalTypeAnnotation.dateType();
- case ENUM:
- return LogicalTypeAnnotation.enumType();
- case JSON:
- return LogicalTypeAnnotation.jsonType();
- case LIST:
- return LogicalTypeAnnotation.listType();
- case TIME:
- TimeType time = type.getTIME();
- return LogicalTypeAnnotation.timeType(time.isAdjustedToUTC, convertTimeUnit(time.unit));
- case STRING:
- return LogicalTypeAnnotation.stringType();
- case DECIMAL:
- DecimalType decimal = type.getDECIMAL();
- return LogicalTypeAnnotation.decimalType(decimal.scale, decimal.precision);
- case INTEGER:
- IntType integer = type.getINTEGER();
- return LogicalTypeAnnotation.intType(integer.bitWidth, integer.isSigned);
- case UNKNOWN:
- return LogicalTypeAnnotation.unknownType();
- case TIMESTAMP:
- TimestampType timestamp = type.getTIMESTAMP();
- return LogicalTypeAnnotation.timestampType(timestamp.isAdjustedToUTC, convertTimeUnit(timestamp.unit));
- case UUID:
- return LogicalTypeAnnotation.uuidType();
- case FLOAT16:
- return LogicalTypeAnnotation.float16Type();
- case GEOMETRY:
- GeometryType geometry = type.getGEOMETRY();
- return LogicalTypeAnnotation.geometryType(geometry.getCrs());
- case GEOGRAPHY:
- GeographyType geography = type.getGEOGRAPHY();
- return LogicalTypeAnnotation.geographyType(
- geography.getCrs(), toParquetEdgeInterpolationAlgorithm(geography.getAlgorithm()));
- case VARIANT:
- VariantType variant = type.getVARIANT();
- return LogicalTypeAnnotation.variantType(variant.getSpecification_version());
- default:
- throw new RuntimeException("Unknown logical type " + type);
- }
- }
-
- private LogicalTypeAnnotation.TimeUnit convertTimeUnit(TimeUnit unit) {
- switch (unit.getSetField()) {
- case MICROS:
- return LogicalTypeAnnotation.TimeUnit.MICROS;
- case MILLIS:
- return LogicalTypeAnnotation.TimeUnit.MILLIS;
- case NANOS:
- return LogicalTypeAnnotation.TimeUnit.NANOS;
- default:
- throw new RuntimeException("Unknown time unit " + unit);
- }
+ return ParquetEnumConverter.getPrimitive(type);
}
private static void addKeyValue(FileMetaData fileMetaData, String key, String value) {
@@ -1824,7 +1341,8 @@ public ParquetMetadata fromParquetMetadata(
boolean encryptedFooter,
Map rowGroupToRowIndexOffsetMap)
throws IOException {
- MessageType messageType = fromParquetSchema(parquetMetadata.getSchema(), parquetMetadata.getColumn_orders());
+ MessageType messageType = parquetSchemaConverter.fromParquetSchema(
+ parquetMetadata.getSchema(), parquetMetadata.getColumn_orders());
List blocks = new ArrayList();
List row_groups = parquetMetadata.getRow_groups();
@@ -1987,111 +1505,6 @@ private static ColumnPath getPath(ColumnMetaData metaData) {
}
// Visible for testing
- MessageType fromParquetSchema(List schema, List columnOrders) {
- Iterator iterator = schema.iterator();
- SchemaElement root = iterator.next();
- Types.MessageTypeBuilder builder = Types.buildMessage();
- if (root.isSetField_id()) {
- builder.id(root.field_id);
- }
- buildChildren(builder, iterator, root.getNum_children(), columnOrders, 0);
- return builder.named(root.name);
- }
-
- private void buildChildren(
- Types.GroupBuilder builder,
- Iterator schema,
- int childrenCount,
- List columnOrders,
- int columnCount) {
- for (int i = 0; i < childrenCount; i++) {
- SchemaElement schemaElement = schema.next();
-
- // Create Parquet Type.
- Types.Builder childBuilder;
- if (schemaElement.type != null) {
- Types.PrimitiveBuilder primitiveBuilder = builder.primitive(
- getPrimitive(schemaElement.type), fromParquetRepetition(schemaElement.repetition_type));
- if (schemaElement.isSetType_length()) {
- primitiveBuilder.length(schemaElement.type_length);
- }
- if (schemaElement.isSetPrecision()) {
- primitiveBuilder.precision(schemaElement.precision);
- }
- if (schemaElement.isSetScale()) {
- primitiveBuilder.scale(schemaElement.scale);
- }
- if (columnOrders != null) {
- org.apache.parquet.schema.ColumnOrder columnOrder =
- fromParquetColumnOrder(columnOrders.get(columnCount));
- // As per parquet format 2.4.0 no UNDEFINED order is supported. So, set undefined column order for
- // the types
- // where ordering is not supported.
- if (columnOrder.getColumnOrderName() == ColumnOrderName.TYPE_DEFINED_ORDER
- && (schemaElement.type == Type.INT96
- || schemaElement.converted_type == ConvertedType.INTERVAL)) {
- columnOrder = org.apache.parquet.schema.ColumnOrder.undefined();
- }
- primitiveBuilder.columnOrder(columnOrder);
- }
- childBuilder = primitiveBuilder;
- } else {
- childBuilder = builder.group(fromParquetRepetition(schemaElement.repetition_type));
- buildChildren(
- (Types.GroupBuilder) childBuilder,
- schema,
- schemaElement.num_children,
- columnOrders,
- columnCount);
- }
-
- if (schemaElement.isSetLogicalType()) {
- childBuilder.as(getLogicalTypeAnnotation(schemaElement.logicalType));
- }
- if (schemaElement.isSetConverted_type()) {
- OriginalType originalType = getLogicalTypeAnnotation(schemaElement.converted_type, schemaElement)
- .toOriginalType();
- OriginalType newOriginalType = (schemaElement.isSetLogicalType()
- && getLogicalTypeAnnotation(schemaElement.logicalType) != null)
- ? getLogicalTypeAnnotation(schemaElement.logicalType).toOriginalType()
- : null;
- if (!originalType.equals(newOriginalType)) {
- if (newOriginalType != null) {
- LOG.warn(
- "Converted type and logical type metadata mismatch (convertedType: {}, logical type: {}). Using value in converted type.",
- schemaElement.converted_type,
- schemaElement.logicalType);
- }
- childBuilder.as(originalType);
- }
- }
- if (schemaElement.isSetField_id()) {
- childBuilder.id(schemaElement.field_id);
- }
-
- childBuilder.named(schemaElement.name);
- ++columnCount;
- }
- }
-
- // Visible for testing
- FieldRepetitionType toParquetRepetition(Repetition repetition) {
- return FieldRepetitionType.valueOf(repetition.name());
- }
-
- // Visible for testing
- Repetition fromParquetRepetition(FieldRepetitionType repetition) {
- return Repetition.valueOf(repetition.name());
- }
-
- private static org.apache.parquet.schema.ColumnOrder fromParquetColumnOrder(ColumnOrder columnOrder) {
- if (columnOrder.isSetTYPE_ORDER()) {
- return org.apache.parquet.schema.ColumnOrder.typeDefined();
- }
- // The column order is not yet supported by this API
- return org.apache.parquet.schema.ColumnOrder.undefined();
- }
-
@Deprecated
public void writeDataPageHeader(
int uncompressedSize,
@@ -2674,25 +2087,21 @@ public static SizeStatistics toParquetSizeStatistics(org.apache.parquet.column.s
return formatStats;
}
- /** Convert Parquet Algorithm enum to Thrift Algorithm enum */
+ /**
+ * @deprecated Use {@link ParquetEnumConverter#fromParquetEdgeInterpolationAlgorithm(org.apache.parquet.column.schema.EdgeInterpolationAlgorithm)} instead.
+ */
+ @Deprecated
public static EdgeInterpolationAlgorithm fromParquetEdgeInterpolationAlgorithm(
org.apache.parquet.column.schema.EdgeInterpolationAlgorithm parquetAlgo) {
- if (parquetAlgo == null) {
- return null;
- }
- EdgeInterpolationAlgorithm thriftAlgo = EdgeInterpolationAlgorithm.findByValue(parquetAlgo.getValue());
- if (thriftAlgo == null) {
- throw new IllegalArgumentException("Unrecognized Parquet EdgeInterpolationAlgorithm: " + parquetAlgo);
- }
- return thriftAlgo;
+ return ParquetEnumConverter.fromParquetEdgeInterpolationAlgorithm(parquetAlgo);
}
- /** Convert Thrift Algorithm enum to Parquet Algorithm enum */
+ /**
+ * @deprecated Use {@link ParquetEnumConverter#toParquetEdgeInterpolationAlgorithm(EdgeInterpolationAlgorithm)} instead.
+ */
+ @Deprecated
public static org.apache.parquet.column.schema.EdgeInterpolationAlgorithm toParquetEdgeInterpolationAlgorithm(
EdgeInterpolationAlgorithm thriftAlgo) {
- if (thriftAlgo == null) {
- return null;
- }
- return org.apache.parquet.column.schema.EdgeInterpolationAlgorithm.findByValue(thriftAlgo.getValue());
+ return ParquetEnumConverter.toParquetEdgeInterpolationAlgorithm(thriftAlgo);
}
}
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
index 2529f06ada..10e41219ee 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java
@@ -26,22 +26,8 @@
import static org.apache.parquet.format.converter.ParquetMetadataConverter.filterFileMetaDataByMidpoint;
import static org.apache.parquet.format.converter.ParquetMetadataConverter.filterFileMetaDataByStart;
import static org.apache.parquet.format.converter.ParquetMetadataConverter.getOffset;
-import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MICROS;
-import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MILLIS;
-import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.NANOS;
-import static org.apache.parquet.schema.LogicalTypeAnnotation.bsonType;
-import static org.apache.parquet.schema.LogicalTypeAnnotation.dateType;
-import static org.apache.parquet.schema.LogicalTypeAnnotation.decimalType;
-import static org.apache.parquet.schema.LogicalTypeAnnotation.enumType;
-import static org.apache.parquet.schema.LogicalTypeAnnotation.intType;
-import static org.apache.parquet.schema.LogicalTypeAnnotation.jsonType;
-import static org.apache.parquet.schema.LogicalTypeAnnotation.listType;
import static org.apache.parquet.schema.LogicalTypeAnnotation.mapType;
import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType;
-import static org.apache.parquet.schema.LogicalTypeAnnotation.timeType;
-import static org.apache.parquet.schema.LogicalTypeAnnotation.timestampType;
-import static org.apache.parquet.schema.LogicalTypeAnnotation.uuidType;
-import static org.apache.parquet.schema.LogicalTypeAnnotation.variantType;
import static org.apache.parquet.schema.MessageTypeParser.parseMessageType;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
@@ -51,7 +37,6 @@
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
-import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import java.io.ByteArrayInputStream;
@@ -80,7 +65,6 @@
import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.EncodingStats;
import org.apache.parquet.column.ParquetProperties;
-import org.apache.parquet.column.schema.EdgeInterpolationAlgorithm;
import org.apache.parquet.column.statistics.BinaryStatistics;
import org.apache.parquet.column.statistics.BooleanStatistics;
import org.apache.parquet.column.statistics.DoubleStatistics;
@@ -94,26 +78,19 @@
import org.apache.parquet.crypto.EncryptionPropertiesFactory;
import org.apache.parquet.crypto.FileDecryptionProperties;
import org.apache.parquet.crypto.InternalFileDecryptor;
-import org.apache.parquet.example.Paper;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroup;
import org.apache.parquet.format.BoundingBox;
import org.apache.parquet.format.ColumnChunk;
import org.apache.parquet.format.ColumnMetaData;
import org.apache.parquet.format.ConvertedType;
-import org.apache.parquet.format.DecimalType;
import org.apache.parquet.format.FieldRepetitionType;
import org.apache.parquet.format.FileMetaData;
-import org.apache.parquet.format.GeographyType;
-import org.apache.parquet.format.GeometryType;
import org.apache.parquet.format.GeospatialStatistics;
-import org.apache.parquet.format.LogicalType;
-import org.apache.parquet.format.MapType;
import org.apache.parquet.format.PageHeader;
import org.apache.parquet.format.PageType;
import org.apache.parquet.format.RowGroup;
import org.apache.parquet.format.SchemaElement;
-import org.apache.parquet.format.StringType;
import org.apache.parquet.format.Type;
import org.apache.parquet.format.Util;
import org.apache.parquet.hadoop.ParquetReader;
@@ -139,6 +116,7 @@
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type.Repetition;
import org.apache.parquet.schema.Types;
+import org.apache.parquet.schema.converters.ParquetSchemaConverter;
import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
@@ -166,50 +144,6 @@ public void testPageHeader() throws IOException {
assertEquals(pageHeader, readPageHeader);
}
- @Test
- public void testSchemaConverter() {
- ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
- List parquetSchema = parquetMetadataConverter.toParquetSchema(Paper.schema);
- MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null);
- assertEquals(Paper.schema, schema);
- }
-
- @Test
- public void testSchemaConverterDecimal() {
- ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
- List schemaElements = parquetMetadataConverter.toParquetSchema(Types.buildMessage()
- .required(PrimitiveTypeName.BINARY)
- .as(OriginalType.DECIMAL)
- .precision(9)
- .scale(2)
- .named("aBinaryDecimal")
- .optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY)
- .length(4)
- .as(OriginalType.DECIMAL)
- .precision(9)
- .scale(2)
- .named("aFixedDecimal")
- .named("Message"));
- List expected = Lists.newArrayList(
- new SchemaElement("Message").setNum_children(2),
- new SchemaElement("aBinaryDecimal")
- .setRepetition_type(FieldRepetitionType.REQUIRED)
- .setType(Type.BYTE_ARRAY)
- .setConverted_type(ConvertedType.DECIMAL)
- .setLogicalType(LogicalType.DECIMAL(new DecimalType(2, 9)))
- .setPrecision(9)
- .setScale(2),
- new SchemaElement("aFixedDecimal")
- .setRepetition_type(FieldRepetitionType.OPTIONAL)
- .setType(Type.FIXED_LEN_BYTE_ARRAY)
- .setType_length(4)
- .setConverted_type(ConvertedType.DECIMAL)
- .setLogicalType(LogicalType.DECIMAL(new DecimalType(2, 9)))
- .setPrecision(9)
- .setScale(2));
- Assert.assertEquals(expected, schemaElements);
- }
-
@Test
public void testParquetMetadataConverterWithDictionary() throws IOException {
ParquetMetadata parquetMetaData = createParquetMetaData(Encoding.PLAIN_DICTIONARY, Encoding.PLAIN);
@@ -318,155 +252,6 @@ public void testBloomFilterLength() throws IOException {
1024, convertedMetaData.getBlocks().get(0).getColumns().get(0).getBloomFilterLength());
}
- @Test
- public void testLogicalTypesBackwardCompatibleWithConvertedTypes() {
- ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
- MessageType expected = Types.buildMessage()
- .required(PrimitiveTypeName.BINARY)
- .as(OriginalType.DECIMAL)
- .precision(9)
- .scale(2)
- .named("aBinaryDecimal")
- .named("Message");
- List parquetSchema = parquetMetadataConverter.toParquetSchema(expected);
- // Set logical type field to null to test backward compatibility with files written by older API,
- // where converted_types are written to the metadata, but logicalType is missing
- parquetSchema.get(1).setLogicalType(null);
- MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null);
- assertEquals(expected, schema);
- }
-
- @Test
- public void testIncompatibleLogicalAndConvertedTypes() {
- ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
- MessageType schema = Types.buildMessage()
- .required(PrimitiveTypeName.BINARY)
- .as(OriginalType.DECIMAL)
- .precision(9)
- .scale(2)
- .named("aBinary")
- .named("Message");
- MessageType expected = Types.buildMessage()
- .required(PrimitiveTypeName.BINARY)
- .as(LogicalTypeAnnotation.jsonType())
- .named("aBinary")
- .named("Message");
-
- List parquetSchema = parquetMetadataConverter.toParquetSchema(schema);
- // Set converted type field to a different type to verify that in case of mismatch, it overrides logical type
- parquetSchema.get(1).setConverted_type(ConvertedType.JSON);
- MessageType actual = parquetMetadataConverter.fromParquetSchema(parquetSchema, null);
- assertEquals(expected, actual);
- }
-
- @Test
- public void testTimeLogicalTypes() {
- ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
- MessageType expected = Types.buildMessage()
- .required(PrimitiveTypeName.INT64)
- .as(timestampType(false, MILLIS))
- .named("aTimestampNonUtcMillis")
- .required(PrimitiveTypeName.INT64)
- .as(timestampType(true, MILLIS))
- .named("aTimestampUtcMillis")
- .required(PrimitiveTypeName.INT64)
- .as(timestampType(false, MICROS))
- .named("aTimestampNonUtcMicros")
- .required(PrimitiveTypeName.INT64)
- .as(timestampType(true, MICROS))
- .named("aTimestampUtcMicros")
- .required(PrimitiveTypeName.INT64)
- .as(timestampType(false, NANOS))
- .named("aTimestampNonUtcNanos")
- .required(PrimitiveTypeName.INT64)
- .as(timestampType(true, NANOS))
- .named("aTimestampUtcNanos")
- .required(PrimitiveTypeName.INT32)
- .as(timeType(false, MILLIS))
- .named("aTimeNonUtcMillis")
- .required(PrimitiveTypeName.INT32)
- .as(timeType(true, MILLIS))
- .named("aTimeUtcMillis")
- .required(PrimitiveTypeName.INT64)
- .as(timeType(false, MICROS))
- .named("aTimeNonUtcMicros")
- .required(PrimitiveTypeName.INT64)
- .as(timeType(true, MICROS))
- .named("aTimeUtcMicros")
- .required(PrimitiveTypeName.INT64)
- .as(timeType(false, NANOS))
- .named("aTimeNonUtcNanos")
- .required(PrimitiveTypeName.INT64)
- .as(timeType(true, NANOS))
- .named("aTimeUtcNanos")
- .named("Message");
- List parquetSchema = parquetMetadataConverter.toParquetSchema(expected);
- MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null);
- assertEquals(expected, schema);
- }
-
- @Test
- public void testLogicalToConvertedTypeConversion() {
- ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
-
- assertEquals(ConvertedType.UTF8, parquetMetadataConverter.convertToConvertedType(stringType()));
- assertEquals(ConvertedType.ENUM, parquetMetadataConverter.convertToConvertedType(enumType()));
-
- assertEquals(ConvertedType.INT_8, parquetMetadataConverter.convertToConvertedType(intType(8, true)));
- assertEquals(ConvertedType.INT_16, parquetMetadataConverter.convertToConvertedType(intType(16, true)));
- assertEquals(ConvertedType.INT_32, parquetMetadataConverter.convertToConvertedType(intType(32, true)));
- assertEquals(ConvertedType.INT_64, parquetMetadataConverter.convertToConvertedType(intType(64, true)));
- assertEquals(ConvertedType.UINT_8, parquetMetadataConverter.convertToConvertedType(intType(8, false)));
- assertEquals(ConvertedType.UINT_16, parquetMetadataConverter.convertToConvertedType(intType(16, false)));
- assertEquals(ConvertedType.UINT_32, parquetMetadataConverter.convertToConvertedType(intType(32, false)));
- assertEquals(ConvertedType.UINT_64, parquetMetadataConverter.convertToConvertedType(intType(64, false)));
- assertEquals(ConvertedType.DECIMAL, parquetMetadataConverter.convertToConvertedType(decimalType(8, 16)));
-
- assertEquals(
- ConvertedType.TIMESTAMP_MILLIS,
- parquetMetadataConverter.convertToConvertedType(timestampType(true, MILLIS)));
- assertEquals(
- ConvertedType.TIMESTAMP_MICROS,
- parquetMetadataConverter.convertToConvertedType(timestampType(true, MICROS)));
- assertNull(parquetMetadataConverter.convertToConvertedType(timestampType(true, NANOS)));
- assertEquals(
- ConvertedType.TIMESTAMP_MILLIS,
- parquetMetadataConverter.convertToConvertedType(timestampType(false, MILLIS)));
- assertEquals(
- ConvertedType.TIMESTAMP_MICROS,
- parquetMetadataConverter.convertToConvertedType(timestampType(false, MICROS)));
- assertNull(parquetMetadataConverter.convertToConvertedType(timestampType(false, NANOS)));
-
- assertEquals(
- ConvertedType.TIME_MILLIS, parquetMetadataConverter.convertToConvertedType(timeType(true, MILLIS)));
- assertEquals(
- ConvertedType.TIME_MICROS, parquetMetadataConverter.convertToConvertedType(timeType(true, MICROS)));
- assertNull(parquetMetadataConverter.convertToConvertedType(timeType(true, NANOS)));
- assertEquals(
- ConvertedType.TIME_MILLIS, parquetMetadataConverter.convertToConvertedType(timeType(false, MILLIS)));
- assertEquals(
- ConvertedType.TIME_MICROS, parquetMetadataConverter.convertToConvertedType(timeType(false, MICROS)));
- assertNull(parquetMetadataConverter.convertToConvertedType(timeType(false, NANOS)));
-
- assertEquals(ConvertedType.DATE, parquetMetadataConverter.convertToConvertedType(dateType()));
-
- assertEquals(
- ConvertedType.INTERVAL,
- parquetMetadataConverter.convertToConvertedType(
- LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance()));
- assertEquals(ConvertedType.JSON, parquetMetadataConverter.convertToConvertedType(jsonType()));
- assertEquals(ConvertedType.BSON, parquetMetadataConverter.convertToConvertedType(bsonType()));
-
- assertNull(parquetMetadataConverter.convertToConvertedType(uuidType()));
-
- assertEquals(ConvertedType.LIST, parquetMetadataConverter.convertToConvertedType(listType()));
- assertEquals(ConvertedType.MAP, parquetMetadataConverter.convertToConvertedType(mapType()));
- assertEquals(
- ConvertedType.MAP_KEY_VALUE,
- parquetMetadataConverter.convertToConvertedType(
- LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance()));
- }
-
@Test
public void testEnumEquivalence() {
ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
@@ -478,42 +263,6 @@ public void testEnumEquivalence() {
assertEquals(
encoding, parquetMetadataConverter.getEncoding(parquetMetadataConverter.getEncoding(encoding)));
}
- for (Repetition repetition : Repetition.values()) {
- assertEquals(
- repetition,
- parquetMetadataConverter.fromParquetRepetition(
- parquetMetadataConverter.toParquetRepetition(repetition)));
- }
- for (FieldRepetitionType repetition : FieldRepetitionType.values()) {
- assertEquals(
- repetition,
- parquetMetadataConverter.toParquetRepetition(
- parquetMetadataConverter.fromParquetRepetition(repetition)));
- }
- for (PrimitiveTypeName primitiveTypeName : PrimitiveTypeName.values()) {
- assertEquals(
- primitiveTypeName,
- parquetMetadataConverter.getPrimitive(parquetMetadataConverter.getType(primitiveTypeName)));
- }
- for (Type type : Type.values()) {
- assertEquals(type, parquetMetadataConverter.getType(parquetMetadataConverter.getPrimitive(type)));
- }
- for (OriginalType original : OriginalType.values()) {
- assertEquals(
- original,
- parquetMetadataConverter
- .getLogicalTypeAnnotation(
- parquetMetadataConverter.convertToConvertedType(
- LogicalTypeAnnotation.fromOriginalType(original, null)),
- null)
- .toOriginalType());
- }
- for (ConvertedType converted : ConvertedType.values()) {
- assertEquals(
- converted,
- parquetMetadataConverter.convertToConvertedType(
- parquetMetadataConverter.getLogicalTypeAnnotation(converted, null)));
- }
}
private FileMetaData metadata(long... sizes) {
@@ -1491,61 +1240,6 @@ public void testColumnIndexConversion() {
}
}
- @Test
- public void testMapLogicalType() {
- ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
- MessageType expected = Types.buildMessage()
- .requiredGroup()
- .as(mapType())
- .repeatedGroup()
- .as(LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance())
- .required(PrimitiveTypeName.BINARY)
- .as(stringType())
- .named("key")
- .required(PrimitiveTypeName.INT32)
- .named("value")
- .named("key_value")
- .named("testMap")
- .named("Message");
-
- List parquetSchema = parquetMetadataConverter.toParquetSchema(expected);
- assertEquals(5, parquetSchema.size());
- assertEquals(new SchemaElement("Message").setNum_children(1), parquetSchema.get(0));
- assertEquals(
- new SchemaElement("testMap")
- .setRepetition_type(FieldRepetitionType.REQUIRED)
- .setNum_children(1)
- .setConverted_type(ConvertedType.MAP)
- .setLogicalType(LogicalType.MAP(new MapType())),
- parquetSchema.get(1));
- // PARQUET-1879 ensure that LogicalType is not written (null) but ConvertedType is MAP_KEY_VALUE for
- // backwards-compatibility
- assertEquals(
- new SchemaElement("key_value")
- .setRepetition_type(FieldRepetitionType.REPEATED)
- .setNum_children(2)
- .setConverted_type(ConvertedType.MAP_KEY_VALUE)
- .setLogicalType(null),
- parquetSchema.get(2));
- assertEquals(
- new SchemaElement("key")
- .setType(Type.BYTE_ARRAY)
- .setRepetition_type(FieldRepetitionType.REQUIRED)
- .setConverted_type(ConvertedType.UTF8)
- .setLogicalType(LogicalType.STRING(new StringType())),
- parquetSchema.get(3));
- assertEquals(
- new SchemaElement("value")
- .setType(Type.INT32)
- .setRepetition_type(FieldRepetitionType.REQUIRED)
- .setConverted_type(null)
- .setLogicalType(null),
- parquetSchema.get(4));
-
- MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null);
- assertEquals(expected, schema);
- }
-
@Test
public void testMapLogicalTypeReadWrite() throws Exception {
MessageType messageType = Types.buildMessage()
@@ -1590,34 +1284,12 @@ public void testMapConvertedTypeReadWrite() throws Exception {
.setConverted_type(null)
.setLogicalType(null));
- ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
- MessageType messageType = parquetMetadataConverter.fromParquetSchema(oldConvertedTypeSchemaElements, null);
+ ParquetSchemaConverter parquetSchemaConverter = new ParquetSchemaConverter();
+ MessageType messageType = parquetSchemaConverter.fromParquetSchema(oldConvertedTypeSchemaElements, null);
verifyMapMessageType(messageType, "map");
}
- @Test
- public void testVariantLogicalType() {
- byte specVersion = 1;
- MessageType expected = Types.buildMessage()
- .requiredGroup()
- .as(variantType(specVersion))
- .required(PrimitiveTypeName.BINARY)
- .named("metadata")
- .required(PrimitiveTypeName.BINARY)
- .named("value")
- .named("v")
- .named("example");
-
- ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
- List parquetSchema = parquetMetadataConverter.toParquetSchema(expected);
- MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null);
- assertEquals(expected, schema);
- LogicalTypeAnnotation logicalType = schema.getType("v").getLogicalTypeAnnotation();
- assertEquals(LogicalTypeAnnotation.variantType(specVersion), logicalType);
- assertEquals(specVersion, ((LogicalTypeAnnotation.VariantLogicalTypeAnnotation) logicalType).getSpecVersion());
- }
-
private void verifyMapMessageType(final MessageType messageType, final String keyValueName) throws IOException {
Path file = new Path(temporaryFolder.newFolder("verifyMapMessageType").getPath(), keyValueName + ".parquet");
@@ -1668,143 +1340,6 @@ public void testSizeStatisticsConversion() {
assertEquals(defLevelHistogram, sizeStatistics.getDefinitionLevelHistogram());
}
- @Test
- public void testGeometryLogicalType() {
- ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
-
- // Create schema with geometry type
- MessageType schema = Types.buildMessage()
- .required(PrimitiveTypeName.BINARY)
- .as(LogicalTypeAnnotation.geometryType("EPSG:4326"))
- .named("geomField")
- .named("Message");
-
- // Convert to parquet schema and back
- List parquetSchema = parquetMetadataConverter.toParquetSchema(schema);
- MessageType actual = parquetMetadataConverter.fromParquetSchema(parquetSchema, null);
-
- // Verify the logical type is preserved
- assertEquals(schema, actual);
-
- PrimitiveType primitiveType = actual.getType("geomField").asPrimitiveType();
- LogicalTypeAnnotation logicalType = primitiveType.getLogicalTypeAnnotation();
- assertTrue(logicalType instanceof LogicalTypeAnnotation.GeometryLogicalTypeAnnotation);
- assertEquals("EPSG:4326", ((LogicalTypeAnnotation.GeometryLogicalTypeAnnotation) logicalType).getCrs());
- }
-
- @Test
- public void testGeographyLogicalType() {
- ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
-
- // Create schema with geography type
- MessageType schema = Types.buildMessage()
- .required(PrimitiveTypeName.BINARY)
- .as(LogicalTypeAnnotation.geographyType("EPSG:4326", EdgeInterpolationAlgorithm.SPHERICAL))
- .named("geogField")
- .named("Message");
-
- // Convert to parquet schema and back
- List parquetSchema = parquetMetadataConverter.toParquetSchema(schema);
- MessageType actual = parquetMetadataConverter.fromParquetSchema(parquetSchema, null);
-
- // Verify the logical type is preserved
- assertEquals(schema, actual);
-
- PrimitiveType primitiveType = actual.getType("geogField").asPrimitiveType();
- LogicalTypeAnnotation logicalType = primitiveType.getLogicalTypeAnnotation();
- assertTrue(logicalType instanceof LogicalTypeAnnotation.GeographyLogicalTypeAnnotation);
-
- LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyType =
- (LogicalTypeAnnotation.GeographyLogicalTypeAnnotation) logicalType;
- assertEquals("EPSG:4326", geographyType.getCrs());
- assertEquals(EdgeInterpolationAlgorithm.SPHERICAL, geographyType.getAlgorithm());
- }
-
- @Test
- public void testGeometryLogicalTypeWithMissingCrs() {
- // Create a Geometry logical type without specifying CRS
- GeometryType geometryType = new GeometryType();
- LogicalType logicalType = new LogicalType();
- logicalType.setGEOMETRY(geometryType);
-
- // Convert to LogicalTypeAnnotation
- ParquetMetadataConverter converter = new ParquetMetadataConverter();
- LogicalTypeAnnotation annotation = converter.getLogicalTypeAnnotation(logicalType);
-
- // Verify the annotation is created correctly
- assertNotNull("Geometry annotation should not be null", annotation);
- assertTrue(
- "Should be a GeometryLogicalTypeAnnotation",
- annotation instanceof LogicalTypeAnnotation.GeometryLogicalTypeAnnotation);
-
- LogicalTypeAnnotation.GeometryLogicalTypeAnnotation geometryAnnotation =
- (LogicalTypeAnnotation.GeometryLogicalTypeAnnotation) annotation;
-
- // Default behavior should use null or empty CRS
- assertNull("CRS should be null or empty when not specified", geometryAnnotation.getCrs());
- }
-
- @Test
- public void testGeographyLogicalTypeWithMissingParameters() {
- ParquetMetadataConverter converter = new ParquetMetadataConverter();
-
- // Create a Geography logical type without CRS and algorithm
- GeographyType geographyType = new GeographyType();
- LogicalType logicalType = new LogicalType();
- logicalType.setGEOGRAPHY(geographyType);
-
- // Convert to LogicalTypeAnnotation
- LogicalTypeAnnotation annotation = converter.getLogicalTypeAnnotation(logicalType);
-
- // Verify the annotation is created correctly
- assertNotNull("Geography annotation should not be null", annotation);
- assertTrue(
- "Should be a GeographyLogicalTypeAnnotation",
- annotation instanceof LogicalTypeAnnotation.GeographyLogicalTypeAnnotation);
-
- // Check that optional parameters are handled correctly
- LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyAnnotation =
- (LogicalTypeAnnotation.GeographyLogicalTypeAnnotation) annotation;
- assertNull("CRS should be null when not specified", geographyAnnotation.getCrs());
- // Most implementations default to LINEAR when algorithm is not specified
- assertNull("Algorithm should be null when not specified", geographyAnnotation.getAlgorithm());
-
- // Now test the round-trip conversion
- LogicalType roundTripType = converter.convertToLogicalType(annotation);
- assertEquals("setField should be GEOGRAPHY", LogicalType._Fields.GEOGRAPHY, roundTripType.getSetField());
- assertNull(
- "Round trip CRS should still be null",
- roundTripType.getGEOGRAPHY().getCrs());
- assertNull(
- "Round trip Algorithm should be null",
- roundTripType.getGEOGRAPHY().getAlgorithm());
- }
-
- @Test
- public void testGeographyLogicalTypeWithAlgorithmButNoCrs() {
- // Create a Geography logical type with algorithm but no CRS
- GeographyType geographyType = new GeographyType();
- geographyType.setAlgorithm(org.apache.parquet.format.EdgeInterpolationAlgorithm.SPHERICAL);
- LogicalType logicalType = new LogicalType();
- logicalType.setGEOGRAPHY(geographyType);
-
- // Convert to LogicalTypeAnnotation
- ParquetMetadataConverter converter = new ParquetMetadataConverter();
- LogicalTypeAnnotation annotation = converter.getLogicalTypeAnnotation(logicalType);
-
- // Verify the annotation is created correctly
- Assert.assertNotNull("Geography annotation should not be null", annotation);
- LogicalTypeAnnotation.GeographyLogicalTypeAnnotation geographyAnnotation =
- (LogicalTypeAnnotation.GeographyLogicalTypeAnnotation) annotation;
-
- // CRS should be null/empty but algorithm should be set
- assertNull("CRS should be null or empty", geographyAnnotation.getCrs());
- assertEquals(
- "Algorithm should be SPHERICAL",
- EdgeInterpolationAlgorithm.SPHERICAL,
- geographyAnnotation.getAlgorithm());
- }
-
@Test
public void testGeospatialStatisticsConversion() {
// Create a ParquetMetadataConverter
@@ -1855,8 +1390,9 @@ public void testGeospatialStatisticsConversion() {
// Create primitive geometry type for conversion back
LogicalTypeAnnotation geometryAnnotation = LogicalTypeAnnotation.geometryType("EPSG:4326");
- PrimitiveType geometryType =
- Types.required(PrimitiveTypeName.BINARY).as(geometryAnnotation).named("geometry");
+ PrimitiveType geometryType = Types.required(PrimitiveType.PrimitiveTypeName.BINARY)
+ .as(geometryAnnotation)
+ .named("geometry");
// Convert back from Thrift format
org.apache.parquet.column.statistics.geospatial.GeospatialStatistics convertedStats =
@@ -1906,8 +1442,9 @@ public void testGeospatialStatisticsWithNullBoundingBox() {
// Create primitive geometry type for conversion back
LogicalTypeAnnotation geometryAnnotation = LogicalTypeAnnotation.geometryType("EPSG:4326");
- PrimitiveType geometryType =
- Types.required(PrimitiveTypeName.BINARY).as(geometryAnnotation).named("geometry");
+ PrimitiveType geometryType = Types.required(PrimitiveType.PrimitiveTypeName.BINARY)
+ .as(geometryAnnotation)
+ .named("geometry");
// Convert back from Thrift format
org.apache.parquet.column.statistics.geospatial.GeospatialStatistics convertedStats =
@@ -1942,23 +1479,4 @@ public void testInvalidBoundingBox() {
GeospatialStatistics thriftStats = converter.toParquetGeospatialStatistics(origStats);
assertNull("Should return null for invalid BoundingBox", thriftStats);
}
-
- @Test
- public void testEdgeInterpolationAlgorithmConversion() {
- // Test conversion from Parquet to Thrift enum
- org.apache.parquet.column.schema.EdgeInterpolationAlgorithm parquetAlgo = EdgeInterpolationAlgorithm.SPHERICAL;
- org.apache.parquet.format.EdgeInterpolationAlgorithm thriftAlgo =
- ParquetMetadataConverter.fromParquetEdgeInterpolationAlgorithm(parquetAlgo);
-
- // convert the Thrift enum to the column schema enum
- org.apache.parquet.column.schema.EdgeInterpolationAlgorithm expected =
- org.apache.parquet.column.schema.EdgeInterpolationAlgorithm.SPHERICAL;
- org.apache.parquet.column.schema.EdgeInterpolationAlgorithm actual =
- ParquetMetadataConverter.toParquetEdgeInterpolationAlgorithm(thriftAlgo);
- assertEquals(expected, actual);
-
- // Test with null
- assertNull(ParquetMetadataConverter.fromParquetEdgeInterpolationAlgorithm(null));
- assertNull(ParquetMetadataConverter.toParquetEdgeInterpolationAlgorithm(null));
- }
}