diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/CapturingLogger.java b/parquet-cli/src/test/java/org/apache/parquet/cli/CapturingLogger.java
new file mode 100644
index 0000000000..646f51c707
--- /dev/null
+++ b/parquet-cli/src/test/java/org/apache/parquet/cli/CapturingLogger.java
@@ -0,0 +1,202 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.cli;
+
+import org.slf4j.helpers.FormattingTuple;
+import org.slf4j.helpers.MarkerIgnoringBase;
+import org.slf4j.helpers.MessageFormatter;
+
+/**
+ * SLF4J logger that records everything the CLI logs so tests can assert on the output.
+ *
+ * <p>Every level is reported as enabled and all messages go to one in-memory buffer, one entry
+ * per line; null and empty messages are skipped. When a call carries a {@link Throwable}, its
+ * {@code toString()} is recorded on the line after the message so that failures logged by the
+ * CLI stay visible to assertions. The buffer is a plain {@link StringBuilder} with no
+ * synchronization.
+ */
+final class CapturingLogger extends MarkerIgnoringBase implements org.slf4j.Logger {
+  private final StringBuilder buf = new StringBuilder();
+
+  @Override
+  public String getName() {
+    return "CliTestLogger";
+  }
+
+  /** Appends {@code msg} followed by a newline; null and empty messages are ignored. */
+  private void append(String msg) {
+    if (msg != null && !msg.isEmpty()) {
+      buf.append(msg).append('\n');
+    }
+  }
+
+  /** Records {@code msg}, then the {@code toString()} of {@code t} when one is supplied. */
+  private void append(String msg, Throwable t) {
+    append(msg);
+    if (t != null) {
+      append(t.toString());
+    }
+  }
+
+  /**
+   * Formats {@code fmt} with SLF4J placeholder substitution and records the result, together
+   * with any throwable the formatter extracted from the arguments.
+   */
+  private void log(String fmt, Object... args) {
+    FormattingTuple tuple = MessageFormatter.arrayFormat(fmt, args);
+    append(tuple.getMessage(), tuple.getThrowable());
+  }
+
+  /** Returns everything captured so far, one logged entry per line. */
+  String dump() {
+    return buf.toString();
+  }
+
+  // Since the CLI logic can call any console method, this is some needed delegator code to
+  // ensure all methods are covered and that the test harness does not miss anything.
+  // Unfortunately the slf4j API does not make this easy to do in a generic way, so we
+  // have to manually add each method.
+
+  @Override
+  public boolean isTraceEnabled() {
+    return true;
+  }
+
+  @Override
+  public boolean isDebugEnabled() {
+    return true;
+  }
+
+  @Override
+  public boolean isInfoEnabled() {
+    return true;
+  }
+
+  @Override
+  public boolean isWarnEnabled() {
+    return true;
+  }
+
+  @Override
+  public boolean isErrorEnabled() {
+    return true;
+  }
+
+  @Override
+  public void trace(String msg) {
+    append(msg);
+  }
+
+  @Override
+  public void trace(String format, Object arg) {
+    log(format, arg);
+  }
+
+  @Override
+  public void trace(String format, Object arg1, Object arg2) {
+    log(format, arg1, arg2);
+  }
+
+  @Override
+  public void trace(String format, Object... arguments) {
+    log(format, arguments);
+  }
+
+  @Override
+  public void trace(String msg, Throwable t) {
+    append(msg, t);
+  }
+
+  @Override
+  public void debug(String msg) {
+    append(msg);
+  }
+
+  @Override
+  public void debug(String format, Object arg) {
+    log(format, arg);
+  }
+
+  @Override
+  public void debug(String format, Object arg1, Object arg2) {
+    log(format, arg1, arg2);
+  }
+
+  @Override
+  public void debug(String format, Object... arguments) {
+    log(format, arguments);
+  }
+
+  @Override
+  public void debug(String msg, Throwable t) {
+    append(msg, t);
+  }
+
+  @Override
+  public void info(String msg) {
+    append(msg);
+  }
+
+  @Override
+  public void info(String format, Object arg) {
+    log(format, arg);
+  }
+
+  @Override
+  public void info(String format, Object arg1, Object arg2) {
+    log(format, arg1, arg2);
+  }
+
+  @Override
+  public void info(String format, Object... arguments) {
+    log(format, arguments);
+  }
+
+  @Override
+  public void info(String msg, Throwable t) {
+    append(msg, t);
+  }
+
+  @Override
+  public void warn(String msg) {
+    append(msg);
+  }
+
+  @Override
+  public void warn(String format, Object arg) {
+    log(format, arg);
+  }
+
+  @Override
+  public void warn(String format, Object arg1, Object arg2) {
+    log(format, arg1, arg2);
+  }
+
+  @Override
+  public void warn(String format, Object... arguments) {
+    log(format, arguments);
+  }
+
+  @Override
+  public void warn(String msg, Throwable t) {
+    append(msg, t);
+  }
+
+  @Override
+  public void error(String msg) {
+    append(msg);
+  }
+
+  @Override
+  public void error(String format, Object arg) {
+    log(format, arg);
+  }
+
+  @Override
+  public void error(String format, Object arg1, Object arg2) {
+    log(format, arg1, arg2);
+  }
+
+  @Override
+  public void error(String format, Object... arguments) {
+    log(format, arguments);
+  }
+
+  @Override
+  public void error(String msg, Throwable t) {
+    append(msg, t);
+  }
+}
diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/CliHarness.java b/parquet-cli/src/test/java/org/apache/parquet/cli/CliHarness.java
new file mode 100644
index 0000000000..435abe7d36
--- /dev/null
+++ b/parquet-cli/src/test/java/org/apache/parquet/cli/CliHarness.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.cli;
+
+import org.apache.hadoop.conf.Configuration;
+import org.slf4j.Logger;
+
+/** Runs the CLI {@link Main} in-process and hands back what it returned and logged. */
+public final class CliHarness {
+  /**
+   * Executes one CLI invocation with a fresh {@link CapturingLogger}, a fresh {@link Main} and
+   * a default {@link Configuration}.
+   *
+   * @param args the command-line arguments passed to {@code Main.run}
+   * @return the value returned by {@code Main.run} paired with the captured log text
+   * @throws Exception whatever {@code Main.run} throws
+   */
+  public CliResult run(String[] args) throws Exception {
+    CapturingLogger console = new CapturingLogger();
+    Main cli = new Main((Logger) console);
+    cli.setConf(new Configuration());
+
+    // Run first, then read the buffer, so the dump holds everything logged by this run.
+    int exitCode = cli.run(args);
+    return new CliResult(exitCode, console.dump());
+  }
+}
diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/CliResult.java b/parquet-cli/src/test/java/org/apache/parquet/cli/CliResult.java
new file mode 100644
index 0000000000..c610341fa1
--- /dev/null
+++ b/parquet-cli/src/test/java/org/apache/parquet/cli/CliResult.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.cli;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+
+/**
+ * Outcome of one CLI invocation: the exit code plus the text the CLI logged.
+ *
+ * <p>Every assertion method returns {@code this} so that checks can be chained.
+ */
+public final class CliResult {
+  public final int exitCode;
+  public final String text;
+
+  CliResult(int exitCode, String text) {
+    this.exitCode = exitCode;
+    this.text = text;
+  }
+
+  /** Asserts that the exit code is 0. */
+  public CliResult ok() {
+    assertEquals("exit", 0, exitCode);
+    return this;
+  }
+
+  /** Asserts that the exit code equals {@code code}. */
+  public CliResult fails(int code) {
+    assertEquals("exit", code, exitCode);
+    return this;
+  }
+
+  /** Asserts that the captured output contains each of {@code parts} as a substring. */
+  public CliResult outputContains(String... parts) {
+    for (String p : parts) {
+      // Include the actual output so a failure can be diagnosed from the report alone.
+      assertTrue("missing: " + p + "\nactual output:\n" + text, text.contains(p));
+    }
+    return this;
+  }
+
+  /** Asserts that the captured output contains none of {@code parts} as a substring. */
+  public CliResult outputNotContains(String... parts) {
+    for (String p : parts) {
+      assertFalse("should not contain: " + p + "\nactual output:\n" + text, text.contains(p));
+    }
+    return this;
+  }
+
+  /** Asserts that the captured output has exactly {@code expected} non-blank lines. */
+  public CliResult lineCount(int expected) {
+    long cnt = 0;
+    for (String line : text.split("\n")) {
+      if (!line.trim().isEmpty()) {
+        cnt++;
+      }
+    }
+    assertEquals("non-blank line count", expected, cnt);
+    return this;
+  }
+
+  /**
+   * Asserts that the captured output contains the full contents of the UTF-8 file at
+   * {@code filePath}. Despite the name this is a containment check, not an equality check.
+   *
+   * <p>{@code \r\n} is treated as {@code \n} on both sides, so a golden file checked out with
+   * CRLF line endings, or output produced with a platform line separator, still compares equal.
+   *
+   * @param filePath path of the expected-output file, resolved against the working directory
+   * @throws Exception if the file cannot be read
+   */
+  public CliResult matchOutputFromFile(String filePath) throws Exception {
+    String expected = new String(Files.readAllBytes(Paths.get(filePath)), StandardCharsets.UTF_8);
+    assertTrue(
+        "output does not contain the contents of " + filePath + "\nactual output:\n" + text,
+        normalizeNewlines(text).contains(normalizeNewlines(expected)));
+    return this;
+  }
+
+  /** Replaces every {@code \r\n} with {@code \n}. */
+  private static String normalizeNewlines(String s) {
+    return s.replace("\r\n", "\n");
+  }
+}
diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/CliTestBase.java b/parquet-cli/src/test/java/org/apache/parquet/cli/CliTestBase.java
new file mode 100644
index 0000000000..98c81e9d43
--- /dev/null
+++ b/parquet-cli/src/test/java/org/apache/parquet/cli/CliTestBase.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.cli;
+
+import org.apache.parquet.cli.commands.ParquetFileTest;
+
+/**
+ * Base class for CLI integration tests with an API for testing command output.
+ *
+ * <p>Developer usage examples:
+ *
+ * <pre>{@code
+ * // Basic command execution and assertion
+ * cli("schema file.parquet")
+ *     .ok()
+ *     .outputContains("int32_field", "int64_field");
+ *
+ * // Test help output
+ * cli("help size-stats")
+ *     .ok()
+ *     .matchOutputFromFile("expected-help.txt");
+ *
+ * // Test error conditions
+ * cli("invalid-command")
+ *     .fails(1)
+ *     .outputContains("Unknown command");
+ *
+ * // Pass arguments separately when a value (such as a file path) may contain whitespace
+ * cli("size-stats", parquetFile.getAbsolutePath())
+ *     .ok()
+ *     .lineCount(8);
+ * }</pre>
+ */
+public abstract class CliTestBase extends ParquetFileTest {
+  private final CliHarness harness = new CliHarness();
+
+  /**
+   * Runs the CLI with one argument per element of {@code args}.
+   *
+   * <p>Each element is converted with {@link String#valueOf(Object)}, so a null element becomes
+   * the string {@code "null"}. No splitting is applied, which makes this the overload to use
+   * for values that may contain whitespace.
+   */
+  protected CliResult cli(Object... args) throws Exception {
+    String[] a = new String[args.length];
+    for (int i = 0; i < args.length; i++) {
+      a[i] = String.valueOf(args[i]);
+    }
+    return harness.run(a);
+  }
+
+  /**
+   * Runs the CLI with {@code commandLine} split into arguments on runs of whitespace.
+   *
+   * <p>There is no quoting or escaping, so an argument that itself contains whitespace cannot
+   * be expressed here; use {@link #cli(Object...)} for that. Leading and trailing whitespace is
+   * ignored, and a blank command line yields zero arguments.
+   */
+  protected CliResult cli(String commandLine) throws Exception {
+    // Trim first: String.split keeps a leading empty token, which would reach the CLI as "".
+    String trimmed = commandLine.trim();
+    String[] args = trimmed.isEmpty() ? new String[0] : trimmed.split("\\s+");
+    return cli((Object[]) args);
+  }
+}
diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/SchemaCliTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/SchemaCliTest.java
new file mode 100644
index 0000000000..1d35f80a75
--- /dev/null
+++ b/parquet-cli/src/test/java/org/apache/parquet/cli/SchemaCliTest.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.cli;
+
+import java.io.File;
+import org.junit.Test;
+
+/** Checks the output of the {@code schema} command against a golden file. */
+public class SchemaCliTest extends CliTestBase {
+
+  /** Runs {@code schema} on the base class's Parquet file and compares with schema.txt. */
+  @Test
+  public void showsSchemaOutput() throws Exception {
+    File file = parquetFile();
+    // Pass the path as its own argument: the single-string overload of cli splits on
+    // whitespace, which would break the path apart if the temp directory contains a space.
+    cli("schema", file.getAbsolutePath())
+        .ok()
+        .matchOutputFromFile("src/test/resources/cli-outputs/schema.txt");
+  }
+}
diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/ShowSizeStatisticsCliTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/ShowSizeStatisticsCliTest.java
new file mode 100644
index 0000000000..78d28d3e91
--- /dev/null
+++ b/parquet-cli/src/test/java/org/apache/parquet/cli/ShowSizeStatisticsCliTest.java
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.cli;
+
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
+import static org.apache.parquet.schema.Type.Repetition.OPTIONAL;
+import static org.apache.parquet.schema.Type.Repetition.REPEATED;
+import static org.apache.parquet.schema.Type.Repetition.REQUIRED;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Random;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.example.data.Group;
+import org.apache.parquet.example.data.simple.SimpleGroup;
+import org.apache.parquet.hadoop.ParquetWriter;
+import org.apache.parquet.hadoop.example.ExampleParquetWriter;
+import org.apache.parquet.hadoop.example.GroupWriteSupport;
+import org.apache.parquet.io.api.Binary;
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.PrimitiveType;
+import org.junit.Test;
+
+/** Golden-file tests for the {@code size-stats} command and its help text. */
+public class ShowSizeStatisticsCliTest extends CliTestBase {
+
+  /** Number of records written to the generated test file. */
+  private static final int NUM_RECORDS = 10000;
+
+  /** Values drawn at random for the repeated {@code Tags} column. */
+  private static final String[] POSSIBLE_TAGS = {"electronics", "bestseller", "new", "discount", "premium"};
+
+  /** Name prefixes cycled through for the {@code ProductName} column. */
+  private static final String[] PRODUCTS = {
+    "Laptop",
+    "Mouse",
+    "Keyboard",
+    "Monitor",
+    "Headphones",
+    "Smartphone",
+    "Tablet",
+    "Camera",
+    "Printer",
+    "Speaker"
+  };
+
+  /** Runs {@code size-stats} on a generated file and compares with size-stats.txt. */
+  @Test
+  public void showSizeStatistics() throws Exception {
+    File file = createParquetFileWithStats();
+
+    // Pass the path as its own argument: the single-string overload of cli splits on
+    // whitespace, which would break the path apart if the temp directory contains a space.
+    cli("size-stats", file.getAbsolutePath())
+        .ok()
+        .matchOutputFromFile("src/test/resources/cli-outputs/size-stats.txt");
+  }
+
+  /**
+   * Writes {@code test.parquet} into the temp folder with size statistics enabled and returns
+   * it. The file holds {@link #NUM_RECORDS} records mixing required, optional and repeated
+   * columns, generated from a fixed random seed.
+   *
+   * @throws IOException if the file cannot be written
+   */
+  private File createParquetFileWithStats() throws IOException {
+    MessageType schema = new MessageType(
+        "schema",
+        new PrimitiveType(REQUIRED, INT64, "DocId"),
+        new PrimitiveType(REQUIRED, INT32, "CategoryId"),
+        new PrimitiveType(OPTIONAL, BOOLEAN, "IsActive"),
+        new PrimitiveType(REPEATED, FLOAT, "Prices"),
+        new PrimitiveType(REPEATED, BINARY, "Tags"),
+        new PrimitiveType(REQUIRED, BINARY, "ProductName"),
+        new PrimitiveType(OPTIONAL, BINARY, "Description"),
+        new PrimitiveType(REQUIRED, FIXED_LEN_BYTE_ARRAY, 16, "UUID"));
+
+    Configuration conf = new Configuration();
+    conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());
+
+    File file = new File(getTempFolder(), "test.parquet");
+    String filePath = file.getAbsolutePath();
+    ExampleParquetWriter.Builder builder = ExampleParquetWriter.builder(new Path(filePath))
+        .withType(schema)
+        .withSizeStatisticsEnabled(true)
+        .withPageRowCountLimit(50)
+        .withMinRowCountForPageSizeCheck(5)
+        .withDictionaryEncoding(true)
+        .withValidation(false)
+        .withConf(conf);
+
+    // Fixed seed keeps the generated data, and with it the expected output in size-stats.txt,
+    // reproducible. Do not reorder, add or remove rnd calls below without regenerating it.
+    Random rnd = new Random(42);
+    try (ParquetWriter<Group> writer = builder.build()) {
+      for (int i = 0; i < NUM_RECORDS; i++) {
+        SimpleGroup g = new SimpleGroup(schema);
+
+        g.add("DocId", rnd.nextLong());
+
+        g.add("CategoryId", rnd.nextInt(100));
+
+        // Operations to generate some non null meaningful test statistics on the parquet file.
+        // Every fourth record leaves the optional IsActive column unset.
+        if (i % 4 != 0) {
+          g.add("IsActive", rnd.nextBoolean());
+        }
+
+        // Between 0 and 3 prices per record.
+        int priceCount = rnd.nextInt(4);
+        for (int p = 0; p < priceCount; p++) {
+          g.add("Prices", rnd.nextFloat() * 1000);
+        }
+
+        // Between 0 and 4 tags per record.
+        int tagCount = rnd.nextInt(5);
+        for (int t = 0; t < tagCount; t++) {
+          g.add("Tags", Binary.fromString(POSSIBLE_TAGS[rnd.nextInt(POSSIBLE_TAGS.length)]));
+        }
+
+        g.add("ProductName", Binary.fromString(PRODUCTS[i % PRODUCTS.length] + "_Model_" + (i % 50)));
+
+        // Every third record leaves the optional Description column unset.
+        if (i % 3 != 0) {
+          StringBuilder desc = new StringBuilder();
+          desc.append("Product description for item ").append(i).append(": ");
+          int descLength = rnd.nextInt(200) + 50;
+          for (int j = 0; j < descLength; j++) {
+            desc.append((char) ('a' + rnd.nextInt(26)));
+          }
+          g.add("Description", Binary.fromString(desc.toString()));
+        }
+
+        byte[] uuid = new byte[16];
+        rnd.nextBytes(uuid);
+        g.add("UUID", Binary.fromConstantByteArray(uuid));
+
+        writer.write(g);
+      }
+    }
+
+    return file;
+  }
+
+  /** Runs {@code help size-stats} and compares with size-stats-help.txt. */
+  @Test
+  public void showsHelpMessage() throws Exception {
+    cli("help size-stats").ok().matchOutputFromFile("src/test/resources/cli-outputs/size-stats-help.txt");
+  }
+}
diff --git a/parquet-cli/src/test/resources/cli-outputs/schema.txt b/parquet-cli/src/test/resources/cli-outputs/schema.txt new file mode 100644 index 0000000000..1856e22cf9 --- /dev/null +++ b/parquet-cli/src/test/resources/cli-outputs/schema.txt @@ -0,0 +1,33 @@ +{ + "type" : "record", + "name" : "schema", + "fields" : [ { + "name" : "int32_field", + "type" : "int" + }, { + "name" : "int64_field", + "type" : "long" + }, { + "name" : "float_field", + "type" : "float" + }, { + "name" : "double_field", + "type" : "double" + }, { + "name" : "binary_field", + "type" : "bytes" + }, { + "name" : "flba_field", + "type" : { + "type" : 
"fixed", + "name" : "flba_field", + "size" : 12 + } + }, { + "name" : "date_field", + "type" : { + "type" : "int", + "logicalType" : "date" + } + } ] +} diff --git a/parquet-cli/src/test/resources/cli-outputs/size-stats-help.txt b/parquet-cli/src/test/resources/cli-outputs/size-stats-help.txt new file mode 100644 index 0000000000..39e887614b --- /dev/null +++ b/parquet-cli/src/test/resources/cli-outputs/size-stats-help.txt @@ -0,0 +1,10 @@ +Usage: parquet [general options] size-stats [command options] + + Description: + + Print size statistics for a Parquet file + + Examples: + + # Show size statistics for a Parquet file + parquet size-stats sample.parquet diff --git a/parquet-cli/src/test/resources/cli-outputs/size-stats.txt b/parquet-cli/src/test/resources/cli-outputs/size-stats.txt new file mode 100644 index 0000000000..06f261276b --- /dev/null +++ b/parquet-cli/src/test/resources/cli-outputs/size-stats.txt @@ -0,0 +1,13 @@ + + +Row group 0 +-------------------------------------------------------------------------------- +column unencoded bytes rep level histogram def level histogram +[DocId] - - - +[CategoryId] - - - +[IsActive] - - - +[Prices] - [10000, 7425] - +[Tags] 152.405 kB [10000, 11931] - +[ProductName] 156.250 kB - - +[Description] 1.170 MB - - +[UUID] - - - diff --git a/pom.xml b/pom.xml index 71d0615727..36979e2294 100644 --- a/pom.xml +++ b/pom.xml @@ -507,6 +507,7 @@ thrift-${thrift.version}.tar.gz **/dependency-reduced-pom.xml **/*.rej + **/cli-outputs/**