From c74eee169a2edf69c62cc1bffecda0e8d1f3a749 Mon Sep 17 00:00:00 2001 From: arnavb Date: Sat, 30 Aug 2025 13:44:53 +0000 Subject: [PATCH 1/6] update --- .../java/org/apache/parquet/cli/Main.java | 3 +- .../cli/ShowSizeStatisticsCliTest.java | 50 +++++ .../apache/parquet/cli/commands/FileTest.java | 14 +- .../parquet/cli/testing/CapturingLogger.java | 203 ++++++++++++++++++ .../parquet/cli/testing/CliHarness.java | 35 +++ .../apache/parquet/cli/testing/CliResult.java | 72 +++++++ .../parquet/cli/testing/CliTestBase.java | 38 ++++ .../cli-outputs/size-stats-column.txt | 33 +++ .../resources/cli-outputs/size-stats-help.txt | 11 + .../test/resources/cli-outputs/size-stats.txt | 10 + pom.xml | 1 + 11 files changed, 462 insertions(+), 8 deletions(-) create mode 100644 parquet-cli/src/test/java/org/apache/parquet/cli/ShowSizeStatisticsCliTest.java create mode 100644 parquet-cli/src/test/java/org/apache/parquet/cli/testing/CapturingLogger.java create mode 100644 parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliHarness.java create mode 100644 parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliResult.java create mode 100644 parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliTestBase.java create mode 100644 parquet-cli/src/test/resources/cli-outputs/size-stats-column.txt create mode 100644 parquet-cli/src/test/resources/cli-outputs/size-stats-help.txt create mode 100644 parquet-cli/src/test/resources/cli-outputs/size-stats.txt diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java index e93a21e899..2154f03b44 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java @@ -88,7 +88,8 @@ public class Main extends Configured implements Tool { @VisibleForTesting final JCommander jc; - Main(Logger console) { + @VisibleForTesting + public Main(Logger console) { this.console = console; this.jc = new 
JCommander(this); this.help = new Help(jc, console); diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/ShowSizeStatisticsCliTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/ShowSizeStatisticsCliTest.java new file mode 100644 index 0000000000..5836190caa --- /dev/null +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/ShowSizeStatisticsCliTest.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.cli; + +import java.io.File; +import org.apache.parquet.cli.testing.CliTestBase; +import org.junit.Test; + +public class ShowSizeStatisticsCliTest extends CliTestBase { + + @Test + public void showSizeStatistics() throws Exception { + File file = parquetFile(); + + cli("size-stats " + file.getAbsolutePath()) + .ok() + .matchOutputFromFile("src/test/resources/cli-outputs/size-stats.txt"); + } + + @Test + public void showsHelpMessage() throws Exception { + cli("help size-stats") + .ok() + .matchOutputFromFile("src/test/resources/cli-outputs/size-stats-help.txt"); + } + + @Test + public void showsSchemaOutput() throws Exception { + File file = parquetFile(); + cli("schema " + file.getAbsolutePath()) + .ok() + .matchOutputFromFile("src/test/resources/cli-outputs/size-stats-column.txt"); + } +} diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/FileTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/FileTest.java index 6e031112ff..71da590b95 100644 --- a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/FileTest.java +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/FileTest.java @@ -33,13 +33,13 @@ public abstract class FileTest { - static final String INT32_FIELD = "int32_field"; - static final String INT64_FIELD = "int64_field"; - static final String FLOAT_FIELD = "float_field"; - static final String DOUBLE_FIELD = "double_field"; - static final String BINARY_FIELD = "binary_field"; - static final String FIXED_LEN_BYTE_ARRAY_FIELD = "flba_field"; - static final String DATE_FIELD = "date_field"; + public static final String INT32_FIELD = "int32_field"; + public static final String INT64_FIELD = "int64_field"; + public static final String FLOAT_FIELD = "float_field"; + public static final String DOUBLE_FIELD = "double_field"; + public static final String BINARY_FIELD = "binary_field"; + public static final String FIXED_LEN_BYTE_ARRAY_FIELD = "flba_field"; + public static final 
String DATE_FIELD = "date_field"; static final String[] COLORS = {"RED", "BLUE", "YELLOW", "GREEN", "WHITE"}; diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CapturingLogger.java b/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CapturingLogger.java new file mode 100644 index 0000000000..a275c47a28 --- /dev/null +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CapturingLogger.java @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.cli.testing; + +import org.slf4j.helpers.MarkerIgnoringBase; +import org.slf4j.helpers.MessageFormatter; + +// CapturingLogger is a wrapper around the slf4j logger to capture CLI output to use with tests. +final class CapturingLogger extends MarkerIgnoringBase implements org.slf4j.Logger { + private final StringBuilder buf = new StringBuilder(); + + @Override + public String getName() { + return "CliTestLogger"; + } + + private void append(String msg) { + if (msg != null && !msg.isEmpty()) { + buf.append(msg).append('\n'); + } + } + + private void log(String fmt, Object... 
args) { + String message = MessageFormatter.arrayFormat(fmt, args).getMessage(); + append(message); + } + + String dump() { + return buf.toString(); + } + + + // Since the CLI logic can call any console method, this is some needed delegator code to + // ensure all methods are coverted and that the test harness does not miss anything. + // Unfortunately slf4j API does not make this easy to do in a generic way, so we + // have to manually add each method. + + @Override + public boolean isTraceEnabled() { + return true; + } + + @Override + public boolean isDebugEnabled() { + return true; + } + + @Override + public boolean isInfoEnabled() { + return true; + } + + @Override + public boolean isWarnEnabled() { + return true; + } + + @Override + public boolean isErrorEnabled() { + return true; + } + + @Override + public void trace(String msg) { + append(msg); + } + + @Override + public void trace(String format, Object arg) { + log(format, arg); + } + + @Override + public void trace(String format, Object arg1, Object arg2) { + log(format, arg1, arg2); + } + + @Override + public void trace(String format, Object... arguments) { + log(format, arguments); + } + + @Override + public void trace(String msg, Throwable t) { + append(msg); + } + + @Override + public void debug(String msg) { + append(msg); + } + + @Override + public void debug(String format, Object arg) { + log(format, arg); + } + + @Override + public void debug(String format, Object arg1, Object arg2) { + log(format, arg1, arg2); + } + + @Override + public void debug(String format, Object... 
arguments) { + log(format, arguments); + } + + @Override + public void debug(String msg, Throwable t) { + append(msg); + } + + @Override + public void info(String msg) { + append(msg); + } + + @Override + public void info(String format, Object arg) { + log(format, arg); + } + + @Override + public void info(String format, Object arg1, Object arg2) { + log(format, arg1, arg2); + } + + @Override + public void info(String format, Object... arguments) { + log(format, arguments); + } + + @Override + public void info(String msg, Throwable t) { + append(msg); + } + + @Override + public void warn(String msg) { + append(msg); + } + + @Override + public void warn(String format, Object arg) { + log(format, arg); + } + + @Override + public void warn(String format, Object arg1, Object arg2) { + log(format, arg1, arg2); + } + + @Override + public void warn(String format, Object... arguments) { + log(format, arguments); + } + + @Override + public void warn(String msg, Throwable t) { + append(msg); + } + + @Override + public void error(String msg) { + append(msg); + } + + @Override + public void error(String format, Object arg) { + log(format, arg); + } + + @Override + public void error(String format, Object arg1, Object arg2) { + log(format, arg1, arg2); + } + + @Override + public void error(String format, Object... arguments) { + log(format, arguments); + } + + @Override + public void error(String msg, Throwable t) { + append(msg); + } +} diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliHarness.java b/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliHarness.java new file mode 100644 index 0000000000..7617151f23 --- /dev/null +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliHarness.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.cli.testing; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.cli.Main; +import org.slf4j.Logger; + +public final class CliHarness { + public CliResult run(String[] args) throws Exception { + CapturingLogger logger = new CapturingLogger(); + Main main = new Main((Logger) logger); + main.setConf(new Configuration()); + int code = main.run(args); + + CliResult result = new CliResult(code, logger.dump()); + return result; + } +} diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliResult.java b/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliResult.java new file mode 100644 index 0000000000..0e86865b29 --- /dev/null +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliResult.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.cli.testing; + +import static org.junit.Assert.*; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.nio.charset.StandardCharsets; + +public final class CliResult { + public final int exitCode; + public final String text; + + CliResult(int exitCode, String text) { + this.exitCode = exitCode; + this.text = text; + } + + public CliResult ok() { + assertEquals("exit", 0, exitCode); + return this; + } + + public CliResult fails(int code) { + assertEquals("exit", code, exitCode); + return this; + } + + public CliResult outputContains(String... parts) { + for (String p : parts) assertTrue("missing: " + p, text.contains(p)); + return this; + } + + public CliResult outputNotContains(String... 
parts) { + for (String p : parts) assertFalse("should not contain: " + p, text.contains(p)); + return this; + } + + public CliResult lineCount(int expected) { + long cnt = 0; + for (String line : text.split("\n")) { + if (!line.trim().isEmpty()) { + cnt++; + } + } + assertEquals(expected, cnt); + return this; + } + + public CliResult matchOutputFromFile(String filePath) throws Exception { + String expected = new String( + Files.readAllBytes(Paths.get(filePath)), + StandardCharsets.UTF_8); + return outputContains(expected); + } +} diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliTestBase.java b/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliTestBase.java new file mode 100644 index 0000000000..23f3af684d --- /dev/null +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliTestBase.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.cli.testing; + +import org.apache.parquet.cli.commands.ParquetFileTest; + +public abstract class CliTestBase extends ParquetFileTest { + private final CliHarness harness = new CliHarness(); + + protected CliResult cli(Object... 
args) throws Exception { + String[] a = new String[args.length]; + for (int i = 0; i < args.length; i++) { + a[i] = String.valueOf(args[i]); + } + return harness.run(a); + } + + protected CliResult cli(String commandLine) throws Exception { + String[] args = commandLine.split("\\s+"); + return cli((Object[]) args); + } +} diff --git a/parquet-cli/src/test/resources/cli-outputs/size-stats-column.txt b/parquet-cli/src/test/resources/cli-outputs/size-stats-column.txt new file mode 100644 index 0000000000..1856e22cf9 --- /dev/null +++ b/parquet-cli/src/test/resources/cli-outputs/size-stats-column.txt @@ -0,0 +1,33 @@ +{ + "type" : "record", + "name" : "schema", + "fields" : [ { + "name" : "int32_field", + "type" : "int" + }, { + "name" : "int64_field", + "type" : "long" + }, { + "name" : "float_field", + "type" : "float" + }, { + "name" : "double_field", + "type" : "double" + }, { + "name" : "binary_field", + "type" : "bytes" + }, { + "name" : "flba_field", + "type" : { + "type" : "fixed", + "name" : "flba_field", + "size" : 12 + } + }, { + "name" : "date_field", + "type" : { + "type" : "int", + "logicalType" : "date" + } + } ] +} diff --git a/parquet-cli/src/test/resources/cli-outputs/size-stats-help.txt b/parquet-cli/src/test/resources/cli-outputs/size-stats-help.txt new file mode 100644 index 0000000000..6411fd3acf --- /dev/null +++ b/parquet-cli/src/test/resources/cli-outputs/size-stats-help.txt @@ -0,0 +1,11 @@ + +Usage: parquet [general options] size-stats [command options] + + Description: + + Print size statistics for a Parquet file + + Examples: + + # Show size statistics for a Parquet file + parquet size-stats sample.parquet diff --git a/parquet-cli/src/test/resources/cli-outputs/size-stats.txt b/parquet-cli/src/test/resources/cli-outputs/size-stats.txt new file mode 100644 index 0000000000..e882863506 --- /dev/null +++ b/parquet-cli/src/test/resources/cli-outputs/size-stats.txt @@ -0,0 +1,10 @@ +Row group 0 
+-------------------------------------------------------------------------------- +column unencoded bytes rep level histogram def level histogram +[int32_field] - - - +[int64_field] - - - +[float_field] - - - +[double_field] - - - +[binary_field] 46 B - - +[flba_field] - - - +[date_field] - - - diff --git a/pom.xml b/pom.xml index 71d0615727..36979e2294 100644 --- a/pom.xml +++ b/pom.xml @@ -507,6 +507,7 @@ thrift-${thrift.version}.tar.gz **/dependency-reduced-pom.xml **/*.rej + **/cli-outputs/** From 04a283e17ab0d7df94239b57622639c74702ed4f Mon Sep 17 00:00:00 2001 From: arnavb Date: Sat, 30 Aug 2025 14:01:33 +0000 Subject: [PATCH 2/6] update --- .../cli/ShowSizeStatisticsCliTest.java | 4 +-- .../parquet/cli/testing/CapturingLogger.java | 1 - .../apache/parquet/cli/testing/CliResult.java | 7 +++-- .../parquet/cli/testing/CliTestBase.java | 26 +++++++++++++++++++ 4 files changed, 30 insertions(+), 8 deletions(-) diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/ShowSizeStatisticsCliTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/ShowSizeStatisticsCliTest.java index 5836190caa..4181b293ce 100644 --- a/parquet-cli/src/test/java/org/apache/parquet/cli/ShowSizeStatisticsCliTest.java +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/ShowSizeStatisticsCliTest.java @@ -35,9 +35,7 @@ public void showSizeStatistics() throws Exception { @Test public void showsHelpMessage() throws Exception { - cli("help size-stats") - .ok() - .matchOutputFromFile("src/test/resources/cli-outputs/size-stats-help.txt"); + cli("help size-stats").ok().matchOutputFromFile("src/test/resources/cli-outputs/size-stats-help.txt"); } @Test diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CapturingLogger.java b/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CapturingLogger.java index a275c47a28..607420136f 100644 --- a/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CapturingLogger.java +++ 
b/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CapturingLogger.java @@ -45,7 +45,6 @@ String dump() { return buf.toString(); } - // Since the CLI logic can call any console method, this is some needed delegator code to // ensure all methods are coverted and that the test harness does not miss anything. // Unfortunately slf4j API does not make this easy to do in a generic way, so we diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliResult.java b/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliResult.java index 0e86865b29..47d26d1fb2 100644 --- a/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliResult.java +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliResult.java @@ -19,9 +19,10 @@ package org.apache.parquet.cli.testing; import static org.junit.Assert.*; + +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Paths; -import java.nio.charset.StandardCharsets; public final class CliResult { public final int exitCode; @@ -64,9 +65,7 @@ public CliResult lineCount(int expected) { } public CliResult matchOutputFromFile(String filePath) throws Exception { - String expected = new String( - Files.readAllBytes(Paths.get(filePath)), - StandardCharsets.UTF_8); + String expected = new String(Files.readAllBytes(Paths.get(filePath)), StandardCharsets.UTF_8); return outputContains(expected); } } diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliTestBase.java b/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliTestBase.java index 23f3af684d..285e5563f9 100644 --- a/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliTestBase.java +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliTestBase.java @@ -20,6 +20,32 @@ import org.apache.parquet.cli.commands.ParquetFileTest; +/** + * Base class for CLI integration tests with an API for testing command output. 
+ * + * Developer Usage Examples: + * + * // Basic command execution and assertion + * cli("schema file.parquet") + * .ok() + * .outputContains("int32_field", "int64_field"); + * + * // Test help output + * cli("help size-stats") + * .ok() + * .matchOutputFromFile("expected-help.txt"); + * + * // Test error conditions + * cli("invalid-command") + * .fails(1) + * .outputContains("Unknown command"); + * + * // Test command with multiple arguments + * cli("size-stats " + parquetFile.getAbsolutePath()) + * .ok() + * .lineCount(8); + * + */ public abstract class CliTestBase extends ParquetFileTest { private final CliHarness harness = new CliHarness(); From 03f7bdf598cf7b148af6d1f82ec2b6c6d61e3d8c Mon Sep 17 00:00:00 2001 From: arnavb Date: Fri, 5 Sep 2025 04:39:02 +0000 Subject: [PATCH 3/6] update --- .../org/apache/parquet/cli/SchemaCliTest.java | 34 ++++++ .../cli/ShowSizeStatisticsCliTest.java | 111 ++++++++++++++++-- .../{size-stats-column.txt => schema.txt} | 0 .../test/resources/cli-outputs/size-stats.txt | 18 +-- 4 files changed, 147 insertions(+), 16 deletions(-) create mode 100644 parquet-cli/src/test/java/org/apache/parquet/cli/SchemaCliTest.java rename parquet-cli/src/test/resources/cli-outputs/{size-stats-column.txt => schema.txt} (100%) diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/SchemaCliTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/SchemaCliTest.java new file mode 100644 index 0000000000..657d866d68 --- /dev/null +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/SchemaCliTest.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.cli; + +import java.io.File; +import org.apache.parquet.cli.testing.CliTestBase; +import org.junit.Test; + +public class SchemaCliTest extends CliTestBase { + + @Test + public void showsSchemaOutput() throws Exception { + File file = parquetFile(); + cli("schema " + file.getAbsolutePath()) + .ok() + .matchOutputFromFile("src/test/resources/cli-outputs/schema.txt"); + } +} diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/ShowSizeStatisticsCliTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/ShowSizeStatisticsCliTest.java index 4181b293ce..e3a7cdbe11 100644 --- a/parquet-cli/src/test/java/org/apache/parquet/cli/ShowSizeStatisticsCliTest.java +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/ShowSizeStatisticsCliTest.java @@ -18,31 +18,126 @@ */ package org.apache.parquet.cli; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; +import static org.apache.parquet.schema.Type.Repetition.OPTIONAL; +import static org.apache.parquet.schema.Type.Repetition.REPEATED; +import static 
org.apache.parquet.schema.Type.Repetition.REQUIRED; + import java.io.File; +import java.io.IOException; +import java.util.Random; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; import org.apache.parquet.cli.testing.CliTestBase; +import org.apache.parquet.cli.testing.CliResult; +import org.apache.parquet.example.data.simple.SimpleGroup; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.example.ExampleParquetWriter; +import org.apache.parquet.hadoop.example.GroupWriteSupport; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; import org.junit.Test; public class ShowSizeStatisticsCliTest extends CliTestBase { + private final int numRecord = 10000; + @Test public void showSizeStatistics() throws Exception { - File file = parquetFile(); + File file = createParquetFileWithStats(); cli("size-stats " + file.getAbsolutePath()) .ok() .matchOutputFromFile("src/test/resources/cli-outputs/size-stats.txt"); } + private File createParquetFileWithStats() throws IOException { + MessageType schema = new MessageType( + "schema", + new PrimitiveType(REQUIRED, INT64, "DocId"), + new PrimitiveType(REQUIRED, INT32, "CategoryId"), + new PrimitiveType(OPTIONAL, BOOLEAN, "IsActive"), + new PrimitiveType(REPEATED, FLOAT, "Prices"), + new PrimitiveType(REPEATED, BINARY, "Tags"), + new PrimitiveType(REQUIRED, BINARY, "ProductName"), + new PrimitiveType(OPTIONAL, BINARY, "Description"), + new PrimitiveType(REQUIRED, FIXED_LEN_BYTE_ARRAY, 16, "UUID")); + + Configuration conf = new Configuration(); + conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString()); + + File file = new File(getTempFolder(), "test.parquet"); + String filePath = file.getAbsolutePath(); + ExampleParquetWriter.Builder builder = + ExampleParquetWriter.builder(new Path(filePath)) + .withType(schema) + .withSizeStatisticsEnabled(true) + .withPageRowCountLimit(50) + 
.withMinRowCountForPageSizeCheck(5) + .withDictionaryEncoding(true) + .withValidation(false) + .withConf(conf); + + Random rnd = new Random(42); + try (ParquetWriter writer = builder.build()) { + for (int i = 0; i < numRecord; i++) { + SimpleGroup g = new SimpleGroup(schema); + + g.add("DocId", rnd.nextLong()); + + g.add("CategoryId", rnd.nextInt(100)); + + // Operations to generate some non null meaningful test statistics on the parquet file. + if (i % 4 != 0) { + g.add("IsActive", rnd.nextBoolean()); + } + + int priceCount = rnd.nextInt(4); + for (int p = 0; p < priceCount; p++) { + g.add("Prices", rnd.nextFloat() * 1000); + } + + String[] possibleTags = {"electronics", "bestseller", "new", "discount", "premium"}; + int tagCount = rnd.nextInt(5); + for (int t = 0; t < tagCount; t++) { + g.add("Tags", Binary.fromString(possibleTags[rnd.nextInt(possibleTags.length)])); + } + + String[] products = {"Laptop", "Mouse", "Keyboard", "Monitor", "Headphones", + "Smartphone", "Tablet", "Camera", "Printer", "Speaker"}; + g.add("ProductName", Binary.fromString(products[i % products.length] + "_Model_" + (i % 50))); + + if (i % 3 != 0) { + StringBuilder desc = new StringBuilder(); + desc.append("Product description for item ").append(i).append(": "); + int descLength = rnd.nextInt(200) + 50; + for (int j = 0; j < descLength; j++) { + desc.append((char) ('a' + rnd.nextInt(26))); + } + g.add("Description", Binary.fromString(desc.toString())); + } + + byte[] uuid = new byte[16]; + rnd.nextBytes(uuid); + g.add("UUID", Binary.fromConstantByteArray(uuid)); + + writer.write(g); + } + } + + return file; + } + @Test public void showsHelpMessage() throws Exception { cli("help size-stats").ok().matchOutputFromFile("src/test/resources/cli-outputs/size-stats-help.txt"); } - @Test - public void showsSchemaOutput() throws Exception { - File file = parquetFile(); - cli("schema " + file.getAbsolutePath()) - .ok() - .matchOutputFromFile("src/test/resources/cli-outputs/size-stats-column.txt"); 
- } } diff --git a/parquet-cli/src/test/resources/cli-outputs/size-stats-column.txt b/parquet-cli/src/test/resources/cli-outputs/schema.txt similarity index 100% rename from parquet-cli/src/test/resources/cli-outputs/size-stats-column.txt rename to parquet-cli/src/test/resources/cli-outputs/schema.txt diff --git a/parquet-cli/src/test/resources/cli-outputs/size-stats.txt b/parquet-cli/src/test/resources/cli-outputs/size-stats.txt index e882863506..7356f6a0fb 100644 --- a/parquet-cli/src/test/resources/cli-outputs/size-stats.txt +++ b/parquet-cli/src/test/resources/cli-outputs/size-stats.txt @@ -1,10 +1,12 @@ + Row group 0 -------------------------------------------------------------------------------- -column unencoded bytes rep level histogram def level histogram -[int32_field] - - - -[int64_field] - - - -[float_field] - - - -[double_field] - - - -[binary_field] 46 B - - -[flba_field] - - - -[date_field] - - - +column unencoded bytes rep level histogram def level histogram +[DocId] - - - +[CategoryId] - - - +[IsActive] - - - +[Prices] - [10000, 7425] - +[Tags] 152.405 kB [10000, 11931] - +[ProductName] 156.250 kB - - +[Description] 1.170 MB - - +[UUID] - - - From a2306935925bb07b38f20621e12b86c969c8dd85 Mon Sep 17 00:00:00 2001 From: arnavb Date: Fri, 5 Sep 2025 04:44:30 +0000 Subject: [PATCH 4/6] update --- .../org/apache/parquet/cli/SchemaCliTest.java | 4 +--- .../cli/ShowSizeStatisticsCliTest.java | 22 ++++++++++++------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/SchemaCliTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/SchemaCliTest.java index 657d866d68..6f8118bfdd 100644 --- a/parquet-cli/src/test/java/org/apache/parquet/cli/SchemaCliTest.java +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/SchemaCliTest.java @@ -27,8 +27,6 @@ public class SchemaCliTest extends CliTestBase { @Test public void showsSchemaOutput() throws Exception { File file = parquetFile(); - 
cli("schema " + file.getAbsolutePath()) - .ok() - .matchOutputFromFile("src/test/resources/cli-outputs/schema.txt"); + cli("schema " + file.getAbsolutePath()).ok().matchOutputFromFile("src/test/resources/cli-outputs/schema.txt"); } } diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/ShowSizeStatisticsCliTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/ShowSizeStatisticsCliTest.java index e3a7cdbe11..7f757e541d 100644 --- a/parquet-cli/src/test/java/org/apache/parquet/cli/ShowSizeStatisticsCliTest.java +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/ShowSizeStatisticsCliTest.java @@ -20,7 +20,6 @@ import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; @@ -35,12 +34,11 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.parquet.cli.testing.CliTestBase; -import org.apache.parquet.cli.testing.CliResult; import org.apache.parquet.example.data.simple.SimpleGroup; -import org.apache.parquet.io.api.Binary; import org.apache.parquet.hadoop.ParquetWriter; import org.apache.parquet.hadoop.example.ExampleParquetWriter; import org.apache.parquet.hadoop.example.GroupWriteSupport; +import org.apache.parquet.io.api.Binary; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.PrimitiveType; import org.junit.Test; @@ -75,8 +73,7 @@ private File createParquetFileWithStats() throws IOException { File file = new File(getTempFolder(), "test.parquet"); String filePath = file.getAbsolutePath(); - ExampleParquetWriter.Builder builder = - 
ExampleParquetWriter.builder(new Path(filePath)) + ExampleParquetWriter.Builder builder = ExampleParquetWriter.builder(new Path(filePath)) .withType(schema) .withSizeStatisticsEnabled(true) .withPageRowCountLimit(50) @@ -110,8 +107,18 @@ private File createParquetFileWithStats() throws IOException { g.add("Tags", Binary.fromString(possibleTags[rnd.nextInt(possibleTags.length)])); } - String[] products = {"Laptop", "Mouse", "Keyboard", "Monitor", "Headphones", - "Smartphone", "Tablet", "Camera", "Printer", "Speaker"}; + String[] products = { + "Laptop", + "Mouse", + "Keyboard", + "Monitor", + "Headphones", + "Smartphone", + "Tablet", + "Camera", + "Printer", + "Speaker" + }; g.add("ProductName", Binary.fromString(products[i % products.length] + "_Model_" + (i % 50))); if (i % 3 != 0) { @@ -139,5 +146,4 @@ private File createParquetFileWithStats() throws IOException { public void showsHelpMessage() throws Exception { cli("help size-stats").ok().matchOutputFromFile("src/test/resources/cli-outputs/size-stats-help.txt"); } - } From 7b4d4eaf7d16cf18f63494b38261bd9308a44e0f Mon Sep 17 00:00:00 2001 From: arnavb Date: Fri, 5 Sep 2025 05:14:57 +0000 Subject: [PATCH 5/6] update --- .../java/org/apache/parquet/cli/Main.java | 3 +-- .../cli/{testing => }/CapturingLogger.java | 2 +- .../parquet/cli/{testing => }/CliHarness.java | 3 +-- .../parquet/cli/{testing => }/CliResult.java | 2 +- .../cli/{testing => }/CliTestBase.java | 2 +- .../org/apache/parquet/cli/SchemaCliTest.java | 1 - .../cli/ShowSizeStatisticsCliTest.java | 1 - .../apache/parquet/cli/commands/FileTest.java | 14 +++++++------- .../test/resources/cli-outputs/size-stats.txt | 19 ++++++++++--------- 9 files changed, 22 insertions(+), 25 deletions(-) rename parquet-cli/src/test/java/org/apache/parquet/cli/{testing => }/CapturingLogger.java (99%) rename parquet-cli/src/test/java/org/apache/parquet/cli/{testing => }/CliHarness.java (94%) rename parquet-cli/src/test/java/org/apache/parquet/cli/{testing => 
}/CliResult.java (98%) rename parquet-cli/src/test/java/org/apache/parquet/cli/{testing => }/CliTestBase.java (98%) diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java index 2154f03b44..e93a21e899 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java @@ -88,8 +88,7 @@ public class Main extends Configured implements Tool { @VisibleForTesting final JCommander jc; - @VisibleForTesting - public Main(Logger console) { + Main(Logger console) { this.console = console; this.jc = new JCommander(this); this.help = new Help(jc, console); diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CapturingLogger.java b/parquet-cli/src/test/java/org/apache/parquet/cli/CapturingLogger.java similarity index 99% rename from parquet-cli/src/test/java/org/apache/parquet/cli/testing/CapturingLogger.java rename to parquet-cli/src/test/java/org/apache/parquet/cli/CapturingLogger.java index 607420136f..646f51c707 100644 --- a/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CapturingLogger.java +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/CapturingLogger.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.apache.parquet.cli.testing; +package org.apache.parquet.cli; import org.slf4j.helpers.MarkerIgnoringBase; import org.slf4j.helpers.MessageFormatter; diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliHarness.java b/parquet-cli/src/test/java/org/apache/parquet/cli/CliHarness.java similarity index 94% rename from parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliHarness.java rename to parquet-cli/src/test/java/org/apache/parquet/cli/CliHarness.java index 7617151f23..435abe7d36 100644 --- a/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliHarness.java +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/CliHarness.java @@ -16,10 +16,9 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.parquet.cli.testing; +package org.apache.parquet.cli; import org.apache.hadoop.conf.Configuration; -import org.apache.parquet.cli.Main; import org.slf4j.Logger; public final class CliHarness { diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliResult.java b/parquet-cli/src/test/java/org/apache/parquet/cli/CliResult.java similarity index 98% rename from parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliResult.java rename to parquet-cli/src/test/java/org/apache/parquet/cli/CliResult.java index 47d26d1fb2..c610341fa1 100644 --- a/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliResult.java +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/CliResult.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.apache.parquet.cli.testing; +package org.apache.parquet.cli; import static org.junit.Assert.*; diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliTestBase.java b/parquet-cli/src/test/java/org/apache/parquet/cli/CliTestBase.java similarity index 98% rename from parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliTestBase.java rename to parquet-cli/src/test/java/org/apache/parquet/cli/CliTestBase.java index 285e5563f9..98c81e9d43 100644 --- a/parquet-cli/src/test/java/org/apache/parquet/cli/testing/CliTestBase.java +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/CliTestBase.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.parquet.cli.testing; +package org.apache.parquet.cli; import org.apache.parquet.cli.commands.ParquetFileTest; diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/SchemaCliTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/SchemaCliTest.java index 6f8118bfdd..1d35f80a75 100644 --- a/parquet-cli/src/test/java/org/apache/parquet/cli/SchemaCliTest.java +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/SchemaCliTest.java @@ -19,7 +19,6 @@ package org.apache.parquet.cli; import java.io.File; -import org.apache.parquet.cli.testing.CliTestBase; import org.junit.Test; public class SchemaCliTest extends CliTestBase { diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/ShowSizeStatisticsCliTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/ShowSizeStatisticsCliTest.java index 7f757e541d..78d28d3e91 100644 --- a/parquet-cli/src/test/java/org/apache/parquet/cli/ShowSizeStatisticsCliTest.java +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/ShowSizeStatisticsCliTest.java @@ -33,7 +33,6 @@ import java.util.Random; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; -import org.apache.parquet.cli.testing.CliTestBase; import 
org.apache.parquet.example.data.simple.SimpleGroup; import org.apache.parquet.hadoop.ParquetWriter; import org.apache.parquet.hadoop.example.ExampleParquetWriter; diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/FileTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/FileTest.java index 71da590b95..6e031112ff 100644 --- a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/FileTest.java +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/FileTest.java @@ -33,13 +33,13 @@ public abstract class FileTest { - public static final String INT32_FIELD = "int32_field"; - public static final String INT64_FIELD = "int64_field"; - public static final String FLOAT_FIELD = "float_field"; - public static final String DOUBLE_FIELD = "double_field"; - public static final String BINARY_FIELD = "binary_field"; - public static final String FIXED_LEN_BYTE_ARRAY_FIELD = "flba_field"; - public static final String DATE_FIELD = "date_field"; + static final String INT32_FIELD = "int32_field"; + static final String INT64_FIELD = "int64_field"; + static final String FLOAT_FIELD = "float_field"; + static final String DOUBLE_FIELD = "double_field"; + static final String BINARY_FIELD = "binary_field"; + static final String FIXED_LEN_BYTE_ARRAY_FIELD = "flba_field"; + static final String DATE_FIELD = "date_field"; static final String[] COLORS = {"RED", "BLUE", "YELLOW", "GREEN", "WHITE"}; diff --git a/parquet-cli/src/test/resources/cli-outputs/size-stats.txt b/parquet-cli/src/test/resources/cli-outputs/size-stats.txt index 7356f6a0fb..06f261276b 100644 --- a/parquet-cli/src/test/resources/cli-outputs/size-stats.txt +++ b/parquet-cli/src/test/resources/cli-outputs/size-stats.txt @@ -1,12 +1,13 @@ + Row group 0 -------------------------------------------------------------------------------- -column unencoded bytes rep level histogram def level histogram -[DocId] - - - -[CategoryId] - - - -[IsActive] - - - -[Prices] - [10000, 7425] - -[Tags] 
152.405 kB [10000, 11931] - -[ProductName] 156.250 kB - - -[Description] 1.170 MB - - -[UUID] - - - +column unencoded bytes rep level histogram def level histogram +[DocId] - - - +[CategoryId] - - - +[IsActive] - - - +[Prices] - [10000, 7425] - +[Tags] 152.405 kB [10000, 11931] - +[ProductName] 156.250 kB - - +[Description] 1.170 MB - - +[UUID] - - - From 57011aeb4314517d8c2373c1df680f66232ff3e3 Mon Sep 17 00:00:00 2001 From: arnavb Date: Fri, 5 Sep 2025 06:34:06 +0000 Subject: [PATCH 6/6] update --- parquet-cli/src/test/resources/cli-outputs/size-stats-help.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/parquet-cli/src/test/resources/cli-outputs/size-stats-help.txt b/parquet-cli/src/test/resources/cli-outputs/size-stats-help.txt index 6411fd3acf..39e887614b 100644 --- a/parquet-cli/src/test/resources/cli-outputs/size-stats-help.txt +++ b/parquet-cli/src/test/resources/cli-outputs/size-stats-help.txt @@ -1,4 +1,3 @@ - Usage: parquet [general options] size-stats [command options] Description: