Skip to content

Commit 231e9ac

Browse files
authored
Merge pull request #17 from embulk/fix-guess-csv-timestamp-format-for-null
Fix guessing a timestamp format from null, by using SchemaGuess of embulk-util-guess:0.2.0
2 parents 3157b45 + 534d63f commit 231e9ac

File tree

6 files changed

+33
-328
lines changed

6 files changed

+33
-328
lines changed

embulk-guess-csv/build.gradle

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,7 @@ dependencies {
2121

2222
compile project(":embulk-parser-csv")
2323
compile "com.ibm.icu:icu4j:54.1.1"
24-
compile "org.embulk:embulk-util-guess:0.1.4"
24+
compile "org.embulk:embulk-util-guess:0.2.0"
2525

2626
testImplementation "junit:junit:4.13.2"
2727
testImplementation "org.embulk:embulk-api:0.10.33"

embulk-guess-csv/gradle/dependency-locks/embulkPluginRuntime.lockfile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@ com.ibm.icu:icu4j:54.1.1
99
javax.validation:validation-api:1.1.0.Final
1010
org.embulk:embulk-util-config:0.3.1
1111
org.embulk:embulk-util-file:0.1.3
12-
org.embulk:embulk-util-guess:0.1.4
12+
org.embulk:embulk-util-guess:0.2.0
1313
org.embulk:embulk-util-json:0.1.1
1414
org.embulk:embulk-util-rubytime:0.3.2
1515
org.embulk:embulk-util-text:0.1.1

embulk-guess-csv/src/main/java/org/embulk/guess/csv/CsvGuessPlugin.java

Lines changed: 28 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -38,8 +38,10 @@
3838
import org.embulk.util.config.ConfigMapperFactory;
3939
import org.embulk.util.file.ListFileInput;
4040
import org.embulk.util.guess.CharsetGuess;
41+
import org.embulk.util.guess.GuesstimatedType;
4142
import org.embulk.util.guess.LineGuessHelper;
4243
import org.embulk.util.guess.NewlineGuess;
44+
import org.embulk.util.guess.SchemaGuess;
4345
import org.embulk.util.text.LineDecoder;
4446
import org.embulk.util.text.Newline;
4547
import org.slf4j.Logger;
@@ -156,19 +158,19 @@ ConfigDiff guessLines(final ConfigSource config, final List<String> sampleLines,
156158
}
157159

158160
final boolean headerLine;
159-
final List<SchemaGuess.GuessedType> columnTypes;
161+
final List<GuesstimatedType> columnTypes;
160162
if (uncommentedSampleLines.size() == 1) {
161163
// The file contains only 1 line. Assume that there is no header line.
162164
headerLine = false;
163165

164166
if (parserGuessed.has("trim_if_not_quoted")) {
165-
columnTypes = SCHEMA_GUESS.typesFromListRecords(sampleRecords.subList(0, 1));
167+
columnTypes = typesFromListRecords(sampleRecords.subList(0, 1));
166168
} else {
167169
final List<List<String>> sampleRecordsTrimmed =
168170
splitLines(parserGuessed, true, uncommentedSampleLines, delim, true, bufferAllocator);
169-
final List<SchemaGuess.GuessedType> columnTypesTrimmed = SCHEMA_GUESS.typesFromListRecords(sampleRecordsTrimmed);
171+
final List<GuesstimatedType> columnTypesTrimmed = typesFromListRecords(sampleRecordsTrimmed);
170172

171-
final List<SchemaGuess.GuessedType> columnTypesUntrimmed = SCHEMA_GUESS.typesFromListRecords(sampleRecords.subList(0, 1));
173+
final List<GuesstimatedType> columnTypesUntrimmed = typesFromListRecords(sampleRecords.subList(0, 1));
172174
if (columnTypesUntrimmed.equals(columnTypesTrimmed)) {
173175
parserGuessed.set("trim_if_not_quoted", false);
174176
columnTypes = columnTypesUntrimmed;
@@ -180,22 +182,22 @@ ConfigDiff guessLines(final ConfigSource config, final List<String> sampleLines,
180182
} else {
181183
// The file contains more than 1 line. If guessed first line's column types are all strings or boolean, and the types are
182184
// different from the other lines, assume that the first line is column names.
183-
final List<SchemaGuess.GuessedType> firstTypes = SCHEMA_GUESS.typesFromListRecords(sampleRecords.subList(0, 1));
184-
final List<SchemaGuess.GuessedType> otherTypesUntrimmed =
185-
SCHEMA_GUESS.typesFromListRecords(sampleRecords.subList(1, sampleRecords.size()));
185+
final List<GuesstimatedType> firstTypes = typesFromListRecords(sampleRecords.subList(0, 1));
186+
final List<GuesstimatedType> otherTypesUntrimmed =
187+
typesFromListRecords(sampleRecords.subList(1, sampleRecords.size()));
186188

187189
logger.debug("Types of the first line : {}", firstTypes);
188190
logger.debug("Types of the other lines (untrimmed): {}", otherTypesUntrimmed);
189191

190-
final List<SchemaGuess.GuessedType> otherTypes;
192+
final List<GuesstimatedType> otherTypes;
191193

192194
if (parserGuessed.has("trim_if_not_quoted")) {
193195
otherTypes = otherTypesUntrimmed;
194196
} else {
195197
final List<List<String>> sampleRecordsTrimmed =
196198
splitLines(parserGuessed, true, uncommentedSampleLines, delim, true, bufferAllocator);
197-
final List<SchemaGuess.GuessedType> otherTypesTrimmed =
198-
SCHEMA_GUESS.typesFromListRecords(sampleRecordsTrimmed.subList(1, sampleRecordsTrimmed.size()));
199+
final List<GuesstimatedType> otherTypesTrimmed =
200+
typesFromListRecords(sampleRecordsTrimmed.subList(1, sampleRecordsTrimmed.size()));
199201
if (otherTypesUntrimmed.equals(otherTypesTrimmed)) {
200202
parserGuessed.set("trim_if_not_quoted", false);
201203
otherTypes = otherTypesUntrimmed;
@@ -208,7 +210,7 @@ ConfigDiff guessLines(final ConfigSource config, final List<String> sampleLines,
208210
logger.debug("Types of the other lines: {}", otherTypes);
209211

210212
headerLine = ((!firstTypes.equals(otherTypes)
211-
&& firstTypes.stream().allMatch(t -> SchemaGuess.GuessedType.STRING.equals(t) || SchemaGuess.GuessedType.BOOLEAN.equals(t)))
213+
&& firstTypes.stream().allMatch(t -> GuesstimatedType.STRING.equals(t) || GuesstimatedType.BOOLEAN.equals(t)))
212214
|| guessStringHeaderLine(sampleRecords));
213215

214216
logger.debug("headerLine: {}", headerLine);
@@ -261,7 +263,7 @@ ConfigDiff guessLines(final ConfigSource config, final List<String> sampleLines,
261263
* @param type a guessed type
262264
* @return a new column config
263265
*/
264-
protected ConfigDiff newColumn(final String name, final SchemaGuess.GuessedType type) {
266+
protected ConfigDiff newColumn(final String name, final GuesstimatedType type) {
265267
final ConfigDiff column = newConfigDiff();
266268
column.set("name", name);
267269
column.set("type", type.toString());
@@ -604,41 +606,47 @@ private static byte[] joinBytes(final List<String> sampleLines, final Newline ne
604606
return data.toByteArray();
605607
}
606608

609+
@SuppressWarnings("unchecked")
610+
private static List<GuesstimatedType> typesFromListRecords(final List<List<String>> samples) {
611+
final List<? extends List<? extends Object>> sampleObjects = (List<? extends List<? extends Object>>) samples;
612+
return SCHEMA_GUESS.typesFromListRecords((List<List<Object>>) sampleObjects);
613+
}
614+
607615
private static final ConfigMapperFactory CONFIG_MAPPER_FACTORY = ConfigMapperFactory.builder().addDefaultModules().build();
608616

609-
private static List<Character> DELIMITER_CANDIDATES = Collections.unmodifiableList(Arrays.asList(
617+
private static final List<Character> DELIMITER_CANDIDATES = Collections.unmodifiableList(Arrays.asList(
610618
',',
611619
'\t',
612620
'|',
613621
';'
614622
));
615623

616-
private static List<Character> QUOTE_CANDIDATES = Collections.unmodifiableList(Arrays.asList(
624+
private static final List<Character> QUOTE_CANDIDATES = Collections.unmodifiableList(Arrays.asList(
617625
'\"',
618626
'\''
619627
));
620628

621-
private static List<String> ESCAPE_CANDIDATES = Collections.unmodifiableList(Arrays.asList(
629+
private static final List<String> ESCAPE_CANDIDATES = Collections.unmodifiableList(Arrays.asList(
622630
"\\",
623631
"\""
624632
));
625633

626-
private static List<String> NULL_STRING_CANDIDATES = Collections.unmodifiableList(Arrays.asList(
634+
private static final List<String> NULL_STRING_CANDIDATES = Collections.unmodifiableList(Arrays.asList(
627635
"null",
628636
"NULL",
629637
"#N/A",
630638
"\\N" // MySQL LOAD, Hive STORED AS TEXTFILE
631639
));
632640

633-
private static List<String> COMMENT_LINE_MARKER_CANDIDATES = Collections.unmodifiableList(Arrays.asList(
641+
private static final List<String> COMMENT_LINE_MARKER_CANDIDATES = Collections.unmodifiableList(Arrays.asList(
634642
"#",
635643
"//"
636644
));
637645

638-
private static SchemaGuess SCHEMA_GUESS = SchemaGuess.of();
646+
private static final SchemaGuess SCHEMA_GUESS = SchemaGuess.of(CONFIG_MAPPER_FACTORY);
639647

640-
private static int MAX_SKIP_LINES = 10;
641-
private static int NO_SKIP_DETECT_LINES = 10;
648+
private static final int MAX_SKIP_LINES = 10;
649+
private static final int NO_SKIP_DETECT_LINES = 10;
642650

643651
private static final Logger logger = LoggerFactory.getLogger(CsvGuessPlugin.class);
644652
}

0 commit comments

Comments
 (0)