Commit 921a51e

Enhancement and simplification of CSV error management
1 parent ef5bfa6 · commit 921a51e

4 files changed (+76, -56 lines)

src/main/java/edu/ie3/datamodel/io/source/csv/CsvDataSource.java

Lines changed: 46 additions & 36 deletions
@@ -161,41 +161,44 @@ protected Set<Path> getIndividualTimeSeriesFilePaths() {
    *     occurred
    */
   protected Map<String, String> buildFieldsToAttributes(
-      final String csvRow, final String[] headline) {
+      final String csvRow, final String[] headline) throws SourceException {
 
     TreeMap<String, String> insensitiveFieldsToAttributes =
         new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
 
-    try {
-      String[] fieldVals = parseCsvRow(csvRow, csvSep);
+    String[] fieldVals = parseCsvRow(csvRow, csvSep);
+    insensitiveFieldsToAttributes.putAll(
+        IntStream.range(0, Math.min(fieldVals.length, headline.length))
+            .boxed()
+            .collect(
+                Collectors.toMap(
+                    k -> StringUtils.snakeCaseToCamelCase(headline[k]), v -> fieldVals[v])));
 
-      if (fieldVals.length != headline.length) {
-        throw new SourceException(
-            "The size of the headline does not fit to the size of the attribute fields.\nHeadline: "
-                + String.join(", ", headline)
-                + "\nCsvRow: "
-                + csvRow.trim()
-                + ".\nPlease check:"
-                + "\n - is the csv separator in the file matching the separator provided in the constructor ('"
-                + csvSep
-                + "')"
-                + "\n - does the number of columns match the number of headline fields "
-                + "\n - are you using a valid RFC 4180 formatted csv row?");
-      }
+    if (fieldVals.length != headline.length) {
+      throw new SourceException(
+          "The size of the headline ("
+              + headline.length
+              + ") does not fit to the size of the attribute fields ("
+              + fieldVals.length
+              + ").\nHeadline: "
+              + String.join(", ", headline)
+              + "\nRow: "
+              + csvRow.trim()
+              + ".\nPlease check:"
+              + "\n - is the csv separator in the file matching the separator provided in the constructor ('"
+              + csvSep
+              + "')"
+              + "\n - does the number of columns match the number of headline fields "
+              + "\n - are you using a valid RFC 4180 formatted csv row?");
+    }
 
-      insensitiveFieldsToAttributes.putAll(
-          IntStream.range(0, headline.length)
-              .boxed()
-              .collect(
-                  Collectors.toMap(
-                      k -> StringUtils.snakeCaseToCamelCase(headline[k]), v -> fieldVals[v])));
-    } catch (SourceException e) {
-      log.error(
-          "Cannot build fields to attributes map for row '{}' with headline '{}'.",
-          csvRow.trim(),
-          String.join(",", headline),
-          e);
+    if (insensitiveFieldsToAttributes.size() != fieldVals.length) {
+      throw new SourceException(
+          "There might be duplicate headline elements.\nHeadline: "
+              + String.join(", ", headline)
+              + ".\nPlease keep in mind that headlines are case-insensitive and underscores from snake case are ignored.");
    }
+
     return insensitiveFieldsToAttributes;
   }
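
The new size comparison works because insensitiveFieldsToAttributes is a case-insensitive TreeMap keyed by the camel-cased headline fields: two headline entries that differ only in case (or in snake-case underscores) collapse onto the same key, so the map ends up smaller than the parsed value array. A minimal, self-contained sketch of that effect (not part of the commit; the headline here is already camel-cased, so the StringUtils.snakeCaseToCamelCase step is skipped):

import java.util.TreeMap;

// Standalone illustration: keys that differ only in case collapse in a
// case-insensitive TreeMap, so 4 values end up as only 3 entries.
public class DuplicateHeadlineSketch {
  public static void main(String[] args) {
    String[] headline = {"uuid", "activePowerGradient", "ActivePowerGradient", "id"};
    String[] fieldVals = {"5ebd8f7e", "25.0", "30.0", "test_bmTypeInput"};

    TreeMap<String, String> fieldsToAttributes = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
    for (int i = 0; i < headline.length; i++) {
      fieldsToAttributes.put(headline[i], fieldVals[i]); // the second gradient entry overwrites the first
    }

    // prints "3 entries for 4 values" -> the patched buildFieldsToAttributes throws at this point
    System.out.println(fieldsToAttributes.size() + " entries for " + fieldVals.length + " values");
  }
}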

@@ -252,7 +255,7 @@ Try<Stream<Map<String, String>>, SourceException> buildStreamWithFieldsToAttribu
       // is wanted to avoid a lock on the file), but this causes a closing of the stream as well.
       // As we still want to consume the data at other places, we start a new stream instead of
       // returning the original one
-      return Success.of(csvRowFieldValueMapping(reader, headline).parallelStream());
+      return csvRowFieldValueMapping(reader, headline);
     } catch (FileNotFoundException e) {
       if (allowFileNotExisting) {
         log.warn("Unable to find file '{}': {}", filePath, e.getMessage());
@@ -282,13 +285,20 @@ private Try<Path, SourceException> getFilePath(Class<? extends Entity> entityCla
    * @param headline of the file
    * @return a list of mapping
    */
-  protected List<Map<String, String>> csvRowFieldValueMapping(
+  protected Try<Stream<Map<String, String>>, SourceException> csvRowFieldValueMapping(
       BufferedReader reader, String[] headline) {
-    return reader
-        .lines()
-        .parallel()
-        .map(csvRow -> buildFieldsToAttributes(csvRow, headline))
-        .filter(map -> !map.isEmpty())
-        .toList();
+    return Try.scanStream(
+            reader
+                .lines()
+                .parallel()
+                .map(
+                    csvRow ->
+                        Try.of(
+                            () -> buildFieldsToAttributes(csvRow, headline),
+                            SourceException.class)),
+            "Map<String, String>")
+        .transform(
+            stream -> stream.filter(map -> !map.isEmpty()),
+            e -> new SourceException("Parsing csv row failed.", e));
   }
 }
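
Try.scanStream and Try.of belong to the library's Try utility and are only called here, not defined in this commit. Conceptually, scanStream turns a stream of per-row Try results into a single Try of a stream, which fails if any row failed. A rough, hypothetical plain-Java sketch of that idea (simplified names and types, not the library's actual implementation):

import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Stream;

// Hypothetical, simplified model of "scanning" a stream of per-row results:
// each row yields either parsed fields or an exception; the scan reports the
// first exception, if any, instead of silently dropping the broken row.
final class ScanStreamSketch {

  /** Holds either the parsed fields of a row or the exception the row produced. */
  record RowResult(String[] fields, Exception error) {}

  /** Returns the first row error, if any element of the stream failed. */
  static Optional<Exception> scan(Stream<RowResult> results) {
    return results.map(RowResult::error).filter(Objects::nonNull).findFirst();
  }

  static RowResult parse(String row, String csvSep, int expectedColumns) {
    String[] fields = row.split(csvSep);
    return fields.length == expectedColumns
        ? new RowResult(fields, null)
        : new RowResult(null, new IllegalArgumentException("Cannot parse row: " + row));
  }

  public static void main(String[] args) {
    List<String> rows = List.of("a,b", "c,d", "malformed row");
    scan(rows.stream().map(row -> parse(row, ",", 2)))
        .ifPresent(e -> System.out.println("Parsing csv row failed: " + e.getMessage()));
  }
}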

src/main/java/edu/ie3/datamodel/io/source/csv/CsvIdCoordinateSource.java

Lines changed: 2 additions & 2 deletions
@@ -179,7 +179,7 @@ public List<CoordinateDistance> findCornerPoints(
   }
 
   public int getCoordinateCount() {
-    return idToCoordinate.keySet().size();
+    return idToCoordinate.size();
   }
 
   private Collection<Point> getCoordinatesInBoundingBox(
@@ -209,7 +209,7 @@ private Collection<Point> getCoordinatesInBoundingBox(
       // is wanted to avoid a lock on the file), but this causes a closing of the stream as well.
       // As we still want to consume the data at other places, we start a new stream instead of
       // returning the original one
-      return Success.of(dataSource.csvRowFieldValueMapping(reader, headline).parallelStream());
+      return dataSource.csvRowFieldValueMapping(reader, headline);
     } catch (IOException e) {
       return Failure.of(
           new SourceException("Cannot read the file for coordinate id to coordinate mapping.", e));

src/main/java/edu/ie3/datamodel/io/source/csv/CsvWeatherSource.java

Lines changed: 1 addition & 1 deletion
@@ -240,7 +240,7 @@ private Try<Stream<Map<String, String>>, SourceException> buildStreamWithFieldsT
       // is wanted to avoid a lock on the file), but this causes a closing of the stream as well.
       // As we still want to consume the data at other places, we start a new stream instead of
       // returning the original one
-      return Success.of(dataSource.csvRowFieldValueMapping(reader, headline).parallelStream());
+      return dataSource.csvRowFieldValueMapping(reader, headline);
     } catch (IOException e) {
       return Failure.of(
           new SourceException(

src/test/groovy/edu/ie3/datamodel/io/source/csv/CsvDataSourceTest.groovy

Lines changed: 27 additions & 17 deletions
@@ -5,6 +5,7 @@
  */
 package edu.ie3.datamodel.io.source.csv
 
+import edu.ie3.datamodel.exceptions.SourceException
 import edu.ie3.datamodel.io.csv.CsvIndividualTimeSeriesMetaInformation
 import edu.ie3.datamodel.io.naming.FileNamingStrategy
 import edu.ie3.datamodel.io.naming.timeseries.ColumnScheme
@@ -274,7 +275,7 @@ class CsvDataSourceTest extends Specification implements CsvTestDataMeta {
     ]
   }
 
-  def "A CsvDataSource should be able to handle several errors when the csvRow is invalid or cannot be processed"() {
+  def "A CsvDataSource should throw an exception if the headline and CSV row have different sizes"() {
     given:
     def validHeadline = [
       "uuid",
@@ -287,33 +288,42 @@ class CsvDataSourceTest extends Specification implements CsvTestDataMeta {
       "s_rated"
     ] as String[]
 
-    expect:
-    dummyCsvSource.buildFieldsToAttributes(invalidCsvRow, validHeadline) == [:]
+    when:
+    dummyCsvSource.buildFieldsToAttributes(invalidCsvRow, validHeadline)
+
+    then:
+    def exception = thrown(SourceException)
+    exception.getMessage().startsWith("The size of the headline (8) does not fit to the size of the attribute fields")
 
     where:
     invalidCsvRow || explaination
     "5ebd8f7e-dedb-4017-bb86-6373c4b68eb8;25.0;100.0;0.95;98.0;test_bmTypeInput;50.0;25.0" || "wrong separator"
-    "5ebd8f7e-dedb-4017-bb86-6373c4b68eb8,25.0,100.0,0.95,98.0,test_bmTypeInput" || "too less columns"
-    "5ebd8f7e-dedb-4017-bb86-6373c4b68eb8,25.0,100.0,0.95,98.0,test_bmTypeInput,,,," || "too much columns"
+    "5ebd8f7e-dedb-4017-bb86-6373c4b68eb8,25.0,100.0,0.95,98.0,test_bmTypeInput" || "too little columns"
+    "5ebd8f7e-dedb-4017-bb86-6373c4b68eb8,25.0,100.0,0.95,98.0,test_bmTypeInput,,,," || "too many columns"
   }
 
-  def "A CsvDataSource should be able to handle invalid headlines"() {
-    given:
-    def validCsvRow = "5ebd8f7e-dedb-4017-bb86-6373c4b68eb8,0.95,test_bmTypeInput,25.0"
 
-    expect:
-    dummyCsvSource.buildFieldsToAttributes(validCsvRow, invalidHeadline) == [:]
-
-    where:
-    invalidHeadline || explaination
-    ["uuid", "cosphi_rated", "id"] as String[] || "headline too short"
-    [
+  def "A CsvDataSource should throw an exception if there are duplicate headlines"() {
+    given:
+    def invalidHeadline = [
       "uuid",
+      "active_power_gradient",
+      "Active_Power_Gradient",
+      "capex",
       "cosphi_rated",
+      "eta_conv",
       "id",
+      "opex",
       "s_rated",
-      "capex"
-    ] as String[] || "headline too long"
+    ] as String[]
+    def validCsvRow = "5ebd8f7e-dedb-4017-bb86-6373c4b68eb8,25.0,25.0,100.0,0.95,98.0,test_bmTypeInput,50.0,25.0"
+
+    when:
+    dummyCsvSource.buildFieldsToAttributes(validCsvRow, invalidHeadline)
+
+    then:
+    def exception = thrown(SourceException)
+    exception.getMessage().startsWith("There might be duplicate headline elements.")
   }
 
   def "The CsvDataSource is able to provide correct paths to time series files"() {
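
For the "wrong separator" case in the table above, the length mismatch that now raises the exception can be reproduced in isolation. A self-contained sketch (using a plain String.split instead of the RFC 4180-aware parseCsvRow, so only an approximation of the real parsing):

// Standalone sketch of the "wrong separator" test case: a ';'-separated row read
// with csvSep ',' stays a single field, so the 8-vs-1 length check fires and the
// patched buildFieldsToAttributes throws a SourceException instead of logging.
public class WrongSeparatorSketch {
  public static void main(String[] args) {
    String[] headline = {
      "uuid", "active_power_gradient", "capex", "cosphi_rated", "eta_conv", "id", "opex", "s_rated"
    };
    String row =
        "5ebd8f7e-dedb-4017-bb86-6373c4b68eb8;25.0;100.0;0.95;98.0;test_bmTypeInput;50.0;25.0";

    String[] fieldVals = row.split(","); // the whole row remains one single field
    System.out.printf("headline: %d fields, row: %d field(s)%n", headline.length, fieldVals.length);
  }
}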
