Skip to content

Commit 25879bd

Browse files
Add support for csv files as input (#152)
* Add support for csv files as input * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 943836c commit 25879bd

File tree

2 files changed

+174
-43
lines changed

2 files changed

+174
-43
lines changed

src/virtualship/utils.py

Lines changed: 66 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
import warnings
23
from datetime import timedelta
34
from functools import lru_cache
@@ -42,37 +43,33 @@ def _generic_load_yaml(data: str, model: BaseModel) -> BaseModel:
4243
return model.model_validate(yaml.safe_load(data))
4344

4445

45-
def mfp_to_yaml(excel_file_path: str, yaml_output_path: str): # noqa: D417
46-
"""
47-
Generates a YAML file with spatial and temporal information based on instrument data from MFP excel file.
46+
def load_coordinates(file_path):
    """Load a coordinates table from a csv or excel file.

    The pandas reader is chosen from the file extension. Raises
    FileNotFoundError when the path does not exist, and RuntimeError
    (chaining the underlying error) when the file cannot be parsed or
    carries an unsupported extension.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    ext = os.path.splitext(file_path)[-1].lower()

    # One reader per supported extension.
    readers = {
        ".xls": pd.read_excel,
        ".xlsx": pd.read_excel,
        ".csv": pd.read_csv,
    }

    try:
        reader = readers.get(ext)
        if reader is None:
            raise ValueError(f"Unsupported file extension {ext}.")
        return reader(file_path)
    except Exception as e:
        # Any failure (parse error or unknown extension) surfaces uniformly.
        raise RuntimeError(
            "Could not read coordinates data from the provided file. "
            "Ensure it is either a csv or excel file."
        ) from e
67+
68+
69+
def validate_coordinates(coordinates_data):
7070
# Expected column headers
7171
expected_columns = {"Station Type", "Name", "Latitude", "Longitude", "Instrument"}
7272

73-
# Read data from Excel
74-
coordinates_data = pd.read_excel(excel_file_path)
75-
7673
# Check if the headers match the expected ones
7774
actual_columns = set(coordinates_data.columns)
7875

@@ -104,6 +101,51 @@ def mfp_to_yaml(excel_file_path: str, yaml_output_path: str): # noqa: D417
104101
# Continue with the rest of the function after validation...
105102
coordinates_data = coordinates_data.dropna()
106103

104+
# Convert latitude and longitude to floats, replacing commas with dots
105+
# Handles case when the latitude and longitude have decimals with commas
106+
if coordinates_data["Latitude"].dtype in ["object", "string"]:
107+
coordinates_data["Latitude"] = coordinates_data["Latitude"].apply(
108+
lambda x: float(x.replace(",", "."))
109+
)
110+
111+
if coordinates_data["Longitude"].dtype in ["object", "string"]:
112+
coordinates_data["Longitude"] = coordinates_data["Longitude"].apply(
113+
lambda x: float(x.replace(",", "."))
114+
)
115+
116+
return coordinates_data
117+
118+
119+
def mfp_to_yaml(coordinates_file_path: str, yaml_output_path: str): # noqa: D417
120+
"""
121+
Generates a YAML file with spatial and temporal information based on instrument data from MFP excel file.
122+
123+
Parameters
124+
----------
125+
- excel_file_path (str): Path to the Excel file containing coordinate and instrument data.
126+
127+
The function:
128+
1. Reads instrument and location data from the Excel file.
129+
2. Determines the maximum depth and buffer based on the instruments present.
130+
3. Ensures longitude and latitude values remain valid after applying buffer adjustments.
131+
4. returns the yaml information.
132+
133+
"""
134+
# Importing Schedule and related models from expedition module
135+
from virtualship.expedition.instrument_type import InstrumentType
136+
from virtualship.expedition.schedule import Schedule
137+
from virtualship.expedition.space_time_region import (
138+
SpaceTimeRegion,
139+
SpatialRange,
140+
TimeRange,
141+
)
142+
from virtualship.expedition.waypoint import Location, Waypoint
143+
144+
# Read data from file
145+
coordinates_data = load_coordinates(coordinates_file_path)
146+
147+
coordinates_data = validate_coordinates(coordinates_data)
148+
107149
# maximum depth (in meters), buffer (in degrees) for each instrument
108150
instrument_max_depths = {
109151
"XBT": 2000,

tests/test_mfp_to_yaml.py

Lines changed: 108 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import os
2+
13
import pandas as pd
24
import pytest
35

@@ -11,27 +13,71 @@ def valid_mfp_data():
1113
{
1214
"Station Type": ["A", "B", "C"],
1315
"Name": ["Station1", "Station2", "Station3"],
14-
"Latitude": [30, 31, 32],
15-
"Longitude": [-44, -45, -46],
16+
"Latitude": [30.8, 31.2, 32.5],
17+
"Longitude": [-44.3, -45.1, -46.7],
1618
"Instrument": ["CTD, DRIFTER", "ARGO_FLOAT", "XBT, CTD, DRIFTER"],
1719
}
1820
)
1921

2022

23+
# Fixture for Excel file
@pytest.fixture
def valid_excel_mfp_file(tmp_path):
    """Valid MFP data written to a real .xlsx file on disk."""
    excel_path = tmp_path / "file.xlsx"
    valid_mfp_data().to_excel(excel_path, index=False)
    return excel_path
29+
30+
31+
# Fixture for CSV file
@pytest.fixture
def valid_csv_mfp_file(tmp_path):
    """Valid MFP data written to a plain csv file."""
    csv_path = tmp_path / "file.csv"
    valid_mfp_data().to_csv(csv_path, index=False)
    return csv_path
37+
38+
39+
@pytest.fixture
def valid_csv_mfp_file_with_commas(tmp_path):
    """Valid MFP data as csv, but using decimal commas instead of dots."""
    csv_path = tmp_path / "file.csv"
    valid_mfp_data().to_csv(csv_path, decimal=",", index=False)
    return csv_path
44+
45+
46+
@pytest.fixture
47+
def invalid_mfp_file(tmp_path):
48+
path = tmp_path / "file.csv"
49+
valid_mfp_data().to_csv(path, decimal=",", sep="|", index=False)
50+
51+
return path
52+
53+
54+
@pytest.fixture
def unsupported_extension_mfp_file(tmp_path):
    """Valid csv content saved under an extension the loader does not accept."""
    odd_path = tmp_path / "file.unsupported"
    valid_mfp_data().to_csv(odd_path, index=False)
    return odd_path
60+
61+
62+
@pytest.fixture
def nonexistent_mfp_file(tmp_path):
    """A path inside tmp_path that is never written to disk."""
    return tmp_path / "non_file.csv"
67+
68+
69+
@pytest.fixture
def missing_instruments_column_mfp_file(tmp_path):
    """Excel file lacking the 'Instrument' column."""
    xlsx_path = tmp_path / "file.xlsx"
    valid_mfp_data().drop(columns=["Instrument"]).to_excel(xlsx_path, index=False)
    return xlsx_path
2674

2775

2876
@pytest.fixture
def missing_columns_mfp_file(tmp_path):
    """Excel file lacking the 'Longitude' column."""
    xlsx_path = tmp_path / "file.xlsx"
    trimmed = valid_mfp_data().drop(columns=["Longitude"])
    trimmed.to_excel(xlsx_path, index=False)
    return xlsx_path
3581

3682

3783
@pytest.fixture
@@ -43,8 +89,14 @@ def unexpected_header_mfp_file(tmp_path):
4389
yield path
4490

4591

46-
def test_mfp_to_yaml_success(valid_mfp_file, tmp_path):
47-
"""Test that mfp_to_yaml correctly processes a valid MFP Excel file."""
92+
@pytest.mark.parametrize(
93+
"fixture_name",
94+
["valid_excel_mfp_file", "valid_csv_mfp_file", "valid_csv_mfp_file_with_commas"],
95+
)
96+
def test_mfp_to_yaml_success(request, fixture_name, tmp_path):
97+
"""Test that mfp_to_yaml correctly processes a valid MFP file."""
98+
valid_mfp_file = request.getfixturevalue(fixture_name)
99+
48100
yaml_output_path = tmp_path / "schedule.yaml"
49101

50102
# Run function (No need to mock open() for YAML, real file is created)
@@ -66,15 +118,52 @@ def test_mfp_to_yaml_success(valid_mfp_file, tmp_path):
66118
]
67119

68120

69-
def test_mfp_to_yaml_missing_headers(missing_columns_mfp_file, tmp_path):
70-
"""Test that mfp_to_yaml raises an error when required columns are missing."""
121+
# Each case: (fixture that builds the bad input, expected exception type,
# message or regex that pytest.raises must match).
@pytest.mark.parametrize(
    "fixture_name,error,match",
    [
        pytest.param(
            "nonexistent_mfp_file",
            FileNotFoundError,
            # Plain literal: os.path.basename("/non_file.csv") was a constant no-op call.
            "non_file.csv",
            id="FileNotFound",
        ),
        pytest.param(
            "unsupported_extension_mfp_file",
            RuntimeError,
            "Could not read coordinates data from the provided file. Ensure it is either a csv or excel file.",
            id="UnsupportedExtension",
        ),
        pytest.param(
            "invalid_mfp_file",
            RuntimeError,
            "Could not read coordinates data from the provided file. Ensure it is either a csv or excel file.",
            id="InvalidFile",
        ),
        pytest.param(
            "missing_instruments_column_mfp_file",
            ValueError,
            "Error: Missing column 'Instrument'. Have you added this column after exporting from MFP?",
            id="MissingInstruments",
        ),
        pytest.param(
            "missing_columns_mfp_file",
            ValueError,
            (
                r"Error: Found columns \[.*?('Station Type'| 'Name'| 'Latitude'| 'Instrument').*?\], "
                r"but expected columns \[.*?('Station Type'| 'Name'| 'Latitude'| 'Instrument'| 'Longitude').*?\]."
            ),
            id="MissingColumns",
        ),
    ],
)
def test_mfp_to_yaml_exceptions(request, fixture_name, error, match, tmp_path):
    """mfp_to_yaml must fail with the right exception type and message for
    every category of invalid input file.

    The fixture is resolved lazily by name via request.getfixturevalue so
    each case builds only its own input file.
    """
    input_file = request.getfixturevalue(fixture_name)
    yaml_output_path = tmp_path / "schedule.yaml"

    with pytest.raises(error, match=match):
        mfp_to_yaml(input_file, yaml_output_path)
78167

79168

80169
def test_mfp_to_yaml_extra_headers(unexpected_header_mfp_file, tmp_path):
@@ -85,12 +174,12 @@ def test_mfp_to_yaml_extra_headers(unexpected_header_mfp_file, tmp_path):
85174
mfp_to_yaml(unexpected_header_mfp_file, yaml_output_path)
86175

87176

88-
def test_mfp_to_yaml_instrument_conversion(valid_mfp_file, tmp_path):
177+
def test_mfp_to_yaml_instrument_conversion(valid_excel_mfp_file, tmp_path):
89178
"""Test that instruments are correctly converted into InstrumentType enums."""
90179
yaml_output_path = tmp_path / "schedule.yaml"
91180

92181
# Run function
93-
mfp_to_yaml(valid_mfp_file, yaml_output_path)
182+
mfp_to_yaml(valid_excel_mfp_file, yaml_output_path)
94183

95184
# Load the generated YAML
96185
data = Schedule.from_yaml(yaml_output_path)

0 commit comments

Comments
 (0)