Skip to content

Commit 25879bd

Browse files
Add support for csv files as input (#152)
* Add support for csv files as input * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 943836c commit 25879bd

File tree

2 files changed

+174
-43
lines changed

2 files changed

+174
-43
lines changed

src/virtualship/utils.py

Lines changed: 66 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
import warnings
23
from datetime import timedelta
34
from functools import lru_cache
@@ -42,37 +43,33 @@ def _generic_load_yaml(data: str, model: BaseModel) -> BaseModel:
4243
return model.model_validate(yaml.safe_load(data))
4344

4445

45-
def mfp_to_yaml(excel_file_path: str, yaml_output_path: str): # noqa: D417
46-
"""
47-
Generates a YAML file with spatial and temporal information based on instrument data from MFP excel file.
46+
def load_coordinates(file_path):
    """Load a coordinates table from a csv or excel file.

    The pandas reader is chosen from the file extension. Raises
    FileNotFoundError when the path does not exist, and RuntimeError
    (chaining the underlying error) when the file cannot be parsed or
    carries an unsupported extension.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    ext = os.path.splitext(file_path)[-1].lower()

    # One reader per supported extension.
    readers = {
        ".xls": pd.read_excel,
        ".xlsx": pd.read_excel,
        ".csv": pd.read_csv,
    }

    try:
        reader = readers.get(ext)
        if reader is None:
            raise ValueError(f"Unsupported file extension {ext}.")
        return reader(file_path)
    except Exception as e:
        # Any failure (parse error or unknown extension) surfaces uniformly.
        raise RuntimeError(
            "Could not read coordinates data from the provided file. "
            "Ensure it is either a csv or excel file."
        ) from e
67+
68+
69+
def validate_coordinates(coordinates_data):
7070
# Expected column headers
7171
expected_columns = {"Station Type", "Name", "Latitude", "Longitude", "Instrument"}
7272

73-
# Read data from Excel
74-
coordinates_data = pd.read_excel(excel_file_path)
75-
7673
# Check if the headers match the expected ones
7774
actual_columns = set(coordinates_data.columns)
7875

@@ -104,6 +101,51 @@ def mfp_to_yaml(excel_file_path: str, yaml_output_path: str): # noqa: D417
104101
# Continue with the rest of the function after validation...
105102
coordinates_data = coordinates_data.dropna()
106103

104+
# Convert latitude and longitude to floats, replacing commas with dots
105+
# Handles case when the latitude and longitude have decimals with commas
106+
if coordinates_data["Latitude"].dtype in ["object", "string"]:
107+
coordinates_data["Latitude"] = coordinates_data["Latitude"].apply(
108+
lambda x: float(x.replace(",", "."))
109+
)
110+
111+
if coordinates_data["Longitude"].dtype in ["object", "string"]:
112+
coordinates_data["Longitude"] = coordinates_data["Longitude"].apply(
113+
lambda x: float(x.replace(",", "."))
114+
)
115+
116+
return coordinates_data
117+
118+
119+
def mfp_to_yaml(coordinates_file_path: str, yaml_output_path: str): # noqa: D417
120+
"""
121+
Generates a YAML file with spatial and temporal information based on instrument data from MFP excel file.
122+
123+
Parameters
124+
----------
125+
- excel_file_path (str): Path to the Excel file containing coordinate and instrument data.
126+
127+
The function:
128+
1. Reads instrument and location data from the Excel file.
129+
2. Determines the maximum depth and buffer based on the instruments present.
130+
3. Ensures longitude and latitude values remain valid after applying buffer adjustments.
131+
4. returns the yaml information.
132+
133+
"""
134+
# Importing Schedule and related models from expedition module
135+
from virtualship.expedition.instrument_type import InstrumentType
136+
from virtualship.expedition.schedule import Schedule
137+
from virtualship.expedition.space_time_region import (
138+
SpaceTimeRegion,
139+
SpatialRange,
140+
TimeRange,
141+
)
142+
from virtualship.expedition.waypoint import Location, Waypoint
143+
144+
# Read data from file
145+
coordinates_data = load_coordinates(coordinates_file_path)
146+
147+
coordinates_data = validate_coordinates(coordinates_data)
148+
107149
# maximum depth (in meters), buffer (in degrees) for each instrument
108150
instrument_max_depths = {
109151
"XBT": 2000,

tests/test_mfp_to_yaml.py

Lines changed: 108 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import os
2+
13
import pandas as pd
24
import pytest
35

@@ -11,27 +13,71 @@ def valid_mfp_data():
1113
{
1214
"Station Type": ["A", "B", "C"],
1315
"Name": ["Station1", "Station2", "Station3"],
14-
"Latitude": [30, 31, 32],
15-
"Longitude": [-44, -45, -46],
16+
"Latitude": [30.8, 31.2, 32.5],
17+
"Longitude": [-44.3, -45.1, -46.7],
1618
"Instrument": ["CTD, DRIFTER", "ARGO_FLOAT", "XBT, CTD, DRIFTER"],
1719
}
1820
)
1921

2022

23+
# Fixture for Excel file
@pytest.fixture
def valid_excel_mfp_file(tmp_path):
    """Valid MFP data written to a real .xlsx file on disk."""
    excel_path = tmp_path / "file.xlsx"
    valid_mfp_data().to_excel(excel_path, index=False)
    return excel_path
29+
30+
31+
# Fixture for CSV file
@pytest.fixture
def valid_csv_mfp_file(tmp_path):
    """Valid MFP data written to a plain csv file."""
    csv_path = tmp_path / "file.csv"
    valid_mfp_data().to_csv(csv_path, index=False)
    return csv_path
37+
38+
39+
@pytest.fixture
def valid_csv_mfp_file_with_commas(tmp_path):
    """Valid MFP data as csv, but using decimal commas instead of dots."""
    csv_path = tmp_path / "file.csv"
    valid_mfp_data().to_csv(csv_path, decimal=",", index=False)
    return csv_path
44+
45+
46+
@pytest.fixture
47+
def invalid_mfp_file(tmp_path):
48+
path = tmp_path / "file.csv"
49+
valid_mfp_data().to_csv(path, decimal=",", sep="|", index=False)
50+
51+
return path
52+
53+
54+
@pytest.fixture
def unsupported_extension_mfp_file(tmp_path):
    """Valid csv content saved under an extension the loader does not accept."""
    odd_path = tmp_path / "file.unsupported"
    valid_mfp_data().to_csv(odd_path, index=False)
    return odd_path
60+
61+
62+
@pytest.fixture
def nonexistent_mfp_file(tmp_path):
    """A path inside tmp_path that is never written to disk."""
    return tmp_path / "non_file.csv"
67+
68+
69+
@pytest.fixture
def missing_instruments_column_mfp_file(tmp_path):
    """Excel file lacking the 'Instrument' column."""
    xlsx_path = tmp_path / "file.xlsx"
    valid_mfp_data().drop(columns=["Instrument"]).to_excel(xlsx_path, index=False)
    return xlsx_path
2674

2775

2876
@pytest.fixture
def missing_columns_mfp_file(tmp_path):
    """Excel file lacking the 'Longitude' column."""
    xlsx_path = tmp_path / "file.xlsx"
    trimmed = valid_mfp_data().drop(columns=["Longitude"])
    trimmed.to_excel(xlsx_path, index=False)
    return xlsx_path
3581

3682

3783
@pytest.fixture
@@ -43,8 +89,14 @@ def unexpected_header_mfp_file(tmp_path):
4389
yield path
4490

4591

46-
def test_mfp_to_yaml_success(valid_mfp_file, tmp_path):
47-
"""Test that mfp_to_yaml correctly processes a valid MFP Excel file."""
92+
@pytest.mark.parametrize(
93+
"fixture_name",
94+
["valid_excel_mfp_file", "valid_csv_mfp_file", "valid_csv_mfp_file_with_commas"],
95+
)
96+
def test_mfp_to_yaml_success(request, fixture_name, tmp_path):
97+
"""Test that mfp_to_yaml correctly processes a valid MFP file."""
98+
valid_mfp_file = request.getfixturevalue(fixture_name)
99+
48100
yaml_output_path = tmp_path / "schedule.yaml"
49101

50102
# Run function (No need to mock open() for YAML, real file is created)
@@ -66,15 +118,52 @@ def test_mfp_to_yaml_success(valid_mfp_file, tmp_path):
66118
]
67119

68120

69-
def test_mfp_to_yaml_missing_headers(missing_columns_mfp_file, tmp_path):
70-
"""Test that mfp_to_yaml raises an error when required columns are missing."""
121+
# Each case: (fixture that builds the bad input, expected exception type,
# message or regex that pytest.raises must match).
@pytest.mark.parametrize(
    "fixture_name,error,match",
    [
        pytest.param(
            "nonexistent_mfp_file",
            FileNotFoundError,
            # Plain literal: os.path.basename("/non_file.csv") was a constant no-op call.
            "non_file.csv",
            id="FileNotFound",
        ),
        pytest.param(
            "unsupported_extension_mfp_file",
            RuntimeError,
            "Could not read coordinates data from the provided file. Ensure it is either a csv or excel file.",
            id="UnsupportedExtension",
        ),
        pytest.param(
            "invalid_mfp_file",
            RuntimeError,
            "Could not read coordinates data from the provided file. Ensure it is either a csv or excel file.",
            id="InvalidFile",
        ),
        pytest.param(
            "missing_instruments_column_mfp_file",
            ValueError,
            "Error: Missing column 'Instrument'. Have you added this column after exporting from MFP?",
            id="MissingInstruments",
        ),
        pytest.param(
            "missing_columns_mfp_file",
            ValueError,
            (
                r"Error: Found columns \[.*?('Station Type'| 'Name'| 'Latitude'| 'Instrument').*?\], "
                r"but expected columns \[.*?('Station Type'| 'Name'| 'Latitude'| 'Instrument'| 'Longitude').*?\]."
            ),
            id="MissingColumns",
        ),
    ],
)
def test_mfp_to_yaml_exceptions(request, fixture_name, error, match, tmp_path):
    """mfp_to_yaml must fail with the right exception type and message for
    every category of invalid input file.

    The fixture is resolved lazily by name via request.getfixturevalue so
    each case builds only its own input file.
    """
    input_file = request.getfixturevalue(fixture_name)
    yaml_output_path = tmp_path / "schedule.yaml"

    with pytest.raises(error, match=match):
        mfp_to_yaml(input_file, yaml_output_path)
78167

79168

80169
def test_mfp_to_yaml_extra_headers(unexpected_header_mfp_file, tmp_path):
@@ -85,12 +174,12 @@ def test_mfp_to_yaml_extra_headers(unexpected_header_mfp_file, tmp_path):
85174
mfp_to_yaml(unexpected_header_mfp_file, yaml_output_path)
86175

87176

88-
def test_mfp_to_yaml_instrument_conversion(valid_mfp_file, tmp_path):
177+
def test_mfp_to_yaml_instrument_conversion(valid_excel_mfp_file, tmp_path):
89178
"""Test that instruments are correctly converted into InstrumentType enums."""
90179
yaml_output_path = tmp_path / "schedule.yaml"
91180

92181
# Run function
93-
mfp_to_yaml(valid_mfp_file, yaml_output_path)
182+
mfp_to_yaml(valid_excel_mfp_file, yaml_output_path)
94183

95184
# Load the generated YAML
96185
data = Schedule.from_yaml(yaml_output_path)

0 commit comments

Comments
 (0)