diff --git a/docs/user_guide/01_Reading_data.md b/docs/user_guide/01_Reading_data.md index 71651eeca..e0732cbd1 100644 --- a/docs/user_guide/01_Reading_data.md +++ b/docs/user_guide/01_Reading_data.md @@ -139,27 +139,36 @@ shp = utilities.read_file(input="/path/to/boxes_shapefile.shp") shp.head() ``` -If your shapefile does not include an `image_path` column, you must provide the raster path via `img_path`: +##### Reading Shapefiles Without `image_path` or `label` Columns + +Many GIS shapefiles do not include `image_path` or `label` columns. You can provide these values directly to `read_file`: ```python from deepforest import utilities +# Shapefile doesn't have image_path or label columns shp = utilities.read_file( - input="/path/to/boxes_shapefile.shp", - image_path="/path/to/OSBS_029.tif" + input="/path/to/annotations.shp", + image_path="my_raster.tif", # Required if shapefile has no image_path column + label="Tree", # Optional, defaults to "Unknown" + root_dir="/path/to/images/" # Required when using image_path argument ) ``` -If your shapefile also lacks a `label` column, you can assign one for all rows: +**Arguments:** -```python -from deepforest import utilities +| Argument | Required? | Description | +|----------|-----------|-------------| +| `image_path` | **Required** if shapefile lacks `image_path` column | The image file path (relative to `root_dir`) that all annotations belong to | +| `label` | Optional | The label for all annotations. Defaults to `"Unknown"` if not provided | +| `root_dir` | **Required** when using `image_path` argument | Directory where image files are located | -shp = utilities.read_file( - input="/path/to/boxes_shapefile.shp", - image_path="/path/to/OSBS_029.tif", - label="Tree" -) +This assigns the same `image_path` and `label` to all annotations in the file. Use this when all annotations belong to a single image and share the same label. + +**Note:** A warning will be shown when `image_path` is provided but the shapefile doesn't have an `image_path` column: + +``` +UserWarning: You have passed an image_path argument, but the shapefile does not contain an image_path column. All annotations will be assigned to my_raster.tif. Make sure all annotations in the shapefile relate to this image. ``` Example output: diff --git a/src/deepforest/utilities.py b/src/deepforest/utilities.py index f467bbc6c..0cf97970e 100644 --- a/src/deepforest/utilities.py +++ b/src/deepforest/utilities.py @@ -260,6 +260,13 @@ def __assign_image_path__(gdf, image_path: str) -> str: ) gdf["image_path"] = image_path else: + warnings.warn( + f"You have passed an image_path argument, but the shapefile does not contain an image_path column. " + f"All annotations will be assigned to {image_path}. " + f"Make sure all annotations in the shapefile relate to this image.", + UserWarning, + stacklevel=2, + ) gdf["image_path"] = image_path return gdf @@ -498,9 +505,8 @@ def __check_and_assign_label__( ): if label is None: if "label" not in df.columns: - raise ValueError( - "No label specified and no label column found in dataframe, please specify label in label argument: read_file(input=df, label='YourLabel', ...)" - ) + # Default to "Unknown" if label is not provided and not in dataframe + df["label"] = "Unknown" else: if "label" in df.columns: existing_labels = df.label.unique() @@ -514,6 +520,7 @@ def __check_and_assign_label__( f"Label {existing_labels[0]} found in dataframe, overriding and assigning {label} to all rows!", stacklevel=2, ) + df["label"] = label else: df["label"] = label diff --git a/tests/test_utilities.py b/tests/test_utilities.py index 0be8a665b..94bb9dc73 100644 --- a/tests/test_utilities.py +++ b/tests/test_utilities.py @@ -770,3 +770,116 @@ def test_read_file_column_names(): assert "image_path" in result.columns assert "label" in result.columns assert hasattr(result, "root_dir") + + +@pytest.fixture +def sample_shapefile_gdf(): + """Create a sample GeoDataFrame for shapefile testing.""" + sample_geometry = [geometry.Point(404211.9 + 10, 3285102 + 20), + geometry.Point(404211.9 + 20, 3285102 + 20)] + df = pd.DataFrame({"geometry": sample_geometry}) + gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:32617") + gdf["geometry"] = [geometry.box(left, bottom, right, top) for left, bottom, right, top + in gdf.geometry.buffer(0.5).bounds.values] + return gdf + + +def test_read_file_shapefile_with_image_path_argument(tmp_path, sample_shapefile_gdf): + """Test reading a shapefile without image_path column by passing image_path argument. + + This tests the fix for issue #997. + """ + # Create shapefile without image_path column + gdf = sample_shapefile_gdf.copy() + gdf["label"] = "Tree" + shp_path = tmp_path / "no_image_path.shp" + gdf.to_file(str(shp_path)) + + # Read with image_path argument + image_path_full = get_data("OSBS_029.tif") + image_path = os.path.basename(image_path_full) + root_dir = os.path.dirname(image_path_full) + + with pytest.warns(UserWarning, match="You have passed an image_path argument"): + result = utilities.read_file( + input=str(shp_path), + image_path=image_path, + root_dir=root_dir + ) + + assert result.shape[0] == 2 + assert "image_path" in result.columns + assert result["image_path"].iloc[0] == image_path + + +def test_read_file_shapefile_with_label_argument(tmp_path, sample_shapefile_gdf): + """Test reading a shapefile without label column by passing label argument. + + This tests the fix for issue #997. + """ + # Create shapefile without label column + gdf = sample_shapefile_gdf.copy() + image_path_full = get_data("OSBS_029.tif") + gdf["image_path"] = os.path.basename(image_path_full) + shp_path = tmp_path / "no_label.shp" + gdf.to_file(str(shp_path)) + + # Read with label argument - no warning expected when shapefile has no label column + root_dir = os.path.dirname(image_path_full) + result = utilities.read_file( + input=str(shp_path), + label="CustomTree", + root_dir=root_dir + ) + + assert result.shape[0] == 2 + assert "label" in result.columns + assert result["label"].iloc[0] == "CustomTree" + + +def test_read_file_shapefile_with_image_path_and_label_arguments(tmp_path, sample_shapefile_gdf): + """Test reading a shapefile without image_path and label columns. + + This tests the fix for issue #997 where users can pass both arguments. + """ + # Create shapefile without image_path and label columns + gdf = sample_shapefile_gdf.copy() + shp_path = tmp_path / "no_image_path_no_label.shp" + gdf.to_file(str(shp_path)) + + # Read with both image_path and label arguments + image_path_full = get_data("OSBS_029.tif") + image_path = os.path.basename(image_path_full) + root_dir = os.path.dirname(image_path_full) + + with pytest.warns(UserWarning, match="You have passed an image_path argument"): + result = utilities.read_file( + input=str(shp_path), + image_path=image_path, + label="Tree", + root_dir=root_dir + ) + + assert result.shape[0] == 2 + assert "image_path" in result.columns + assert "label" in result.columns + assert result["image_path"].iloc[0] == image_path + assert result["label"].iloc[0] == "Tree" + + +def test_read_file_shapefile_without_image_path_raises_error(tmp_path): + """Test that reading a shapefile without image_path column raises an error. + + This documents the expected behavior when no image_path is provided. + """ + # Create a simple shapefile without image_path column + sample_geometry = [geometry.Point(10, 20), geometry.Point(20, 40)] + labels = ["Tree", "Tree"] + df = pd.DataFrame({"geometry": sample_geometry, "label": labels}) + gdf = gpd.GeoDataFrame(df, geometry="geometry") + shp_path = tmp_path / "no_image_path.shp" + gdf.to_file(str(shp_path)) + + # Should raise ValueError when image_path is not provided + with pytest.raises(ValueError, match="No image_path column found"): + utilities.read_file(input=str(shp_path))