From 8ad5b6cc3edfdae69329fe1403ed6dd2c9044d1e Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Mon, 13 Oct 2025 14:06:30 +0200
Subject: [PATCH] Fix #502: Support Unicode characters in file paths using
 os.fsencode()

---
 src/blosc2/blosc2_ext.pyx   |  16 ++-
 tests/test_unicode_paths.py | 216 ++++++++++++++++++++++++++++++++++++
 2 files changed, 229 insertions(+), 3 deletions(-)
 create mode 100644 tests/test_unicode_paths.py

diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx
index ba74b5df..36e3297c 100644
--- a/src/blosc2/blosc2_ext.pyx
+++ b/src/blosc2/blosc2_ext.pyx
@@ -1000,7 +1000,8 @@ cdef class SChunk:
         if urlpath is not None:
             if isinstance(urlpath, pathlib.PurePath):
                 urlpath = str(urlpath)
-            self._urlpath = urlpath.encode() if isinstance(urlpath, str) else urlpath
+            # Use os.fsencode for proper platform-specific path encoding
+            self._urlpath = os.fsencode(urlpath) if isinstance(urlpath, str) else urlpath
             kwargs["urlpath"] = self._urlpath
 
         self.mode = blosc2.Storage().mode if kwargs.get("mode", None) is None else kwargs.get("mode")
@@ -1856,6 +1857,9 @@ cdef int general_prefilter(blosc2_prefilter_params *params):
 
 
 def remove_urlpath(path):
+    # Encode str and path-like inputs with the filesystem encoding; bytes pass through unchanged
+    if isinstance(path, (str, os.PathLike)):
+        path = os.fsencode(path)
     blosc2_remove_urlpath(path)
 
 
@@ -1972,7 +1976,12 @@ def meta_keys(self):
 
 
 def open(urlpath, mode, offset, **kwargs):
-    urlpath_ = urlpath.encode("utf-8") if isinstance(urlpath, str) else urlpath
+    # Use os.fsencode so str paths are encoded with the filesystem encoding
+    # (UTF-8 on Windows by default since Python 3.6; locale-dependent on POSIX)
+    if isinstance(urlpath, str):
+        urlpath_ = os.fsencode(urlpath)
+    else:
+        urlpath_ = urlpath
     cdef blosc2_schunk* schunk
     cdef blosc2_stdio_mmap* mmap_file
     cdef blosc2_io* io
@@ -2731,7 +2740,8 @@ cdef b2nd_context_t* create_b2nd_context(shape, chunks, blocks, dtype, kwargs):
     if urlpath is not None:
         if isinstance(urlpath, pathlib.PurePath):
             urlpath = str(urlpath)
-        _urlpath = urlpath.encode() if isinstance(urlpath, str) else urlpath
+        # Use os.fsencode for proper platform-specific path encoding
+        _urlpath = os.fsencode(urlpath) if isinstance(urlpath, str) else urlpath
         kwargs["urlpath"] = _urlpath
 
     if kwargs.get("mmap_mode") is not None:
diff --git a/tests/test_unicode_paths.py b/tests/test_unicode_paths.py
new file mode 100644
index 00000000..3a13848b
--- /dev/null
+++ b/tests/test_unicode_paths.py
@@ -0,0 +1,216 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
+# All rights reserved.
+#
+# This source code is licensed under a BSD-style license (found in the
+# LICENSE file in the root directory of this source tree)
+#######################################################################
+
+"""
+Tests for Unicode path support (Issue #502)
+
+This module tests that blosc2 can properly handle file paths containing
+non-ASCII characters (e.g., Chinese, emoji, accented characters).
+"""
+
+import os
+import sys
+
+import numpy as np
+import pytest
+
+import blosc2
+
+# File names spanning several scripts; used to parametrize the SChunk and NDArray tests
+UNICODE_FILENAMES = [
+    "test_ascii.b2nd",  # Baseline: ASCII
+    "test_测试.b2nd",  # Chinese characters
+    "test_тест.b2nd",  # Cyrillic characters
+    "test_δοκιμή.b2nd",  # Greek characters
+    "test_テスト.b2nd",  # Japanese characters
+    "test_café.b2nd",  # Accented characters
+    "test_🎉.b2nd",  # Emoji (the tests using this list skip it on Windows)
+]
+
+
+@pytest.mark.parametrize("filename", UNICODE_FILENAMES)
+def test_unicode_path_schunk(tmp_path, filename):
+    """Round-trip an on-disk SChunk whose file name contains Unicode characters."""
+    # The emoji case is not exercised on Windows
+    on_windows = sys.platform == "win32"
+    if on_windows and "🎉" in filename:
+        pytest.skip("Emoji paths may not work on Windows C runtime")
+
+    urlpath = str(tmp_path / filename)
+    payload = b"Hello, World! " * 100
+
+    # Write the payload to the Unicode path with contiguous=True
+    written = blosc2.SChunk(
+        chunksize=len(payload) // 4,
+        data=payload,
+        urlpath=urlpath,
+        mode="w",
+        contiguous=True,
+    )
+
+    # The file must now exist on disk
+    assert os.path.exists(urlpath), f"File not created: {urlpath}"
+
+    # Re-open it from the same path and compare against what was written
+    reopened = blosc2.open(urlpath)
+
+    assert written.nchunks == reopened.nchunks
+    assert written[:] == reopened[:]
+
+
+@pytest.mark.parametrize("filename", UNICODE_FILENAMES)
+def test_unicode_path_ndarray(tmp_path, filename):
+    """Round-trip an on-disk NDArray whose file name contains Unicode characters."""
+    # The emoji case is not exercised on Windows
+    on_windows = sys.platform == "win32"
+    if on_windows and "🎉" in filename:
+        pytest.skip("Emoji paths may not work on Windows C runtime")
+
+    urlpath = str(tmp_path / filename)
+    values = np.arange(1000, dtype=np.int32)
+
+    # Persist the values at the Unicode path
+    written = blosc2.asarray(values, urlpath=urlpath, mode="w")
+
+    # The file must now exist on disk
+    assert os.path.exists(urlpath), f"File not created: {urlpath}"
+
+    # Re-open from the same path; shape and contents must match what was written
+    reopened = blosc2.open(urlpath)
+
+    assert written.shape == reopened.shape
+    assert np.array_equal(written[:], reopened[:])
+
+
+@pytest.mark.parametrize(
+    "dirname",
+    [
+        "测试目录",  # Chinese
+        "тест_папка",  # Cyrillic
+        "café_dir",  # Latin with an accent
+    ],
+)
+def test_unicode_directory_path(tmp_path, dirname):
+    """Round-trip an NDArray stored inside a directory with a Unicode name."""
+    parent = tmp_path / dirname
+    parent.mkdir()
+    urlpath = str(parent / "test_array.b2nd")
+
+    values = np.arange(100, dtype=np.int32)
+
+    # The file name is ASCII; the Unicode part of the path is the parent directory
+    written = blosc2.asarray(values, urlpath=urlpath, mode="w")
+
+    # Re-open through the same path
+    reopened = blosc2.open(urlpath)
+
+    # Contents must match what was written
+    assert np.array_equal(written[:], reopened[:])
+
+
+def test_unicode_path_remove(tmp_path):
+    """Check that blosc2.remove_urlpath deletes a file with a Unicode name."""
+    urlpath = str(tmp_path / "测试_remove.b2nd")
+    values = np.arange(100, dtype=np.int32)
+
+    # Put something on disk so there is a file to delete
+    blosc2.asarray(values, urlpath=urlpath, mode="w")
+
+    # Precondition: the file is there
+    present_before = os.path.exists(urlpath)
+    assert present_before
+
+    # Delete through blosc2 rather than through os
+    blosc2.remove_urlpath(urlpath)
+
+    # Postcondition: the file is gone
+    assert not os.path.exists(urlpath)
+
+
+def test_unicode_path_modes(tmp_path):
+    """Write with mode "w", then re-open with modes "r" and "a", at a Unicode path."""
+    urlpath = str(tmp_path / "测试_modes.b2nd")
+    expected = np.arange(100, dtype=np.int32)
+
+    # mode="w": create the file
+    created = blosc2.asarray(expected, urlpath=urlpath, mode="w")
+    # Drop the reference before the file is opened again
+    del created
+
+    # mode="r" is a read-only open; mode="a" is only read back here as well,
+    # nothing is appended or modified
+    for mode in ("r", "a"):
+        reopened = blosc2.open(urlpath, mode=mode)
+
+        # The stored values must read back unchanged in either mode
+        assert np.array_equal(reopened[:], expected)
+
+        del reopened
+
+
+@pytest.mark.parametrize("mmap_mode", ["r", "r+", "c"])
+def test_unicode_path_mmap(tmp_path, mmap_mode):
+    """Write an SChunk via mmap at a Unicode path and read it back with `mmap_mode`."""
+    if sys.platform == "win32" and mmap_mode == "c":
+        pytest.skip("Cannot test mmap_mode 'c' on Windows")
+
+    filename = "测试_mmap.b2frame"
+    urlpath = str(tmp_path / filename)
+
+    # Create the file through a memory map (mmap_mode="w+")
+    data = b"Test data " * 100
+    schunk = blosc2.SChunk(
+        chunksize=len(data) // 4,
+        data=data,
+        urlpath=urlpath,
+        mmap_mode="w+",
+        contiguous=True,
+    )
+    del schunk
+
+    # Re-open with the parametrized mmap_mode
+    schunk2 = blosc2.open(urlpath, mmap_mode=mmap_mode)
+
+    # The bytes read back must be exactly the bytes written, not merely non-empty
+    assert schunk2[:] == data
+
+
+def test_unicode_path_pathlib(tmp_path):
+    """Round-trip an NDArray addressed by a path object instead of a str."""
+    # No str() conversion here: the path object itself is handed to blosc2
+    urlpath = tmp_path / "测试_pathlib.b2nd"
+    values = np.arange(100, dtype=np.int32)
+
+    # Create the array, passing the path object as urlpath
+    written = blosc2.asarray(values, urlpath=urlpath, mode="w")
+
+    # Re-open, again passing the path object
+    reopened = blosc2.open(urlpath)
+
+    # Contents must match what was written
+    written_values = written[:]
+    assert np.array_equal(written_values, reopened[:])
+
+
+def test_unicode_path_mixed_characters(tmp_path):
+    """Round-trip an NDArray whose file name mixes several scripts."""
+    # Several scripts combined in one file name
+    urlpath = str(tmp_path / "测试_test_テスト_тест_δοκιμή.b2nd")
+    values = np.arange(100, dtype=np.int32)
+
+    # Persist the values at that path
+    written = blosc2.asarray(values, urlpath=urlpath, mode="w")
+
+    # Re-open from the same path; contents must match what was written
+    reopened = blosc2.open(urlpath)
+    assert np.array_equal(written[:], reopened[:])
+
+
+if __name__ == "__main__":
+    # Run this module's tests and propagate pytest's exit code to the shell
+    sys.exit(pytest.main([__file__, "-v"]))