From a76cd12c8f31934f0b4632d88cb4ae2dc3ecff49 Mon Sep 17 00:00:00 2001 From: Gaurav Sharma Date: Fri, 21 Nov 2025 16:42:52 +0530 Subject: [PATCH 1/4] FIX: Decode Raw UTF-16 data from Conn.getinfo() --- mssql_python/connection.py | 28 ++++++++++----------- mssql_python/pybind/build.bat | 47 +++++++++++++++++++++-------------- tests/test_003_connection.py | 27 ++++++++++++++++++++ 3 files changed, 69 insertions(+), 33 deletions(-) diff --git a/mssql_python/connection.py b/mssql_python/connection.py index 2082f530..8954a146 100644 --- a/mssql_python/connection.py +++ b/mssql_python/connection.py @@ -1198,21 +1198,21 @@ def getinfo(self, info_type: int) -> Union[str, int, bool, None]: # Make sure we use the correct amount of data based on length actual_data = data[:length] - # Now decode the string data - try: - return actual_data.decode("utf-8").rstrip("\0") - except UnicodeDecodeError: + # SQLGetInfoW returns UTF-16LE encoded strings + # Try encodings in order: UTF-16LE (Windows), UTF-8, Latin-1 + for encoding in ("utf-16-le", "utf-8", "latin1"): try: - return actual_data.decode("latin1").rstrip("\0") - except Exception as e: - logger.debug( - "error", - "Failed to decode string in getinfo: %s. " - "Returning None to avoid silent corruption.", - e, - ) - # Explicitly return None to signal decoding failure - return None + return actual_data.decode(encoding).rstrip("\0") + except UnicodeDecodeError: + continue + + # All decodings failed + logger.debug( + "error", + "Failed to decode string in getinfo with any supported encoding. " + "Returning None to avoid silent corruption.", + ) + return None else: # If it's not bytes, return as is return data diff --git a/mssql_python/pybind/build.bat b/mssql_python/pybind/build.bat index 90241c05..715c0532 100644 --- a/mssql_python/pybind/build.bat +++ b/mssql_python/pybind/build.bat @@ -1,10 +1,30 @@ @echo off setlocal enabledelayedexpansion -REM Usage: build.bat [ARCH], If ARCH is not specified, it defaults to x64. +REM Usage: build.bat [ARCH] [MODE], If ARCH is not specified, it defaults to x64. +REM MODE can be 'profile' or '--profile' to enable profiling instrumentation set ARCH=%1 if "%ARCH%"=="" set ARCH=x64 + +REM Check for profiling mode +set PROFILING_MODE=0 +if /i "%2"=="profile" set PROFILING_MODE=1 +if /i "%2"=="--profile" set PROFILING_MODE=1 +if /i "%1"=="profile" ( + set PROFILING_MODE=1 + set ARCH=x64 +) +if /i "%1"=="--profile" ( + set PROFILING_MODE=1 + set ARCH=x64 +) + echo [DIAGNOSTIC] Target Architecture set to: %ARCH% +if %PROFILING_MODE%==1 ( + echo [MODE] C++ Profiling: ENABLED +) else ( + echo [MODE] C++ Profiling: DISABLED +) REM Clean up main build directory if it exists echo Checking for main build directory... @@ -109,8 +129,13 @@ if errorlevel 1 ( ) REM Now invoke CMake with correct source path (options first, path last!) -echo [DIAGNOSTIC] Running CMake configure with: cmake -A %PLATFORM_NAME% -DARCHITECTURE=%ARCH% "%SOURCE_DIR:~0,-1%" -cmake -A %PLATFORM_NAME% -DARCHITECTURE=%ARCH% "%SOURCE_DIR:~0,-1%" +if %PROFILING_MODE%==1 ( + echo [DIAGNOSTIC] Running CMake configure with profiling: cmake -A %PLATFORM_NAME% -DARCHITECTURE=%ARCH% -DCMAKE_CXX_FLAGS="/DENABLE_PROFILING" -DCMAKE_C_FLAGS="/DENABLE_PROFILING" "%SOURCE_DIR:~0,-1%" + cmake -A %PLATFORM_NAME% -DARCHITECTURE=%ARCH% -DCMAKE_CXX_FLAGS="/DENABLE_PROFILING" -DCMAKE_C_FLAGS="/DENABLE_PROFILING" "%SOURCE_DIR:~0,-1%" +) else ( + echo [DIAGNOSTIC] Running CMake configure with: cmake -A %PLATFORM_NAME% -DARCHITECTURE=%ARCH% "%SOURCE_DIR:~0,-1%" + cmake -A %PLATFORM_NAME% -DARCHITECTURE=%ARCH% "%SOURCE_DIR:~0,-1%" +) echo [DIAGNOSTIC] CMake configure exit code: %errorlevel% if errorlevel 1 ( echo [ERROR] CMake configuration failed @@ -157,22 +182,6 @@ if exist "%OUTPUT_DIR%\%PYD_NAME%" ( echo [WARNING] PDB file !PDB_NAME! not found in output directory. ) - setlocal enabledelayedexpansion - for %%I in ("%SOURCE_DIR%..") do ( - set PARENT_DIR=%%~fI - ) - echo [DIAGNOSTIC] Parent is: !PARENT_DIR! - - set VCREDIST_DLL_PATH=!PARENT_DIR!\libs\windows\!ARCH!\vcredist\msvcp140.dll - echo [DIAGNOSTIC] Looking for msvcp140.dll at "!VCREDIST_DLL_PATH!" - - if exist "!VCREDIST_DLL_PATH!" ( - copy /Y "!VCREDIST_DLL_PATH!" "%SOURCE_DIR%\.." - echo [SUCCESS] Copied msvcp140.dll from !VCREDIST_DLL_PATH! to "%SOURCE_DIR%\.." - ) else ( - echo [ERROR] Could not find msvcp140.dll at "!VCREDIST_DLL_PATH!" - exit /b 1 - ) ) else ( echo [ERROR] Could not find built .pyd file: %PYD_NAME% REM Exit with an error code here if the .pyd file is not found diff --git a/tests/test_003_connection.py b/tests/test_003_connection.py index 8db506bc..6b225833 100644 --- a/tests/test_003_connection.py +++ b/tests/test_003_connection.py @@ -5471,6 +5471,33 @@ def test_getinfo_basic_driver_info(db_connection): pytest.fail(f"getinfo failed for basic driver info: {e}") +def test_getinfo_string_encoding_utf16(db_connection): + """Test that string values from getinfo are properly decoded from UTF-16.""" + + # Test string info types that should not contain null bytes + string_info_types = [ + ("SQL_DRIVER_VER", sql_const.SQL_DRIVER_VER.value), + ("SQL_DRIVER_NAME", sql_const.SQL_DRIVER_NAME.value), + ("SQL_DRIVER_ODBC_VER", sql_const.SQL_DRIVER_ODBC_VER.value), + ("SQL_SERVER_NAME", sql_const.SQL_SERVER_NAME.value), + ] + + for name, info_type in string_info_types: + result = db_connection.getinfo(info_type) + + if result is not None: + # Verify it's a string + assert isinstance(result, str), \ + f"{name}: Expected str, got {type(result).__name__}" + + # Verify no null bytes (indicates UTF-16 decoded as UTF-8 bug) + assert '\x00' not in result, \ + f"{name} contains null bytes, likely UTF-16/UTF-8 encoding mismatch: {repr(result)}" + + # Verify it's not empty (optional, but good sanity check) + assert len(result) > 0, f"{name} returned empty string" + + def test_getinfo_sql_support(db_connection): """Test SQL support and conformance info types.""" From 706e33135411f336057165ceeee161d4b0a19e32 Mon Sep 17 00:00:00 2001 From: Gaurav Sharma Date: Fri, 21 Nov 2025 17:10:41 +0530 Subject: [PATCH 2/4] FIX: Decode Raw UTF-16 data from Conn.getinfo() --- mssql_python/pybind/build.bat | 31 +++---------------------------- 1 file changed, 3 insertions(+), 28 deletions(-) diff --git a/mssql_python/pybind/build.bat b/mssql_python/pybind/build.bat index 715c0532..f264f686 100644 --- a/mssql_python/pybind/build.bat +++ b/mssql_python/pybind/build.bat @@ -1,30 +1,10 @@ @echo off setlocal enabledelayedexpansion -REM Usage: build.bat [ARCH] [MODE], If ARCH is not specified, it defaults to x64. -REM MODE can be 'profile' or '--profile' to enable profiling instrumentation +REM Usage: build.bat [ARCH], If ARCH is not specified, it defaults to x64. set ARCH=%1 if "%ARCH%"=="" set ARCH=x64 - -REM Check for profiling mode -set PROFILING_MODE=0 -if /i "%2"=="profile" set PROFILING_MODE=1 -if /i "%2"=="--profile" set PROFILING_MODE=1 -if /i "%1"=="profile" ( - set PROFILING_MODE=1 - set ARCH=x64 -) -if /i "%1"=="--profile" ( - set PROFILING_MODE=1 - set ARCH=x64 -) - echo [DIAGNOSTIC] Target Architecture set to: %ARCH% -if %PROFILING_MODE%==1 ( - echo [MODE] C++ Profiling: ENABLED -) else ( - echo [MODE] C++ Profiling: DISABLED -) REM Clean up main build directory if it exists echo Checking for main build directory... @@ -129,13 +109,8 @@ if errorlevel 1 ( ) REM Now invoke CMake with correct source path (options first, path last!) -if %PROFILING_MODE%==1 ( - echo [DIAGNOSTIC] Running CMake configure with profiling: cmake -A %PLATFORM_NAME% -DARCHITECTURE=%ARCH% -DCMAKE_CXX_FLAGS="/DENABLE_PROFILING" -DCMAKE_C_FLAGS="/DENABLE_PROFILING" "%SOURCE_DIR:~0,-1%" - cmake -A %PLATFORM_NAME% -DARCHITECTURE=%ARCH% -DCMAKE_CXX_FLAGS="/DENABLE_PROFILING" -DCMAKE_C_FLAGS="/DENABLE_PROFILING" "%SOURCE_DIR:~0,-1%" -) else ( - echo [DIAGNOSTIC] Running CMake configure with: cmake -A %PLATFORM_NAME% -DARCHITECTURE=%ARCH% "%SOURCE_DIR:~0,-1%" - cmake -A %PLATFORM_NAME% -DARCHITECTURE=%ARCH% "%SOURCE_DIR:~0,-1%" -) +echo [DIAGNOSTIC] Running CMake configure with: cmake -A %PLATFORM_NAME% -DARCHITECTURE=%ARCH% "%SOURCE_DIR:~0,-1%" +cmake -A %PLATFORM_NAME% -DARCHITECTURE=%ARCH% "%SOURCE_DIR:~0,-1%" echo [DIAGNOSTIC] CMake configure exit code: %errorlevel% if errorlevel 1 ( echo [ERROR] CMake configuration failed From bf675a71be3d0655bb39d711d65f08c80cf0e916 Mon Sep 17 00:00:00 2001 From: Gaurav Sharma Date: Thu, 11 Dec 2025 12:04:11 +0000 Subject: [PATCH 3/4] fix linting --- mssql_python/connection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mssql_python/connection.py b/mssql_python/connection.py index 30abae0f..a104ca10 100644 --- a/mssql_python/connection.py +++ b/mssql_python/connection.py @@ -1192,7 +1192,7 @@ def getinfo(self, info_type: int) -> Union[str, int, bool, None]: return actual_data.decode(encoding).rstrip("\0") except UnicodeDecodeError: continue - + # All decodings failed logger.debug( "error", From 48e669a065ad72b8d863ecb90ec8c9eeb6e0c326 Mon Sep 17 00:00:00 2001 From: Gaurav Sharma Date: Thu, 11 Dec 2025 12:15:20 +0000 Subject: [PATCH 4/4] more tests and remove latin1 --- mssql_python/connection.py | 9 ++-- tests/test_003_connection.py | 90 ++++++++++++++++++++++++++++++++---- 2 files changed, 86 insertions(+), 13 deletions(-) diff --git a/mssql_python/connection.py b/mssql_python/connection.py index a104ca10..38df2a98 100644 --- a/mssql_python/connection.py +++ b/mssql_python/connection.py @@ -1185,9 +1185,9 @@ def getinfo(self, info_type: int) -> Union[str, int, bool, None]: # Make sure we use the correct amount of data based on length actual_data = data[:length] - # SQLGetInfoW returns UTF-16LE encoded strings - # Try encodings in order: UTF-16LE (Windows), UTF-8, Latin-1 - for encoding in ("utf-16-le", "utf-8", "latin1"): + # SQLGetInfoW returns UTF-16LE encoded strings (wide-character ODBC API) + # Try UTF-16LE first (expected), then UTF-8 as fallback + for encoding in ("utf-16-le", "utf-8"): try: return actual_data.decode(encoding).rstrip("\0") except UnicodeDecodeError: @@ -1196,8 +1196,9 @@ def getinfo(self, info_type: int) -> Union[str, int, bool, None]: # All decodings failed logger.debug( "error", - "Failed to decode string in getinfo with any supported encoding. " + "Failed to decode string in getinfo (info_type=%d) with supported encodings. " "Returning None to avoid silent corruption.", + info_type, ) return None else: diff --git a/tests/test_003_connection.py b/tests/test_003_connection.py index 88a16ee0..f9918c25 100644 --- a/tests/test_003_connection.py +++ b/tests/test_003_connection.py @@ -5187,7 +5187,7 @@ def test_getinfo_basic_driver_info(db_connection): def test_getinfo_string_encoding_utf16(db_connection): """Test that string values from getinfo are properly decoded from UTF-16.""" - + # Test string info types that should not contain null bytes string_info_types = [ ("SQL_DRIVER_VER", sql_const.SQL_DRIVER_VER.value), @@ -5195,23 +5195,95 @@ def test_getinfo_string_encoding_utf16(db_connection): ("SQL_DRIVER_ODBC_VER", sql_const.SQL_DRIVER_ODBC_VER.value), ("SQL_SERVER_NAME", sql_const.SQL_SERVER_NAME.value), ] - + for name, info_type in string_info_types: result = db_connection.getinfo(info_type) - + if result is not None: # Verify it's a string - assert isinstance(result, str), \ - f"{name}: Expected str, got {type(result).__name__}" - + assert isinstance(result, str), f"{name}: Expected str, got {type(result).__name__}" + # Verify no null bytes (indicates UTF-16 decoded as UTF-8 bug) - assert '\x00' not in result, \ - f"{name} contains null bytes, likely UTF-16/UTF-8 encoding mismatch: {repr(result)}" - + assert ( + "\x00" not in result + ), f"{name} contains null bytes, likely UTF-16/UTF-8 encoding mismatch: {repr(result)}" + # Verify it's not empty (optional, but good sanity check) assert len(result) > 0, f"{name} returned empty string" +def test_getinfo_string_decoding_utf8_fallback(db_connection): + """Test that getinfo falls back to UTF-8 when UTF-16LE decoding fails. + + This test verifies the fallback path in the encoding loop where + UTF-16LE fails but UTF-8 succeeds. + """ + from unittest.mock import patch + + # UTF-8 encoded "Hello" - this is valid UTF-8 but NOT valid UTF-16LE + # (odd number of bytes would fail UTF-16LE decode) + utf8_data = "Hello".encode("utf-8") # b'Hello' - 5 bytes, odd length + + mock_result = {"data": utf8_data, "length": len(utf8_data)} + + # Use a string-type info_type (SQL_DRIVER_NAME = 6 is in string_type_constants) + info_type = sql_const.SQL_DRIVER_NAME.value + + with patch.object(db_connection._conn, "get_info", return_value=mock_result): + result = db_connection.getinfo(info_type) + + assert result == "Hello", f"Expected 'Hello', got {repr(result)}" + assert isinstance(result, str), f"Expected str, got {type(result).__name__}" + + +def test_getinfo_string_decoding_all_fail_returns_none(db_connection): + """Test that getinfo returns None when all decoding attempts fail. + + This test verifies that when both UTF-16LE and UTF-8 decoding fail, + the method returns None to avoid silent data corruption. + """ + from unittest.mock import patch + + # Invalid byte sequence that cannot be decoded as UTF-16LE or UTF-8 + # 0xFF 0xFE is a BOM, but followed by invalid continuation bytes for UTF-8 + # and odd length makes it invalid UTF-16LE + invalid_data = bytes([0x80, 0x81, 0x82]) # Invalid for both encodings + + mock_result = {"data": invalid_data, "length": len(invalid_data)} + + # Use a string-type info_type (SQL_DRIVER_NAME = 6 is in string_type_constants) + info_type = sql_const.SQL_DRIVER_NAME.value + + with patch.object(db_connection._conn, "get_info", return_value=mock_result): + result = db_connection.getinfo(info_type) + + # Should return None when all decoding fails + assert result is None, f"Expected None for invalid encoding, got {repr(result)}" + + +def test_getinfo_string_encoding_utf16_primary(db_connection): + """Test that getinfo correctly decodes valid UTF-16LE data. + + This test verifies the primary (expected) encoding path where + UTF-16LE decoding succeeds on first try. + """ + from unittest.mock import patch + + # Valid UTF-16LE encoded "Test" with null terminator + utf16_data = "Test".encode("utf-16-le") + b"\x00\x00" + + mock_result = {"data": utf16_data, "length": len(utf16_data)} + + # Use a string-type info_type + info_type = sql_const.SQL_DRIVER_NAME.value + + with patch.object(db_connection._conn, "get_info", return_value=mock_result): + result = db_connection.getinfo(info_type) + + assert result == "Test", f"Expected 'Test', got {repr(result)}" + assert "\x00" not in result, f"Result contains null bytes: {repr(result)}" + + def test_getinfo_sql_support(db_connection): """Test SQL support and conformance info types."""