microsoft · subrata-ms · Dec 10, 2025 · Dec 5, 2025 · Dec 8, 2025 · Dec 8, 2025
diff --git a/eng/pipelines/pr-validation-pipeline.yml b/eng/pipelines/pr-validation-pipeline.yml
@@ -1395,14 +1395,12 @@ jobs:
 
   - script: |
       # Create a Docker container for testing on x86_64
-      # TODO(AB#40901): Temporary pin to 3.22 due to msodbcsql ARM64 package arch mismatch
-      # Revert to alpine:latest once ODBC team releases fixed ARM64 package
       docker run -d --name test-container-alpine \
         --platform linux/amd64 \
         -v $(Build.SourcesDirectory):/workspace \
         -w /workspace \
         --network bridge \
-        alpine:3.22 \
+        alpine:latest \
         tail -f /dev/null
     displayName: 'Create Alpine x86_64 container'
 

diff --git a/mssql_python/pybind/ddbc_bindings.h b/mssql_python/pybind/ddbc_bindings.h
@@ -458,8 +458,99 @@ inline std::wstring Utf8ToWString(const std::string& str) {
         return {};
     return result;
 #else
-    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
-    return converter.from_bytes(str);
+    // Optimized UTF-8 to UTF-32 conversion (wstring on Unix)
+
+    // Lambda: decode one code point at data[i], advance i past it; malformed input yields U+FFFD
+    auto decodeUtf8 = [](const unsigned char* data, size_t& i, size_t len) -> wchar_t {
+        unsigned char byte = data[i];
+
+        // 1-byte sequence (ASCII): 0xxxxxxx
+        if (byte <= 0x7F) {
+            ++i;
+            return static_cast<wchar_t>(byte);
+        }
+        // 2-byte sequence: 110xxxxx 10xxxxxx (taken only if 1 more byte lies inside the buffer)
+        if ((byte & 0xE0) == 0xC0 && i + 1 < len) {
+            // Validate continuation byte has correct bit pattern (10xxxxxx)
+            if ((data[i + 1] & 0xC0) != 0x80) {
+                ++i;
+                return 0xFFFD;  // Invalid continuation byte: only the lead byte is consumed
+            }
+            uint32_t cp = ((static_cast<uint32_t>(byte & 0x1F) << 6) | (data[i + 1] & 0x3F));
+            // Reject overlong encodings (a 2-byte form must encode a value >= 0x80)
+            if (cp >= 0x80) {
+                i += 2;
+                return static_cast<wchar_t>(cp);
+            }
+            // Overlong encoding - invalid; consume one byte and report U+FFFD
+            ++i;
+            return 0xFFFD;
+        }
+        // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx (taken only if 2 more bytes are in range)
+        if ((byte & 0xF0) == 0xE0 && i + 2 < len) {
+            // Validate continuation bytes have correct bit pattern (10xxxxxx)
+            if ((data[i + 1] & 0xC0) != 0x80 || (data[i + 2] & 0xC0) != 0x80) {
+                ++i;
+                return 0xFFFD;  // Invalid continuation bytes: only the lead byte is consumed
+            }
+            uint32_t cp = ((static_cast<uint32_t>(byte & 0x0F) << 12) |
+                           ((data[i + 1] & 0x3F) << 6) | (data[i + 2] & 0x3F));
+            // Reject overlong encodings (must be >= 0x800) and surrogates (0xD800-0xDFFF)
+            if (cp >= 0x800 && (cp < 0xD800 || cp > 0xDFFF)) {
+                i += 3;
+                return static_cast<wchar_t>(cp);
+            }
+            // Overlong encoding or surrogate - invalid; consume one byte and report U+FFFD
+            ++i;
+            return 0xFFFD;
+        }
+        // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (needs 3 more bytes in range)
+        if ((byte & 0xF8) == 0xF0 && i + 3 < len) {
+            // Validate continuation bytes have correct bit pattern (10xxxxxx)
+            if ((data[i + 1] & 0xC0) != 0x80 || (data[i + 2] & 0xC0) != 0x80 ||
+                (data[i + 3] & 0xC0) != 0x80) {
+                ++i;
+                return 0xFFFD;  // Invalid continuation bytes: only the lead byte is consumed
+            }
+            uint32_t cp =
+                ((static_cast<uint32_t>(byte & 0x07) << 18) | ((data[i + 1] & 0x3F) << 12) |
+                 ((data[i + 2] & 0x3F) << 6) | (data[i + 3] & 0x3F));
+            // Reject overlong encodings (must be >= 0x10000) and values above max Unicode
+            if (cp >= 0x10000 && cp <= 0x10FFFF) {
+                i += 4;
+                return static_cast<wchar_t>(cp);
+            }
+            // Overlong encoding or out of range - invalid; consume one byte and report U+FFFD
+            ++i;
+            return 0xFFFD;
+        }
+        // No branch matched: stray continuation byte, bad lead byte, or truncated tail - skip byte
+        ++i;
+        return 0xFFFD;  // Unicode replacement character
+    };
+
+    std::wstring result;
+    result.reserve(str.size());  // Reserve assuming mostly ASCII
+
+    const unsigned char* data = reinterpret_cast<const unsigned char*>(str.data());
+    const size_t len = str.size();
+    size_t i = 0;
+
+    // Fast path for ASCII-only prefix (most common case)
+    while (i < len && data[i] <= 0x7F) {
+        result.push_back(static_cast<wchar_t>(data[i]));
+        ++i;
+    }
+
+    // Handle remaining multi-byte sequences
+    while (i < len) {
+        wchar_t wc = decodeUtf8(data, i, len);
+        // Always push the decoded character (including 0xFFFD replacement characters)
+        // This correctly handles both legitimate 0xFFFD in input and invalid sequences
+        result.push_back(wc);
+    }
+
+    return result;
 #endif
 }
 

diff --git a/mssql_python/pybind/unix_utils.cpp b/mssql_python/pybind/unix_utils.cpp
@@ -13,6 +13,11 @@
 #include <vector>
 
 #if defined(__APPLE__) || defined(__linux__)
+
+// Unicode validation constants: U+FFFD stands in for invalid input; 0x10FFFF is the max code point
+constexpr uint32_t kUnicodeReplacementChar = 0xFFFD;
+constexpr uint32_t kUnicodeMaxCodePoint = 0x10FFFF;
+
 // Constants for character encoding
 const char* kOdbcEncoding = "utf-16-le";  // ODBC uses UTF-16LE for SQLWCHAR
 const size_t kUcsLength = 2;              // SQLWCHAR is 2 bytes on all platforms
@@ -24,74 +29,113 @@ std::wstring SQLWCHARToWString(const SQLWCHAR* sqlwStr, size_t length = SQL_NTS)
         return std::wstring();
     }
 
+    // Lambda: count the SQLWCHAR units before the first zero terminator by walking a pointer
+    auto calculateLength = [](const SQLWCHAR* str) -> size_t {
+        const SQLWCHAR* p = str;
+        while (*p)
+            ++p;
+        return p - str;
+    };
+
     if (length == SQL_NTS) {
-        // Determine length if not provided
-        size_t i = 0;
-        while (sqlwStr[i] != 0)
-            ++i;
-        length = i;
+        length = calculateLength(sqlwStr);
     }
 
-    // Create a UTF-16LE byte array from the SQLWCHAR array
-    std::vector<char> utf16Bytes(length * kUcsLength);
-    for (size_t i = 0; i < length; ++i) {
-        // Copy each SQLWCHAR (2 bytes) to the byte array
-        memcpy(&utf16Bytes[i * kUcsLength], &sqlwStr[i], kUcsLength);
+    if (length == 0) {
+        return std::wstring();
     }
 
-    // Convert UTF-16LE to std::wstring (UTF-32 on macOS)
-    try {
-        // CRITICAL FIX: Use thread_local to make std::wstring_convert thread-safe
-        // std::wstring_convert is NOT thread-safe and its use is deprecated in C++17
-        // Each thread gets its own converter instance, eliminating race conditions
-        thread_local std::wstring_convert<
-            std::codecvt_utf8_utf16<wchar_t, 0x10ffff, std::little_endian>>
-            converter;
-
-        std::wstring result = converter.from_bytes(
-            reinterpret_cast<const char*>(utf16Bytes.data()),
-            reinterpret_cast<const char*>(utf16Bytes.data() + utf16Bytes.size()));
-        return result;
-    } catch (const std::exception& e) {
-        // Fallback to character-by-character conversion if codecvt fails
-        std::wstring result;
-        result.reserve(length);
-        for (size_t i = 0; i < length; ++i) {
-            result.push_back(static_cast<wchar_t>(sqlwStr[i]));
+    // Lambda: true when the UTF-16 unit is not a surrogate (0xD800-0xDFFF), i.e. a whole BMP char
+    auto isBMP = [](uint16_t ch) { return ch < 0xD800 || ch > 0xDFFF; };
+
+    // Lambda: join the low 10 bits of the high and low surrogate, then add 0x10000 (code point)
+    auto decodeSurrogatePair = [](uint16_t high, uint16_t low) -> uint32_t {
+        return 0x10000 + (static_cast<uint32_t>(high & 0x3FF) << 10) + (low & 0x3FF);
+    };
+
+    // Convert UTF-16 to UTF-32 directly without intermediate buffer
+    std::wstring result;
+    result.reserve(length);  // Reserve assuming most chars are BMP
+
+    size_t i = 0;
+    while (i < length) {
+        uint16_t utf16Char = static_cast<uint16_t>(sqlwStr[i]);
+
+        // Fast path: BMP character (most common - ~99% of strings)
+        if (isBMP(utf16Char)) {
+            result.push_back(static_cast<wchar_t>(utf16Char));
+            ++i;
+        }
+        // Handle surrogate pairs for characters outside BMP
+        else if (utf16Char <= 0xDBFF) {  // High surrogate
+            if (i + 1 < length) {
+                uint16_t lowSurrogate = static_cast<uint16_t>(sqlwStr[i + 1]);
+                if (lowSurrogate >= 0xDC00 && lowSurrogate <= 0xDFFF) {
+                    uint32_t codePoint = decodeSurrogatePair(utf16Char, lowSurrogate);
+                    result.push_back(static_cast<wchar_t>(codePoint));
+                    i += 2;
+                    continue;
+                }
+            }
+            // Invalid surrogate - replace with Unicode replacement character
+            result.push_back(static_cast<wchar_t>(kUnicodeReplacementChar));
+            ++i;
+        } else {  // Low surrogate without high - invalid, replace with replacement character
+            result.push_back(static_cast<wchar_t>(kUnicodeReplacementChar));
+            ++i;
         }
-        return result;
     }
+    return result;
 }
 
-// Function to convert std::wstring to SQLWCHAR array on macOS
-// THREAD-SAFE: Uses thread_local converter to avoid std::wstring_convert race conditions
+// Convert a std::wstring (UTF-32 on macOS/Linux) to a null-terminated UTF-16 SQLWCHAR vector
+// Code points above 0xFFFF are written as a surrogate pair (two SQLWCHAR units)
+// Invalid Unicode scalars (surrogates, values > 0x10FFFF) are replaced with U+FFFD
 std::vector<SQLWCHAR> WStringToSQLWCHAR(const std::wstring& str) {
-    try {
-        // CRITICAL FIX: Use thread_local to make std::wstring_convert thread-safe
-        // std::wstring_convert is NOT thread-safe and its use is deprecated in C++17
-        // Each thread gets its own converter instance, eliminating race conditions
-        thread_local std::wstring_convert<
-            std::codecvt_utf8_utf16<wchar_t, 0x10ffff, std::little_endian>>
-            converter;
-
-        std::string utf16Bytes = converter.to_bytes(str);
-
-        // Convert the bytes to SQLWCHAR array
-        std::vector<SQLWCHAR> result(utf16Bytes.size() / kUcsLength + 1,
-                                     0);  // +1 for null terminator
-        for (size_t i = 0; i < utf16Bytes.size() / kUcsLength; ++i) {
-            memcpy(&result[i], &utf16Bytes[i * kUcsLength], kUcsLength);
+    if (str.empty()) {
+        return std::vector<SQLWCHAR>(1, 0);  // Just null terminator
+    }
+
+    // Lambda: append cp (callers pass 0x10000-0x10FFFF) to vec as a high/low surrogate pair
+    auto encodeSurrogatePair = [](std::vector<SQLWCHAR>& vec, uint32_t cp) {
+        cp -= 0x10000;
+        vec.push_back(static_cast<SQLWCHAR>(0xD800 | ((cp >> 10) & 0x3FF)));
+        vec.push_back(static_cast<SQLWCHAR>(0xDC00 | (cp & 0x3FF)));
+    };
+
+    // Lambda to check if code point is a valid Unicode scalar value
+    auto isValidUnicodeScalar = [](uint32_t cp) -> bool {
+        // Exclude surrogate range (0xD800-0xDFFF) and values beyond max Unicode
+        return cp <= kUnicodeMaxCodePoint && (cp < 0xD800 || cp > 0xDFFF);
+    };
+
+    // Convert wstring (UTF-32) to UTF-16
+    std::vector<SQLWCHAR> result;
+    result.reserve(str.size() + 1);  // 1 unit per BMP char + terminator; non-BMP chars add more
+
+    for (wchar_t wc : str) {
+        uint32_t codePoint = static_cast<uint32_t>(wc);
+
+        // Validate code point first
+        if (!isValidUnicodeScalar(codePoint)) {
+            codePoint = kUnicodeReplacementChar;
         }
-        return result;
-    } catch (const std::exception& e) {
-        // Fallback to simple casting if codecvt fails
-        std::vector<SQLWCHAR> result(str.size() + 1,
-                                     0);  // +1 for null terminator
-        for (size_t i = 0; i < str.size(); ++i) {
-            result[i] = static_cast<SQLWCHAR>(str[i]);
+
+        // Fast path: BMP character, stored as a single UTF-16 unit
+        // After validation, codePoint cannot be in surrogate range (0xD800-0xDFFF)
+        if (codePoint <= 0xFFFF) {
+            result.push_back(static_cast<SQLWCHAR>(codePoint));
         }
-        return result;
+        // Encode as surrogate pair for characters outside BMP
+        else if (codePoint <= kUnicodeMaxCodePoint) {
+            encodeSurrogatePair(result, codePoint);
+        }
+        // Note: the range check above always holds here, because invalid code points
+        // (surrogates and > 0x10FFFF) were already replaced with 0xFFFD at validation
     }
+
+    result.push_back(0);  // Null terminator
+    return result;
 }
 
 #endif