microsoft · subrata-ms · Dec 10, 2025 · Dec 5, 2025 · Dec 8, 2025 · Dec 8, 2025
diff --git a/mssql_python/pybind/ddbc_bindings.h b/mssql_python/pybind/ddbc_bindings.h
@@ -458,8 +458,88 @@ inline std::wstring Utf8ToWString(const std::string& str) {
         return {};
     return result;
 #else
-    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
-    return converter.from_bytes(str);
+    // UTF-8 to UTF-32 conversion (wstring on Unix; wchar_t is assumed 32 bits wide).
+    // Malformed input never throws: each byte that cannot begin a well-formed
+    // sequence becomes U+FFFD and decoding resumes at the following byte.
+    if (str.empty())
+        return {};
+
+    // Decodes the sequence starting at data[i] and advances i past it.
+    // Rejects truncated sequences, bad continuation bytes, overlong forms,
+    // UTF-16 surrogates and values above U+10FFFF; on rejection it consumes
+    // exactly one byte and returns U+FFFD.
+    const auto decodeUtf8 = [](const unsigned char* data, size_t& i, size_t len) -> wchar_t {
+        constexpr wchar_t kReplacement = 0xFFFD;  // U+FFFD REPLACEMENT CHARACTER
+        const unsigned char lead = data[i];
+
+        // 1-byte sequence (ASCII): 0xxxxxxx
+        if (lead <= 0x7F) {
+            ++i;
+            return static_cast<wchar_t>(lead);
+        }
+
+        size_t trailing = 0;  // continuation bytes announced by the lead byte
+        uint32_t cp = 0;      // code point being assembled
+        uint32_t minCp = 0;   // smallest value this length may encode (overlong check)
+        if ((lead & 0xE0) == 0xC0) {  // 2-byte sequence: 110xxxxx
+            trailing = 1;
+            cp = lead & 0x1F;
+            minCp = 0x80;
+        } else if ((lead & 0xF0) == 0xE0) {  // 3-byte sequence: 1110xxxx
+            trailing = 2;
+            cp = lead & 0x0F;
+            minCp = 0x800;
+        } else if ((lead & 0xF8) == 0xF0) {  // 4-byte sequence: 11110xxx
+            trailing = 3;
+            cp = lead & 0x07;
+            minCp = 0x10000;
+        } else {  // stray continuation byte or 0xF8..0xFF
+            ++i;
+            return kReplacement;
+        }
+
+        if (len - i <= trailing) {  // sequence would run past the end of the input
+            ++i;
+            return kReplacement;
+        }
+        for (size_t k = 1; k <= trailing; ++k) {
+            const unsigned char cont = data[i + k];
+            if ((cont & 0xC0) != 0x80) {  // continuation bytes must be 10xxxxxx
+                ++i;
+                return kReplacement;
+            }
+            cp = (cp << 6) | (cont & 0x3F);
+        }
+
+        const bool isSurrogate = cp >= 0xD800 && cp <= 0xDFFF;
+        if (cp < minCp || cp > 0x10FFFF || isSurrogate) {
+            ++i;
+            return kReplacement;
+        }
+        i += trailing + 1;
+        return static_cast<wchar_t>(cp);
+    };
+
+    std::wstring result;
+    result.reserve(str.size());  // Upper bound: at most one wchar_t per input byte
+
+    const unsigned char* data = reinterpret_cast<const unsigned char*>(str.data());
+    const size_t len = str.size();
+    size_t i = 0;
+
+    // Fast path for ASCII-only prefix (most common case)
+    while (i < len && data[i] <= 0x7F) {
+        result.push_back(static_cast<wchar_t>(data[i]));
+        ++i;
+    }
+
+    // Decode the remainder; replacement characters are kept so that corrupted
+    // input stays visible instead of being silently dropped.
+    while (i < len) {
+        result.push_back(decodeUtf8(data, i, len));
+    }
+
+    return result;
 #endif
 }
 

diff --git a/mssql_python/pybind/unix_utils.cpp b/mssql_python/pybind/unix_utils.cpp
@@ -13,75 +13,124 @@
 #include <vector>
 
 #if defined(__APPLE__) || defined(__linux__)
+
 // Constants for character encoding
 const char* kOdbcEncoding = "utf-16-le";  // ODBC uses UTF-16LE for SQLWCHAR
 const size_t kUcsLength = 2;              // SQLWCHAR is 2 bytes on all platforms
 
-// Function to convert SQLWCHAR strings to std::wstring on macOS
+// Converts a UTF-16 SQLWCHAR string to std::wstring (UTF-32) on macOS/Linux.
+//
+// sqlwStr: UTF-16 code units; a null pointer yields an empty string.
+// length:  number of code units to convert, or SQL_NTS (the default) to scan
+//          for the terminating 0.
+//
+// Well-formed surrogate pairs are combined into a single code point. An
+// unpaired surrogate has no UTF-32 representation, so it is replaced by U+FFFD
+// rather than copied through; the result never contains a surrogate value.
 std::wstring SQLWCHARToWString(const SQLWCHAR* sqlwStr, size_t length = SQL_NTS) {
     if (!sqlwStr) {
         return std::wstring();
     }
 
+    // Counts code units up to, but not including, the terminating 0
+    auto calculateLength = [](const SQLWCHAR* str) -> size_t {
+        const SQLWCHAR* p = str;
+        while (*p)
+            ++p;
+        return static_cast<size_t>(p - str);
+    };
+
     if (length == SQL_NTS) {
-        // Determine length if not provided
-        size_t i = 0;
-        while (sqlwStr[i] != 0)
-            ++i;
-        length = i;
+        length = calculateLength(sqlwStr);
     }
 
-    // Create a UTF-16LE byte array from the SQLWCHAR array
-    std::vector<char> utf16Bytes(length * kUcsLength);
-    for (size_t i = 0; i < length; ++i) {
-        // Copy each SQLWCHAR (2 bytes) to the byte array
-        memcpy(&utf16Bytes[i * kUcsLength], &sqlwStr[i], kUcsLength);
+    if (length == 0) {
+        return std::wstring();
     }
 
-    // Convert UTF-16LE to std::wstring (UTF-32 on macOS)
-    try {
-        // Use C++11 codecvt to convert between UTF-16LE and wstring
-        std::wstring_convert<std::codecvt_utf8_utf16<wchar_t, 0x10ffff, std::little_endian>>
-            converter;
-        std::wstring result = converter.from_bytes(
-            reinterpret_cast<const char*>(utf16Bytes.data()),
-            reinterpret_cast<const char*>(utf16Bytes.data() + utf16Bytes.size()));
-        return result;
-    } catch (const std::exception& e) {
-        // Fallback to character-by-character conversion if codecvt fails
-        std::wstring result;
-        result.reserve(length);
-        for (size_t i = 0; i < length; ++i) {
-            result.push_back(static_cast<wchar_t>(sqlwStr[i]));
+    constexpr wchar_t kReplacementChar = 0xFFFD;  // U+FFFD REPLACEMENT CHARACTER
+
+    auto isSurrogate = [](uint16_t ch) { return ch >= 0xD800 && ch <= 0xDFFF; };
+    auto isHighSurrogate = [](uint16_t ch) { return ch >= 0xD800 && ch <= 0xDBFF; };
+    auto isLowSurrogate = [](uint16_t ch) { return ch >= 0xDC00 && ch <= 0xDFFF; };
+
+    // Combines a high/low surrogate pair into its supplementary-plane code point
+    auto decodeSurrogatePair = [](uint16_t high, uint16_t low) -> uint32_t {
+        return 0x10000 + (static_cast<uint32_t>(high & 0x3FF) << 10) + (low & 0x3FF);
+    };
+
+    // Convert UTF-16 to UTF-32 directly without intermediate buffer
+    std::wstring result;
+    result.reserve(length);  // Upper bound: at most one wchar_t per code unit
+
+    size_t i = 0;
+    while (i < length) {
+        const uint16_t unit = static_cast<uint16_t>(sqlwStr[i]);
+
+        if (!isSurrogate(unit)) {
+            // Fast path: a BMP code unit is already the code point
+            result.push_back(static_cast<wchar_t>(unit));
+            ++i;
+        } else if (isHighSurrogate(unit) && i + 1 < length &&
+                   isLowSurrogate(static_cast<uint16_t>(sqlwStr[i + 1]))) {
+            const uint16_t low = static_cast<uint16_t>(sqlwStr[i + 1]);
+            result.push_back(static_cast<wchar_t>(decodeSurrogatePair(unit, low)));
+            i += 2;
+        } else {
+            // Lone high surrogate, or low surrogate with no preceding high one
+            result.push_back(kReplacementChar);
+            ++i;
         }
-        return result;
     }
+    return result;
 }
 
-// Function to convert std::wstring to SQLWCHAR array on macOS
+// Converts a std::wstring (UTF-32) to a null-terminated UTF-16 SQLWCHAR array
+// on macOS/Linux.
+//
+// Code points above U+FFFF are written as a surrogate pair. Values that are not
+// Unicode scalar values (surrogates U+D800..U+DFFF, anything above U+10FFFF,
+// including negative wchar_t values) are written as U+FFFD, so the output is
+// always well-formed UTF-16 and no input character is dropped.
+//
+// Returns the code units followed by a single 0 terminator.
 std::vector<SQLWCHAR> WStringToSQLWCHAR(const std::wstring& str) {
-    try {
-        // Convert wstring (UTF-32 on macOS) to UTF-16LE bytes
-        std::wstring_convert<std::codecvt_utf8_utf16<wchar_t, 0x10ffff, std::little_endian>>
-            converter;
-        std::string utf16Bytes = converter.to_bytes(str);
-
-        // Convert the bytes to SQLWCHAR array
-        std::vector<SQLWCHAR> result(utf16Bytes.size() / kUcsLength + 1,
-                                     0);  // +1 for null terminator
-        for (size_t i = 0; i < utf16Bytes.size() / kUcsLength; ++i) {
-            memcpy(&result[i], &utf16Bytes[i * kUcsLength], kUcsLength);
+    if (str.empty()) {
+        return std::vector<SQLWCHAR>(1, 0);  // Just null terminator
+    }
+
+    constexpr uint32_t kReplacementChar = 0xFFFD;  // U+FFFD REPLACEMENT CHARACTER
+
+    // Appends the surrogate pair for cp; cp must be in U+10000..U+10FFFF
+    auto encodeSurrogatePair = [](std::vector<SQLWCHAR>& vec, uint32_t cp) {
+        cp -= 0x10000;
+        vec.push_back(static_cast<SQLWCHAR>(0xD800 | ((cp >> 10) & 0x3FF)));
+        vec.push_back(static_cast<SQLWCHAR>(0xDC00 | (cp & 0x3FF)));
+    };
+
+    // Convert wstring (UTF-32) to UTF-16
+    std::vector<SQLWCHAR> result;
+    result.reserve(str.size() + 1);  // Exact when every character is in the BMP
+
+    for (wchar_t wc : str) {
+        // The unsigned cast maps negative wchar_t values above U+10FFFF
+        uint32_t codePoint = static_cast<uint32_t>(wc);
+
+        const bool isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
+        if (isSurrogate || codePoint > 0x10FFFF) {
+            codePoint = kReplacementChar;
         }
-        return result;
-    } catch (const std::exception& e) {
-        // Fallback to simple casting if codecvt fails
-        std::vector<SQLWCHAR> result(str.size() + 1,
-                                     0);  // +1 for null terminator
-        for (size_t i = 0; i < str.size(); ++i) {
-            result[i] = static_cast<SQLWCHAR>(str[i]);
+
+        if (codePoint <= 0xFFFF) {
+            // Fast path: BMP character fits in one code unit
+            result.push_back(static_cast<SQLWCHAR>(codePoint));
+        } else {
+            encodeSurrogatePair(result, codePoint);
         }
-        return result;
     }
+
+    result.push_back(0);  // Null terminator
+    return result;
 }
 
 #endif