Resolve Copilot review comment

subrata-ms · subrata-ms · commit 9c1d92a73577 · 2025-12-09T07:52:28.000Z
diff --git a/mssql_python/pybind/ddbc_bindings.h b/mssql_python/pybind/ddbc_bindings.h
@@ -459,11 +459,9 @@ inline std::wstring Utf8ToWString(const std::string& str) {
     return result;
 #else
     // Optimized UTF-8 to UTF-32 conversion (wstring on Unix)
-    if (str.empty())
-        return {};
 
     // Lambda to decode UTF-8 multi-byte sequences
-    constexpr auto decodeUtf8 = [](const unsigned char* data, size_t& i, size_t len) -> wchar_t {
+    auto decodeUtf8 = [](const unsigned char* data, size_t& i, size_t len) -> wchar_t {
         unsigned char byte = data[i];
 
         // 1-byte sequence (ASCII): 0xxxxxxx
@@ -473,24 +471,58 @@ inline std::wstring Utf8ToWString(const std::string& str) {
         }
         // 2-byte sequence: 110xxxxx 10xxxxxx
         if ((byte & 0xE0) == 0xC0 && i + 1 < len) {
+            // Validate continuation byte has correct bit pattern (10xxxxxx)
+            if ((data[i + 1] & 0xC0) != 0x80) {
+                ++i;
+                return 0xFFFD;  // Invalid continuation byte
+            }
             uint32_t cp = ((static_cast<uint32_t>(byte & 0x1F) << 6) | (data[i + 1] & 0x3F));
-            i += 2;
-            return static_cast<wchar_t>(cp);
+            // Reject overlong encodings (must be >= 0x80)
+            if (cp >= 0x80) {
+                i += 2;
+                return static_cast<wchar_t>(cp);
+            }
+            // Overlong encoding - invalid
+            ++i;
+            return 0xFFFD;
         }
         // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
         if ((byte & 0xF0) == 0xE0 && i + 2 < len) {
+            // Validate continuation bytes have correct bit pattern (10xxxxxx)
+            if ((data[i + 1] & 0xC0) != 0x80 || (data[i + 2] & 0xC0) != 0x80) {
+                ++i;
+                return 0xFFFD;  // Invalid continuation bytes
+            }
             uint32_t cp = ((static_cast<uint32_t>(byte & 0x0F) << 12) |
                            ((data[i + 1] & 0x3F) << 6) | (data[i + 2] & 0x3F));
-            i += 3;
-            return static_cast<wchar_t>(cp);
+            // Reject overlong encodings (must be >= 0x800) and surrogates (0xD800-0xDFFF)
+            if (cp >= 0x800 && (cp < 0xD800 || cp > 0xDFFF)) {
+                i += 3;
+                return static_cast<wchar_t>(cp);
+            }
+            // Overlong encoding or surrogate - invalid
+            ++i;
+            return 0xFFFD;
         }
         // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
         if ((byte & 0xF8) == 0xF0 && i + 3 < len) {
+            // Validate continuation bytes have correct bit pattern (10xxxxxx)
+            if ((data[i + 1] & 0xC0) != 0x80 || (data[i + 2] & 0xC0) != 0x80 ||
+                (data[i + 3] & 0xC0) != 0x80) {
+                ++i;
+                return 0xFFFD;  // Invalid continuation bytes
+            }
             uint32_t cp =
                 ((static_cast<uint32_t>(byte & 0x07) << 18) | ((data[i + 1] & 0x3F) << 12) |
                  ((data[i + 2] & 0x3F) << 6) | (data[i + 3] & 0x3F));
-            i += 4;
-            return static_cast<wchar_t>(cp);
+            // Reject overlong encodings (must be >= 0x10000) and values above max Unicode
+            if (cp >= 0x10000 && cp <= 0x10FFFF) {
+                i += 4;
+                return static_cast<wchar_t>(cp);
+            }
+            // Overlong encoding or out of range - invalid
+            ++i;
+            return 0xFFFD;
         }
         // Invalid sequence - skip byte
         ++i;
@@ -513,9 +545,9 @@ inline std::wstring Utf8ToWString(const std::string& str) {
     // Handle remaining multi-byte sequences
     while (i < len) {
         wchar_t wc = decodeUtf8(data, i, len);
-        if (wc != 0xFFFD || data[i - 1] >= 0x80) {  // Skip invalid sequences
-            result.push_back(wc);
-        }
+        // Always push the decoded character (including 0xFFFD replacement characters)
+        // This correctly handles both legitimate 0xFFFD in input and invalid sequences
+        result.push_back(wc);
     }
 
     return result;
diff --git a/mssql_python/pybind/unix_utils.cpp b/mssql_python/pybind/unix_utils.cpp
@@ -14,12 +14,17 @@
 
 #if defined(__APPLE__) || defined(__linux__)
 
+// Unicode constants for validation
+constexpr uint32_t kUnicodeReplacementChar = 0xFFFD;
+constexpr uint32_t kUnicodeMaxCodePoint = 0x10FFFF;
+
 // Constants for character encoding
 const char* kOdbcEncoding = "utf-16-le";  // ODBC uses UTF-16LE for SQLWCHAR
 const size_t kUcsLength = 2;              // SQLWCHAR is 2 bytes on all platforms
 
 // Function to convert SQLWCHAR strings to std::wstring on macOS/Linux
-// Optimized version: direct conversion without intermediate buffer
+// Converts UTF-16 (SQLWCHAR) to UTF-32 (wstring on Unix)
+// Invalid surrogates (unpaired high/low) are replaced with U+FFFD
 std::wstring SQLWCHARToWString(const SQLWCHAR* sqlwStr, size_t length = SQL_NTS) {
     if (!sqlwStr) {
         return std::wstring();
@@ -73,19 +78,20 @@ std::wstring SQLWCHARToWString(const SQLWCHAR* sqlwStr, size_t length = SQL_NTS)
                     continue;
                 }
             }
-            // Invalid surrogate - push as-is
-            result.push_back(static_cast<wchar_t>(utf16Char));
+            // Invalid surrogate - replace with Unicode replacement character
+            result.push_back(static_cast<wchar_t>(kUnicodeReplacementChar));
             ++i;
-        } else {  // Low surrogate without high - invalid but push as-is
-            result.push_back(static_cast<wchar_t>(utf16Char));
+        } else {  // Low surrogate without high - invalid, replace with replacement character
+            result.push_back(static_cast<wchar_t>(kUnicodeReplacementChar));
             ++i;
         }
     }
     return result;
 }
 
 // Function to convert std::wstring to SQLWCHAR array on macOS/Linux
-// Optimized version: streamlined conversion with better branch prediction
+// Converts UTF-32 (wstring on Unix) to UTF-16 (SQLWCHAR)
+// Invalid Unicode scalars (surrogates, values > 0x10FFFF) are replaced with U+FFFD
 std::vector<SQLWCHAR> WStringToSQLWCHAR(const std::wstring& str) {
     if (str.empty()) {
         return std::vector<SQLWCHAR>(1, 0);  // Just null terminator
@@ -98,22 +104,35 @@ std::vector<SQLWCHAR> WStringToSQLWCHAR(const std::wstring& str) {
         vec.push_back(static_cast<SQLWCHAR>(0xDC00 | (cp & 0x3FF)));
     };
 
+    // Lambda to check if code point is a valid Unicode scalar value
+    auto isValidUnicodeScalar = [](uint32_t cp) -> bool {
+        // Exclude surrogate range (0xD800-0xDFFF) and values beyond max Unicode
+        return cp <= kUnicodeMaxCodePoint && (cp < 0xD800 || cp > 0xDFFF);
+    };
+
     // Convert wstring (UTF-32) to UTF-16
     std::vector<SQLWCHAR> result;
     result.reserve(str.size() + 1);  // Most chars are BMP, so reserve exact size
 
     for (wchar_t wc : str) {
         uint32_t codePoint = static_cast<uint32_t>(wc);
 
+        // Validate code point first
+        if (!isValidUnicodeScalar(codePoint)) {
+            codePoint = kUnicodeReplacementChar;
+        }
+
         // Fast path: BMP character (most common - ~99% of strings)
+        // After validation, codePoint cannot be in surrogate range (0xD800-0xDFFF)
         if (codePoint <= 0xFFFF) {
             result.push_back(static_cast<SQLWCHAR>(codePoint));
         }
         // Encode as surrogate pair for characters outside BMP
-        else if (codePoint <= 0x10FFFF) {
+        else if (codePoint <= kUnicodeMaxCodePoint) {
             encodeSurrogatePair(result, codePoint);
         }
-        // Invalid code points silently skipped
+        // Note: invalid code points (surrogates and values > 0x10FFFF) have already
+        // been replaced with the replacement character (U+FFFD) by the validation above
     }
 
     result.push_back(0);  // Null terminator
diff --git a/tests/test_002_types.py b/tests/test_002_types.py