1414
1515#if defined(__APPLE__) || defined(__linux__)
1616
17+ // Unicode constants for validation
18+ constexpr uint32_t kUnicodeReplacementChar = 0xFFFD ;
19+ constexpr uint32_t kUnicodeMaxCodePoint = 0x10FFFF ;
20+
1721// Constants for character encoding
1822const char * kOdbcEncoding = " utf-16-le" ; // ODBC uses UTF-16LE for SQLWCHAR
1923const size_t kUcsLength = 2 ; // SQLWCHAR is 2 bytes on all platforms
2024
2125// Function to convert SQLWCHAR strings to std::wstring on macOS/Linux
22- // Optimized version: direct conversion without intermediate buffer
26+ // Converts UTF-16 (SQLWCHAR) to UTF-32 (wstring on Unix)
27+ // Invalid surrogates (unpaired high/low) are replaced with U+FFFD
2328std::wstring SQLWCHARToWString (const SQLWCHAR* sqlwStr, size_t length = SQL_NTS) {
2429 if (!sqlwStr) {
2530 return std::wstring ();
@@ -73,19 +78,20 @@ std::wstring SQLWCHARToWString(const SQLWCHAR* sqlwStr, size_t length = SQL_NTS)
7378 continue ;
7479 }
7580 }
76- // Invalid surrogate - push as-is
77- result.push_back (static_cast <wchar_t >(utf16Char ));
81+ // Invalid surrogate - replace with Unicode replacement character
82+ result.push_back (static_cast <wchar_t >(kUnicodeReplacementChar ));
7883 ++i;
79- } else { // Low surrogate without high - invalid but push as-is
80- result.push_back (static_cast <wchar_t >(utf16Char ));
84+ } else { // Low surrogate without high - invalid, replace with replacement character
85+ result.push_back (static_cast <wchar_t >(kUnicodeReplacementChar ));
8186 ++i;
8287 }
8388 }
8489 return result;
8590}
8691
8792// Function to convert std::wstring to SQLWCHAR array on macOS/Linux
88- // Optimized version: streamlined conversion with better branch prediction
93+ // Converts UTF-32 (wstring on Unix) to UTF-16 (SQLWCHAR)
94+ // Code points that are not Unicode scalar values (surrogates 0xD800-0xDFFF, values > 0x10FFFF) are replaced with U+FFFD
8995std::vector<SQLWCHAR> WStringToSQLWCHAR (const std::wstring& str) {
9096 if (str.empty ()) {
9197 return std::vector<SQLWCHAR>(1 , 0 ); // Just null terminator
@@ -98,22 +104,35 @@ std::vector<SQLWCHAR> WStringToSQLWCHAR(const std::wstring& str) {
98104 vec.push_back (static_cast <SQLWCHAR>(0xDC00 | (cp & 0x3FF )));
99105 };
100106
107+ // Lambda to check if code point is a valid Unicode scalar value
108+ auto isValidUnicodeScalar = [](uint32_t cp) -> bool {
109+ // Exclude surrogate range (0xD800-0xDFFF) and values beyond max Unicode
110+ return cp <= kUnicodeMaxCodePoint && (cp < 0xD800 || cp > 0xDFFF );
111+ };
112+
101113 // Convert wstring (UTF-32) to UTF-16
102114 std::vector<SQLWCHAR> result;
103115 result.reserve (str.size () + 1 ); // Most chars are BMP, so reserve exact size
104116
105117 for (wchar_t wc : str) {
106118 uint32_t codePoint = static_cast <uint32_t >(wc);
107119
120+ // Validate code point first
121+ if (!isValidUnicodeScalar (codePoint)) {
122+ codePoint = kUnicodeReplacementChar ;
123+ }
124+
108125 // Fast path: BMP character (most common - ~99% of strings)
126+ // After validation, codePoint cannot be in surrogate range (0xD800-0xDFFF)
109127 if (codePoint <= 0xFFFF ) {
110128 result.push_back (static_cast <SQLWCHAR>(codePoint));
111129 }
112130 // Encode as surrogate pair for characters outside BMP
113- else if (codePoint <= 0x10FFFF ) {
131+ else if (codePoint <= kUnicodeMaxCodePoint ) {
114132 encodeSurrogatePair (result, codePoint);
115133 }
116- // Invalid code points silently skipped
134+ // Note: Invalid code points (surrogates and > 0x10FFFF) already
135+ // replaced with replacement character (0xFFFD) at validation above
117136 }
118137
119138 result.push_back (0 ); // Null terminator
0 commit comments