Skip to content

Commit f119d05

Browse files
subrata-ms and subrata-ms authored
FIX: Fix for deprecated lib function wstring_convert (#365)
### Work Item / Issue Reference

<!-- mssql-python maintainers: ADO Work Item -->
> [AB#40879](https://sqlclientdrivers.visualstudio.com/c6d89619-62de-46a0-8b46-70b92a84d85e/_workitems/edit/40879)

<!-- External contributors: GitHub Issue -->
> GitHub Issue: #<ISSUE_NUMBER>

-------------------------------------------------------------------

### Summary

<!-- Insert your summary of changes below. Minimum 10 characters required. -->
This pull request refactors and optimizes the string conversion utilities in `unix_utils.cpp` for converting between `SQLWCHAR` arrays and `std::wstring` on macOS/Linux. The new implementation eliminates intermediate buffers and reliance on `codecvt`, resulting in more efficient and robust conversions, especially for Unicode characters outside the Basic Multilingual Plane (BMP).

**String conversion optimizations:**

* Replaced the previous `SQLWCHARToWString` implementation with a direct UTF-16 to UTF-32 conversion, handling surrogate pairs explicitly and removing the use of `std::wstring_convert` and intermediate buffers.
* Improved the `WStringToSQLWCHAR` function to convert `std::wstring` (UTF-32) to UTF-16, encoding surrogate pairs manually and streamlining the conversion logic for better performance and branch prediction.

**Robustness and correctness improvements:**

* Added explicit handling for invalid surrogate pairs and code points, ensuring that malformed input does not cause conversion failures or exceptions.
* Ensured that both conversion functions always append a null terminator to the output, maintaining compatibility with ODBC expectations.

**Code simplification:**

* Removed exception handling and fallback code paths by providing a single, reliable conversion strategy for both directions.

---------

Co-authored-by: subrata-ms <subrata@microsoft.com>
1 parent eb95d2e commit f119d05

File tree

6 files changed

+2301
-60
lines changed

6 files changed

+2301
-60
lines changed

eng/pipelines/pr-validation-pipeline.yml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1395,14 +1395,12 @@ jobs:
13951395
13961396
- script: |
13971397
# Create a Docker container for testing on x86_64
1398-
# TODO(AB#40901): Temporary pin to 3.22 due to msodbcsql ARM64 package arch mismatch
1399-
# Revert to alpine:latest once ODBC team releases fixed ARM64 package
14001398
docker run -d --name test-container-alpine \
14011399
--platform linux/amd64 \
14021400
-v $(Build.SourcesDirectory):/workspace \
14031401
-w /workspace \
14041402
--network bridge \
1405-
alpine:3.22 \
1403+
alpine:latest \
14061404
tail -f /dev/null
14071405
displayName: 'Create Alpine x86_64 container'
14081406

mssql_python/pybind/ddbc_bindings.h

Lines changed: 93 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -458,8 +458,99 @@ inline std::wstring Utf8ToWString(const std::string& str) {
458458
return {};
459459
return result;
460460
#else
461-
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
462-
return converter.from_bytes(str);
461+
// Optimized UTF-8 to UTF-32 conversion (wstring on Unix)
462+
463+
// Lambda to decode UTF-8 multi-byte sequences
464+
auto decodeUtf8 = [](const unsigned char* data, size_t& i, size_t len) -> wchar_t {
465+
unsigned char byte = data[i];
466+
467+
// 1-byte sequence (ASCII): 0xxxxxxx
468+
if (byte <= 0x7F) {
469+
++i;
470+
return static_cast<wchar_t>(byte);
471+
}
472+
// 2-byte sequence: 110xxxxx 10xxxxxx
473+
if ((byte & 0xE0) == 0xC0 && i + 1 < len) {
474+
// Validate continuation byte has correct bit pattern (10xxxxxx)
475+
if ((data[i + 1] & 0xC0) != 0x80) {
476+
++i;
477+
return 0xFFFD; // Invalid continuation byte
478+
}
479+
uint32_t cp = ((static_cast<uint32_t>(byte & 0x1F) << 6) | (data[i + 1] & 0x3F));
480+
// Reject overlong encodings (must be >= 0x80)
481+
if (cp >= 0x80) {
482+
i += 2;
483+
return static_cast<wchar_t>(cp);
484+
}
485+
// Overlong encoding - invalid
486+
++i;
487+
return 0xFFFD;
488+
}
489+
// 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
490+
if ((byte & 0xF0) == 0xE0 && i + 2 < len) {
491+
// Validate continuation bytes have correct bit pattern (10xxxxxx)
492+
if ((data[i + 1] & 0xC0) != 0x80 || (data[i + 2] & 0xC0) != 0x80) {
493+
++i;
494+
return 0xFFFD; // Invalid continuation bytes
495+
}
496+
uint32_t cp = ((static_cast<uint32_t>(byte & 0x0F) << 12) |
497+
((data[i + 1] & 0x3F) << 6) | (data[i + 2] & 0x3F));
498+
// Reject overlong encodings (must be >= 0x800) and surrogates (0xD800-0xDFFF)
499+
if (cp >= 0x800 && (cp < 0xD800 || cp > 0xDFFF)) {
500+
i += 3;
501+
return static_cast<wchar_t>(cp);
502+
}
503+
// Overlong encoding or surrogate - invalid
504+
++i;
505+
return 0xFFFD;
506+
}
507+
// 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
508+
if ((byte & 0xF8) == 0xF0 && i + 3 < len) {
509+
// Validate continuation bytes have correct bit pattern (10xxxxxx)
510+
if ((data[i + 1] & 0xC0) != 0x80 || (data[i + 2] & 0xC0) != 0x80 ||
511+
(data[i + 3] & 0xC0) != 0x80) {
512+
++i;
513+
return 0xFFFD; // Invalid continuation bytes
514+
}
515+
uint32_t cp =
516+
((static_cast<uint32_t>(byte & 0x07) << 18) | ((data[i + 1] & 0x3F) << 12) |
517+
((data[i + 2] & 0x3F) << 6) | (data[i + 3] & 0x3F));
518+
// Reject overlong encodings (must be >= 0x10000) and values above max Unicode
519+
if (cp >= 0x10000 && cp <= 0x10FFFF) {
520+
i += 4;
521+
return static_cast<wchar_t>(cp);
522+
}
523+
// Overlong encoding or out of range - invalid
524+
++i;
525+
return 0xFFFD;
526+
}
527+
// Invalid sequence - skip byte
528+
++i;
529+
return 0xFFFD; // Unicode replacement character
530+
};
531+
532+
std::wstring result;
533+
result.reserve(str.size()); // Reserve assuming mostly ASCII
534+
535+
const unsigned char* data = reinterpret_cast<const unsigned char*>(str.data());
536+
const size_t len = str.size();
537+
size_t i = 0;
538+
539+
// Fast path for ASCII-only prefix (most common case)
540+
while (i < len && data[i] <= 0x7F) {
541+
result.push_back(static_cast<wchar_t>(data[i]));
542+
++i;
543+
}
544+
545+
// Handle remaining multi-byte sequences
546+
while (i < len) {
547+
wchar_t wc = decodeUtf8(data, i, len);
548+
// Always push the decoded character (including 0xFFFD replacement characters)
549+
// This correctly handles both legitimate 0xFFFD in input and invalid sequences
550+
result.push_back(wc);
551+
}
552+
553+
return result;
463554
#endif
464555
}
465556

mssql_python/pybind/unix_utils.cpp

Lines changed: 99 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@
1313
#include <vector>
1414

1515
#if defined(__APPLE__) || defined(__linux__)
16+
17+
// Unicode constants for validation
18+
constexpr uint32_t kUnicodeReplacementChar = 0xFFFD;
19+
constexpr uint32_t kUnicodeMaxCodePoint = 0x10FFFF;
20+
1621
// Constants for character encoding
1722
const char* kOdbcEncoding = "utf-16-le"; // ODBC uses UTF-16LE for SQLWCHAR
1823
const size_t kUcsLength = 2; // SQLWCHAR is 2 bytes on all platforms
@@ -24,74 +29,113 @@ std::wstring SQLWCHARToWString(const SQLWCHAR* sqlwStr, size_t length = SQL_NTS)
2429
return std::wstring();
2530
}
2631

32+
// Lambda to calculate string length using pointer arithmetic
33+
auto calculateLength = [](const SQLWCHAR* str) -> size_t {
34+
const SQLWCHAR* p = str;
35+
while (*p)
36+
++p;
37+
return p - str;
38+
};
39+
2740
if (length == SQL_NTS) {
28-
// Determine length if not provided
29-
size_t i = 0;
30-
while (sqlwStr[i] != 0)
31-
++i;
32-
length = i;
41+
length = calculateLength(sqlwStr);
3342
}
3443

35-
// Create a UTF-16LE byte array from the SQLWCHAR array
36-
std::vector<char> utf16Bytes(length * kUcsLength);
37-
for (size_t i = 0; i < length; ++i) {
38-
// Copy each SQLWCHAR (2 bytes) to the byte array
39-
memcpy(&utf16Bytes[i * kUcsLength], &sqlwStr[i], kUcsLength);
44+
if (length == 0) {
45+
return std::wstring();
4046
}
4147

42-
// Convert UTF-16LE to std::wstring (UTF-32 on macOS)
43-
try {
44-
// CRITICAL FIX: Use thread_local to make std::wstring_convert thread-safe
45-
// std::wstring_convert is NOT thread-safe and its use is deprecated in C++17
46-
// Each thread gets its own converter instance, eliminating race conditions
47-
thread_local std::wstring_convert<
48-
std::codecvt_utf8_utf16<wchar_t, 0x10ffff, std::little_endian>>
49-
converter;
50-
51-
std::wstring result = converter.from_bytes(
52-
reinterpret_cast<const char*>(utf16Bytes.data()),
53-
reinterpret_cast<const char*>(utf16Bytes.data() + utf16Bytes.size()));
54-
return result;
55-
} catch (const std::exception& e) {
56-
// Fallback to character-by-character conversion if codecvt fails
57-
std::wstring result;
58-
result.reserve(length);
59-
for (size_t i = 0; i < length; ++i) {
60-
result.push_back(static_cast<wchar_t>(sqlwStr[i]));
48+
// Lambda to check if character is in Basic Multilingual Plane
49+
auto isBMP = [](uint16_t ch) { return ch < 0xD800 || ch > 0xDFFF; };
50+
51+
// Lambda to decode surrogate pair into code point
52+
auto decodeSurrogatePair = [](uint16_t high, uint16_t low) -> uint32_t {
53+
return 0x10000 + (static_cast<uint32_t>(high & 0x3FF) << 10) + (low & 0x3FF);
54+
};
55+
56+
// Convert UTF-16 to UTF-32 directly without intermediate buffer
57+
std::wstring result;
58+
result.reserve(length); // Reserve assuming most chars are BMP
59+
60+
size_t i = 0;
61+
while (i < length) {
62+
uint16_t utf16Char = static_cast<uint16_t>(sqlwStr[i]);
63+
64+
// Fast path: BMP character (most common - ~99% of strings)
65+
if (isBMP(utf16Char)) {
66+
result.push_back(static_cast<wchar_t>(utf16Char));
67+
++i;
68+
}
69+
// Handle surrogate pairs for characters outside BMP
70+
else if (utf16Char <= 0xDBFF) { // High surrogate
71+
if (i + 1 < length) {
72+
uint16_t lowSurrogate = static_cast<uint16_t>(sqlwStr[i + 1]);
73+
if (lowSurrogate >= 0xDC00 && lowSurrogate <= 0xDFFF) {
74+
uint32_t codePoint = decodeSurrogatePair(utf16Char, lowSurrogate);
75+
result.push_back(static_cast<wchar_t>(codePoint));
76+
i += 2;
77+
continue;
78+
}
79+
}
80+
// Invalid surrogate - replace with Unicode replacement character
81+
result.push_back(static_cast<wchar_t>(kUnicodeReplacementChar));
82+
++i;
83+
} else { // Low surrogate without high - invalid, replace with replacement character
84+
result.push_back(static_cast<wchar_t>(kUnicodeReplacementChar));
85+
++i;
6186
}
62-
return result;
6387
}
88+
return result;
6489
}
6590

66-
// Function to convert std::wstring to SQLWCHAR array on macOS
67-
// THREAD-SAFE: Uses thread_local converter to avoid std::wstring_convert race conditions
91+
// Function to convert std::wstring to SQLWCHAR array on macOS/Linux
92+
// Converts UTF-32 (wstring on Unix) to UTF-16 (SQLWCHAR)
93+
// Invalid Unicode scalars (surrogates, values > 0x10FFFF) are replaced with U+FFFD
6894
std::vector<SQLWCHAR> WStringToSQLWCHAR(const std::wstring& str) {
69-
try {
70-
// CRITICAL FIX: Use thread_local to make std::wstring_convert thread-safe
71-
// std::wstring_convert is NOT thread-safe and its use is deprecated in C++17
72-
// Each thread gets its own converter instance, eliminating race conditions
73-
thread_local std::wstring_convert<
74-
std::codecvt_utf8_utf16<wchar_t, 0x10ffff, std::little_endian>>
75-
converter;
76-
77-
std::string utf16Bytes = converter.to_bytes(str);
78-
79-
// Convert the bytes to SQLWCHAR array
80-
std::vector<SQLWCHAR> result(utf16Bytes.size() / kUcsLength + 1,
81-
0); // +1 for null terminator
82-
for (size_t i = 0; i < utf16Bytes.size() / kUcsLength; ++i) {
83-
memcpy(&result[i], &utf16Bytes[i * kUcsLength], kUcsLength);
95+
if (str.empty()) {
96+
return std::vector<SQLWCHAR>(1, 0); // Just null terminator
97+
}
98+
99+
// Lambda to encode code point as surrogate pair and append to result
100+
auto encodeSurrogatePair = [](std::vector<SQLWCHAR>& vec, uint32_t cp) {
101+
cp -= 0x10000;
102+
vec.push_back(static_cast<SQLWCHAR>(0xD800 | ((cp >> 10) & 0x3FF)));
103+
vec.push_back(static_cast<SQLWCHAR>(0xDC00 | (cp & 0x3FF)));
104+
};
105+
106+
// Lambda to check if code point is a valid Unicode scalar value
107+
auto isValidUnicodeScalar = [](uint32_t cp) -> bool {
108+
// Exclude surrogate range (0xD800-0xDFFF) and values beyond max Unicode
109+
return cp <= kUnicodeMaxCodePoint && (cp < 0xD800 || cp > 0xDFFF);
110+
};
111+
112+
// Convert wstring (UTF-32) to UTF-16
113+
std::vector<SQLWCHAR> result;
114+
result.reserve(str.size() + 1); // Most chars are BMP, so reserve exact size
115+
116+
for (wchar_t wc : str) {
117+
uint32_t codePoint = static_cast<uint32_t>(wc);
118+
119+
// Validate code point first
120+
if (!isValidUnicodeScalar(codePoint)) {
121+
codePoint = kUnicodeReplacementChar;
84122
}
85-
return result;
86-
} catch (const std::exception& e) {
87-
// Fallback to simple casting if codecvt fails
88-
std::vector<SQLWCHAR> result(str.size() + 1,
89-
0); // +1 for null terminator
90-
for (size_t i = 0; i < str.size(); ++i) {
91-
result[i] = static_cast<SQLWCHAR>(str[i]);
123+
124+
// Fast path: BMP character (most common - ~99% of strings)
125+
// After validation, codePoint cannot be in surrogate range (0xD800-0xDFFF)
126+
if (codePoint <= 0xFFFF) {
127+
result.push_back(static_cast<SQLWCHAR>(codePoint));
92128
}
93-
return result;
129+
// Encode as surrogate pair for characters outside BMP
130+
else if (codePoint <= kUnicodeMaxCodePoint) {
131+
encodeSurrogatePair(result, codePoint);
132+
}
133+
// Note: Invalid code points (surrogates and > 0x10FFFF) already
134+
// replaced with replacement character (0xFFFD) at validation above
94135
}
136+
137+
result.push_back(0); // Null terminator
138+
return result;
95139
}
96140

97141
#endif

0 commit comments

Comments
 (0)