Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
1dcf9ce
unix utility function fixes
Dec 5, 2025
00260d9
formatting fix
Dec 8, 2025
17b64fc
formate fix
Dec 8, 2025
65d1224
Formate fix
Dec 8, 2025
c281fd3
removing depricated function from ddbc binding
Dec 8, 2025
8850b21
linting fix for ddbc binding
Dec 8, 2025
9ff1de0
comprehensive test cases for UTF-8 conversion
Dec 8, 2025
9c1d92a
resolving co-pilot review comment
Dec 9, 2025
1263895
Merge branch 'main' into subrata-ms/DepricatedFixLinux
subrata-ms Dec 9, 2025
6c59791
pipeline versionning fix
Dec 9, 2025
0eecf67
Code coverage for ddbc_bindings.h
Dec 9, 2025
419b024
cross platform failure fix
subrata-ms Dec 9, 2025
ac56363
unicode char fix for windows
Dec 9, 2025
d695289
Fix Windows CI encoding issue - simplify safe_print to use ASCII dire…
Dec 9, 2025
76d6828
unicode fix for strict assert
subrata-ms Dec 9, 2025
1ca9f73
Merge: Resolve conflict by keeping flexible cross-platform UTF-8 test…
subrata-ms Dec 9, 2025
a4e87a4
fixing test error
subrata-ms Dec 9, 2025
aff37ca
linting fix for test_002_types
subrata-ms Dec 9, 2025
75f374b
skip test for failed scenario
subrata-ms Dec 9, 2025
d03055a
fixing skip test1
subrata-ms Dec 9, 2025
ae6c021
fixing skip test1
subrata-ms Dec 9, 2025
c52fbc6
removing print statement from the test
subrata-ms Dec 10, 2025
59b89c4
cleanning up unnecessary print
subrata-ms Dec 10, 2025
de2791d
improving test coverage
subrata-ms Dec 10, 2025
a7d8697
test coverage for ddbc binding
subrata-ms Dec 10, 2025
be4b70e
fixing the linting issue
subrata-ms Dec 10, 2025
ab15ef9
Merge branch 'main' into subrata-ms/DepricatedFixLinux
subrata-ms Dec 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions eng/pipelines/pr-validation-pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1395,14 +1395,12 @@ jobs:

- script: |
# Create a Docker container for testing on x86_64
# TODO(AB#40901): Temporary pin to 3.22 due to msodbcsql ARM64 package arch mismatch
# Revert to alpine:latest once ODBC team releases fixed ARM64 package
docker run -d --name test-container-alpine \
--platform linux/amd64 \
-v $(Build.SourcesDirectory):/workspace \
-w /workspace \
--network bridge \
alpine:3.22 \
alpine:latest \
tail -f /dev/null
displayName: 'Create Alpine x86_64 container'

Expand Down
95 changes: 93 additions & 2 deletions mssql_python/pybind/ddbc_bindings.h
Original file line number Diff line number Diff line change
Expand Up @@ -458,8 +458,99 @@ inline std::wstring Utf8ToWString(const std::string& str) {
return {};
return result;
#else
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
return converter.from_bytes(str);
// Optimized UTF-8 to UTF-32 conversion (wstring on Unix)

// Lambda that decodes exactly one code point from UTF-8 input.
//   data - start of the byte buffer
//   i    - in/out cursor; on entry the index of the lead byte (must be < len),
//          on exit the index of the next undecoded byte
//   len  - total number of bytes in the buffer
// Returns the decoded code point. Any malformed input (stray continuation
// byte, bad continuation byte, truncated sequence, overlong form, encoded
// surrogate, value above 0x10FFFF) yields 0xFFFD and consumes one byte only.
auto decodeUtf8 = [](const unsigned char* data, size_t& i, size_t len) -> wchar_t {
    constexpr wchar_t kReplacement = 0xFFFD;
    const unsigned char lead = data[i];

    // ASCII (0xxxxxxx) maps straight through.
    if (lead <= 0x7F) {
        ++i;
        return static_cast<wchar_t>(lead);
    }

    // Classify the lead byte: how many continuation bytes follow, which
    // payload bits the lead contributes, and the smallest code point that
    // genuinely requires this sequence length (anything lower is overlong).
    size_t trailing = 0;
    uint32_t cp = 0;
    uint32_t minimum = 0;
    if ((lead & 0xE0) == 0xC0) {  // 110xxxxx
        trailing = 1;
        cp = lead & 0x1F;
        minimum = 0x80;
    } else if ((lead & 0xF0) == 0xE0) {  // 1110xxxx
        trailing = 2;
        cp = lead & 0x0F;
        minimum = 0x800;
    } else if ((lead & 0xF8) == 0xF0) {  // 11110xxx
        trailing = 3;
        cp = lead & 0x07;
        minimum = 0x10000;
    }

    // The sequence must be a recognised multi-byte form and fit in the buffer.
    bool ok = (trailing != 0) && (i + trailing < len);

    // Every continuation byte must look like 10xxxxxx; fold in its 6 payload bits.
    for (size_t k = 1; ok && k <= trailing; ++k) {
        const unsigned char next = data[i + k];
        ok = (next & 0xC0) == 0x80;
        cp = (cp << 6) | (next & 0x3F);
    }

    // Reject overlong encodings, UTF-16 surrogates and values beyond Unicode.
    ok = ok && cp >= minimum && cp <= 0x10FFFF && (cp < 0xD800 || cp > 0xDFFF);

    if (!ok) {
        ++i;  // Skip only the offending lead byte so decoding can resynchronise
        return kReplacement;
    }

    i += trailing + 1;
    return static_cast<wchar_t>(cp);
};

std::wstring result;
result.reserve(str.size()); // Reserve assuming mostly ASCII

const unsigned char* data = reinterpret_cast<const unsigned char*>(str.data());
const size_t len = str.size();
size_t i = 0;

// Fast path for ASCII-only prefix (most common case)
while (i < len && data[i] <= 0x7F) {
result.push_back(static_cast<wchar_t>(data[i]));
++i;
}

// Handle remaining multi-byte sequences
while (i < len) {
wchar_t wc = decodeUtf8(data, i, len);
// Always push the decoded character (including 0xFFFD replacement characters)
// This correctly handles both legitimate 0xFFFD in input and invalid sequences
result.push_back(wc);
}

return result;
#endif
}

Expand Down
154 changes: 99 additions & 55 deletions mssql_python/pybind/unix_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@
#include <vector>

#if defined(__APPLE__) || defined(__linux__)

// Unicode constants for validation
// U+FFFD: the code point the conversion routines in this file substitute for
// invalid input (unpaired surrogates, out-of-range code points).
constexpr uint32_t kUnicodeReplacementChar = 0xFFFD;
// Upper bound used when checking that a code point is a valid Unicode value.
constexpr uint32_t kUnicodeMaxCodePoint = 0x10FFFF;

// Constants for character encoding
// NOTE(review): neither constant below is referenced by the current conversion
// code visible in this file; they may be used from other translation units - confirm.
const char* kOdbcEncoding = "utf-16-le"; // ODBC uses UTF-16LE for SQLWCHAR
const size_t kUcsLength = 2; // SQLWCHAR is 2 bytes on all platforms
Expand All @@ -24,74 +29,113 @@ std::wstring SQLWCHARToWString(const SQLWCHAR* sqlwStr, size_t length = SQL_NTS)
return std::wstring();
}

// Lambda that counts the SQLWCHAR code units preceding the first zero
// terminator (the terminator itself is not counted). The argument must point
// to a zero-terminated buffer.
auto calculateLength = [](const SQLWCHAR* str) -> size_t {
    size_t count = 0;
    while (str[count] != 0) {
        ++count;
    }
    return count;
};

if (length == SQL_NTS) {
// Determine length if not provided
size_t i = 0;
while (sqlwStr[i] != 0)
++i;
length = i;
length = calculateLength(sqlwStr);
}

// Create a UTF-16LE byte array from the SQLWCHAR array
std::vector<char> utf16Bytes(length * kUcsLength);
for (size_t i = 0; i < length; ++i) {
// Copy each SQLWCHAR (2 bytes) to the byte array
memcpy(&utf16Bytes[i * kUcsLength], &sqlwStr[i], kUcsLength);
if (length == 0) {
return std::wstring();
}

// Convert UTF-16LE to std::wstring (UTF-32 on macOS)
try {
// CRITICAL FIX: Use thread_local to make std::wstring_convert thread-safe
// std::wstring_convert is NOT thread-safe and its use is deprecated in C++17
// Each thread gets its own converter instance, eliminating race conditions
thread_local std::wstring_convert<
std::codecvt_utf8_utf16<wchar_t, 0x10ffff, std::little_endian>>
converter;

std::wstring result = converter.from_bytes(
reinterpret_cast<const char*>(utf16Bytes.data()),
reinterpret_cast<const char*>(utf16Bytes.data() + utf16Bytes.size()));
return result;
} catch (const std::exception& e) {
// Fallback to character-by-character conversion if codecvt fails
std::wstring result;
result.reserve(length);
for (size_t i = 0; i < length; ++i) {
result.push_back(static_cast<wchar_t>(sqlwStr[i]));
// Lambda that reports whether a UTF-16 code unit encodes a Basic Multilingual
// Plane character on its own, i.e. it is not a surrogate (0xD800-0xDFFF)
auto isBMP = [](uint16_t ch) -> bool {
    const bool isSurrogate = (ch >= 0xD800 && ch <= 0xDFFF);
    return !isSurrogate;
};

// Lambda that combines a UTF-16 high/low surrogate pair into the code point it
// encodes: the low 10 bits of each unit are concatenated (high first) and
// offset by 0x10000. The inputs are not validated here.
auto decodeSurrogatePair = [](uint16_t high, uint16_t low) -> uint32_t {
    const uint32_t upperBits = static_cast<uint32_t>(high & 0x3FF) << 10;
    const uint32_t lowerBits = static_cast<uint32_t>(low & 0x3FF);
    return 0x10000u + (upperBits | lowerBits);
};

// Convert UTF-16 to UTF-32 directly without intermediate buffer
std::wstring result;
result.reserve(length); // Reserve assuming most chars are BMP

size_t i = 0;
while (i < length) {
uint16_t utf16Char = static_cast<uint16_t>(sqlwStr[i]);

// Fast path: BMP character (most common - ~99% of strings)
if (isBMP(utf16Char)) {
result.push_back(static_cast<wchar_t>(utf16Char));
++i;
}
// Handle surrogate pairs for characters outside BMP
else if (utf16Char <= 0xDBFF) { // High surrogate
if (i + 1 < length) {
uint16_t lowSurrogate = static_cast<uint16_t>(sqlwStr[i + 1]);
if (lowSurrogate >= 0xDC00 && lowSurrogate <= 0xDFFF) {
uint32_t codePoint = decodeSurrogatePair(utf16Char, lowSurrogate);
result.push_back(static_cast<wchar_t>(codePoint));
i += 2;
continue;
}
}
// Invalid surrogate - replace with Unicode replacement character
result.push_back(static_cast<wchar_t>(kUnicodeReplacementChar));
++i;
} else { // Low surrogate without high - invalid, replace with replacement character
result.push_back(static_cast<wchar_t>(kUnicodeReplacementChar));
++i;
}
return result;
}
return result;
}

// Function to convert std::wstring to SQLWCHAR array on macOS
// THREAD-SAFE: Uses thread_local converter to avoid std::wstring_convert race conditions
// Function to convert std::wstring to SQLWCHAR array on macOS/Linux
// Converts UTF-32 (wstring on Unix) to UTF-16 (SQLWCHAR)
// Invalid Unicode scalars (surrogates, values > 0x10FFFF) are replaced with U+FFFD
std::vector<SQLWCHAR> WStringToSQLWCHAR(const std::wstring& str) {
try {
// CRITICAL FIX: Use thread_local to make std::wstring_convert thread-safe
// std::wstring_convert is NOT thread-safe and its use is deprecated in C++17
// Each thread gets its own converter instance, eliminating race conditions
thread_local std::wstring_convert<
std::codecvt_utf8_utf16<wchar_t, 0x10ffff, std::little_endian>>
converter;

std::string utf16Bytes = converter.to_bytes(str);

// Convert the bytes to SQLWCHAR array
std::vector<SQLWCHAR> result(utf16Bytes.size() / kUcsLength + 1,
0); // +1 for null terminator
for (size_t i = 0; i < utf16Bytes.size() / kUcsLength; ++i) {
memcpy(&result[i], &utf16Bytes[i * kUcsLength], kUcsLength);
if (str.empty()) {
return std::vector<SQLWCHAR>(1, 0); // Just null terminator
}

// Lambda to encode code point as surrogate pair and append to result
auto encodeSurrogatePair = [](std::vector<SQLWCHAR>& vec, uint32_t cp) {
cp -= 0x10000;
vec.push_back(static_cast<SQLWCHAR>(0xD800 | ((cp >> 10) & 0x3FF)));
vec.push_back(static_cast<SQLWCHAR>(0xDC00 | (cp & 0x3FF)));
};

// Lambda to check if code point is a valid Unicode scalar value
auto isValidUnicodeScalar = [](uint32_t cp) -> bool {
// Exclude surrogate range (0xD800-0xDFFF) and values beyond max Unicode
return cp <= kUnicodeMaxCodePoint && (cp < 0xD800 || cp > 0xDFFF);
};

// Convert wstring (UTF-32) to UTF-16
std::vector<SQLWCHAR> result;
result.reserve(str.size() + 1); // Most chars are BMP, so reserve exact size

for (wchar_t wc : str) {
uint32_t codePoint = static_cast<uint32_t>(wc);

// Validate code point first
if (!isValidUnicodeScalar(codePoint)) {
codePoint = kUnicodeReplacementChar;
}
return result;
} catch (const std::exception& e) {
// Fallback to simple casting if codecvt fails
std::vector<SQLWCHAR> result(str.size() + 1,
0); // +1 for null terminator
for (size_t i = 0; i < str.size(); ++i) {
result[i] = static_cast<SQLWCHAR>(str[i]);

// Fast path: BMP character (most common - ~99% of strings)
// After validation, codePoint cannot be in surrogate range (0xD800-0xDFFF)
if (codePoint <= 0xFFFF) {
result.push_back(static_cast<SQLWCHAR>(codePoint));
}
return result;
// Encode as surrogate pair for characters outside BMP
else if (codePoint <= kUnicodeMaxCodePoint) {
encodeSurrogatePair(result, codePoint);
}
// Note: Invalid code points (surrogates and > 0x10FFFF) already
// replaced with replacement character (0xFFFD) at validation above
}

result.push_back(0); // Null terminator
return result;
}

#endif
Loading
Loading