Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
1dcf9ce
unix utility function fixes
Dec 5, 2025
00260d9
formatting fix
Dec 8, 2025
17b64fc
format fix
Dec 8, 2025
65d1224
Format fix
Dec 8, 2025
c281fd3
removing deprecated function from ddbc binding
Dec 8, 2025
8850b21
linting fix for ddbc binding
Dec 8, 2025
9ff1de0
comprehensive test cases for UTF-8 conversion
Dec 8, 2025
9c1d92a
resolving co-pilot review comment
Dec 9, 2025
1263895
Merge branch 'main' into subrata-ms/DepricatedFixLinux
subrata-ms Dec 9, 2025
6c59791
pipeline versioning fix
Dec 9, 2025
0eecf67
Code coverage for ddbc_bindings.h
Dec 9, 2025
419b024
cross platform failure fix
subrata-ms Dec 9, 2025
ac56363
unicode char fix for windows
Dec 9, 2025
d695289
Fix Windows CI encoding issue - simplify safe_print to use ASCII dire…
Dec 9, 2025
76d6828
unicode fix for strict assert
subrata-ms Dec 9, 2025
1ca9f73
Merge: Resolve conflict by keeping flexible cross-platform UTF-8 test…
subrata-ms Dec 9, 2025
a4e87a4
fixing test error
subrata-ms Dec 9, 2025
aff37ca
linting fix for test_002_types
subrata-ms Dec 9, 2025
75f374b
skip test for failed scenario
subrata-ms Dec 9, 2025
d03055a
fixing skip test1
subrata-ms Dec 9, 2025
ae6c021
fixing skip test1
subrata-ms Dec 9, 2025
c52fbc6
removing print statement from the test
subrata-ms Dec 10, 2025
59b89c4
cleaning up unnecessary print
subrata-ms Dec 10, 2025
de2791d
improving test coverage
subrata-ms Dec 10, 2025
a7d8697
test coverage for ddbc binding
subrata-ms Dec 10, 2025
be4b70e
fixing the linting issue
subrata-ms Dec 10, 2025
ab15ef9
Merge branch 'main' into subrata-ms/DepricatedFixLinux
subrata-ms Dec 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 61 additions & 2 deletions mssql_python/pybind/ddbc_bindings.h
Original file line number Diff line number Diff line change
Expand Up @@ -458,8 +458,67 @@ inline std::wstring Utf8ToWString(const std::string& str) {
return {};
return result;
#else
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
return converter.from_bytes(str);
// Optimized UTF-8 to UTF-32 conversion (wstring on Unix)
if (str.empty())
return {};

// Lambda to decode UTF-8 multi-byte sequences
constexpr auto decodeUtf8 = [](const unsigned char* data, size_t& i, size_t len) -> wchar_t {
unsigned char byte = data[i];

// 1-byte sequence (ASCII): 0xxxxxxx
if (byte <= 0x7F) {
++i;
return static_cast<wchar_t>(byte);
}
// 2-byte sequence: 110xxxxx 10xxxxxx
if ((byte & 0xE0) == 0xC0 && i + 1 < len) {
uint32_t cp = ((static_cast<uint32_t>(byte & 0x1F) << 6) | (data[i + 1] & 0x3F));
i += 2;
return static_cast<wchar_t>(cp);
}
// 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
if ((byte & 0xF0) == 0xE0 && i + 2 < len) {
uint32_t cp = ((static_cast<uint32_t>(byte & 0x0F) << 12) |
((data[i + 1] & 0x3F) << 6) | (data[i + 2] & 0x3F));
i += 3;
return static_cast<wchar_t>(cp);
}
// 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
if ((byte & 0xF8) == 0xF0 && i + 3 < len) {
uint32_t cp =
((static_cast<uint32_t>(byte & 0x07) << 18) | ((data[i + 1] & 0x3F) << 12) |
((data[i + 2] & 0x3F) << 6) | (data[i + 3] & 0x3F));
i += 4;
return static_cast<wchar_t>(cp);
}
// Invalid sequence - skip byte
++i;
return 0xFFFD; // Unicode replacement character
};

std::wstring result;
result.reserve(str.size()); // Reserve assuming mostly ASCII

const unsigned char* data = reinterpret_cast<const unsigned char*>(str.data());
const size_t len = str.size();
size_t i = 0;

// Fast path for ASCII-only prefix (most common case)
while (i < len && data[i] <= 0x7F) {
result.push_back(static_cast<wchar_t>(data[i]));
++i;
}

// Handle remaining multi-byte sequences
while (i < len) {
wchar_t wc = decodeUtf8(data, i, len);
if (wc != 0xFFFD || data[i - 1] >= 0x80) { // Skip invalid sequences
result.push_back(wc);
}
}

return result;
#endif
}

Expand Down
130 changes: 83 additions & 47 deletions mssql_python/pybind/unix_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,75 +13,111 @@
#include <vector>

#if defined(__APPLE__) || defined(__linux__)

// Character-encoding constants
const char* kOdbcEncoding{"utf-16-le"};  // Codec name: ODBC uses UTF-16LE for SQLWCHAR
const size_t kUcsLength{2};              // Bytes per SQLWCHAR (2 on all platforms)

// Function to convert SQLWCHAR strings to std::wstring on macOS
// Function to convert SQLWCHAR strings to std::wstring on macOS/Linux
// Optimized version: direct conversion without intermediate buffer
std::wstring SQLWCHARToWString(const SQLWCHAR* sqlwStr, size_t length = SQL_NTS) {
if (!sqlwStr) {
return std::wstring();
}

// Lambda to calculate string length using pointer arithmetic
auto calculateLength = [](const SQLWCHAR* str) -> size_t {
const SQLWCHAR* p = str;
while (*p)
++p;
return p - str;
};

if (length == SQL_NTS) {
// Determine length if not provided
size_t i = 0;
while (sqlwStr[i] != 0)
++i;
length = i;
length = calculateLength(sqlwStr);
}

// Create a UTF-16LE byte array from the SQLWCHAR array
std::vector<char> utf16Bytes(length * kUcsLength);
for (size_t i = 0; i < length; ++i) {
// Copy each SQLWCHAR (2 bytes) to the byte array
memcpy(&utf16Bytes[i * kUcsLength], &sqlwStr[i], kUcsLength);
if (length == 0) {
return std::wstring();
}

// Convert UTF-16LE to std::wstring (UTF-32 on macOS)
try {
// Use C++11 codecvt to convert between UTF-16LE and wstring
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t, 0x10ffff, std::little_endian>>
converter;
std::wstring result = converter.from_bytes(
reinterpret_cast<const char*>(utf16Bytes.data()),
reinterpret_cast<const char*>(utf16Bytes.data() + utf16Bytes.size()));
return result;
} catch (const std::exception& e) {
// Fallback to character-by-character conversion if codecvt fails
std::wstring result;
result.reserve(length);
for (size_t i = 0; i < length; ++i) {
result.push_back(static_cast<wchar_t>(sqlwStr[i]));
// Lambda to check if character is in Basic Multilingual Plane
auto isBMP = [](uint16_t ch) { return ch < 0xD800 || ch > 0xDFFF; };

// Lambda to decode surrogate pair into code point
auto decodeSurrogatePair = [](uint16_t high, uint16_t low) -> uint32_t {
return 0x10000 + (static_cast<uint32_t>(high & 0x3FF) << 10) + (low & 0x3FF);
};

// Convert UTF-16 to UTF-32 directly without intermediate buffer
std::wstring result;
result.reserve(length); // Reserve assuming most chars are BMP

size_t i = 0;
while (i < length) {
uint16_t utf16Char = static_cast<uint16_t>(sqlwStr[i]);

// Fast path: BMP character (most common - ~99% of strings)
if (isBMP(utf16Char)) {
result.push_back(static_cast<wchar_t>(utf16Char));
++i;
}
// Handle surrogate pairs for characters outside BMP
else if (utf16Char <= 0xDBFF) { // High surrogate
if (i + 1 < length) {
uint16_t lowSurrogate = static_cast<uint16_t>(sqlwStr[i + 1]);
if (lowSurrogate >= 0xDC00 && lowSurrogate <= 0xDFFF) {
uint32_t codePoint = decodeSurrogatePair(utf16Char, lowSurrogate);
result.push_back(static_cast<wchar_t>(codePoint));
i += 2;
continue;
}
}
// Invalid surrogate - push as-is
result.push_back(static_cast<wchar_t>(utf16Char));
++i;
} else { // Low surrogate without high - invalid but push as-is
result.push_back(static_cast<wchar_t>(utf16Char));
++i;
}
return result;
}
return result;
}

// Converts a std::wstring (UTF-32) to a null-terminated UTF-16 SQLWCHAR array
// on macOS/Linux.
//
// The encoding is done directly, without the deprecated
// std::wstring_convert / std::codecvt_utf8_utf16 facilities.
//
// Parameters:
//   str - the wide string to encode; each wchar_t is treated as one code point.
//
// Returns:
//   The UTF-16 code units followed by a single 0 terminator. An empty input
//   yields a vector holding only the terminator. Values up to 0xFFFF are
//   emitted as one code unit, values up to 0x10FFFF as a surrogate pair, and
//   anything larger is silently skipped.
std::vector<SQLWCHAR> WStringToSQLWCHAR(const std::wstring& str) {
    std::vector<SQLWCHAR> result;
    result.reserve(str.size() + 1);  // Exact when every character is in the BMP

    for (const wchar_t wc : str) {
        const uint32_t codePoint = static_cast<uint32_t>(wc);

        if (codePoint <= 0xFFFF) {
            // Fits in a single UTF-16 code unit.
            result.push_back(static_cast<SQLWCHAR>(codePoint));
        } else if (codePoint <= 0x10FFFF) {
            // Outside the BMP: split the 20-bit offset into a high and a low
            // surrogate carrying 10 bits each.
            const uint32_t offset = codePoint - 0x10000;
            result.push_back(static_cast<SQLWCHAR>(0xD800 | ((offset >> 10) & 0x3FF)));
            result.push_back(static_cast<SQLWCHAR>(0xDC00 | (offset & 0x3FF)));
        }
        // Values above 0x10FFFF are not valid code points and are skipped.
    }

    result.push_back(0);  // Null terminator
    return result;
}

#endif
Loading
Loading