Skip to content

Commit c281fd3

Browse files
author
subrata-ms
committed
removing deprecated function from ddbc binding
1 parent 65d1224 commit c281fd3

File tree

1 file changed

+63
-2
lines changed

1 file changed

+63
-2
lines changed

mssql_python/pybind/ddbc_bindings.h

Lines changed: 63 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -458,8 +458,69 @@ inline std::wstring Utf8ToWString(const std::string& str) {
458458
return {};
459459
return result;
460460
#else
461-
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
462-
return converter.from_bytes(str);
461+
// Optimized UTF-8 to UTF-32 conversion (wstring on Unix)
462+
if (str.empty())
463+
return {};
464+
465+
// Lambda to decode UTF-8 multi-byte sequences
466+
// Decodes the single UTF-8 sequence that starts at data[i] and advances i past it.
//
//   data - buffer of UTF-8 encoded bytes
//   i    - in/out byte index; the caller must guarantee i < len on entry
//   len  - number of readable bytes in data
//
// Returns the decoded code point as a wchar_t. If the sequence is ill-formed
// (invalid lead byte, truncated input, a trailing byte that is not 10xxxxxx,
// an overlong encoding, a UTF-16 surrogate, or a value above U+10FFFF),
// exactly ONE byte is consumed and U+FFFD (REPLACEMENT CHARACTER) is returned.
// Consuming only one byte lets the caller resynchronise on the next byte, so a
// valid character that follows a broken lead byte is never swallowed.
//
// NOTE(review): the result is cast to wchar_t without range checks; this
// assumes wchar_t can hold any code point up to U+10FFFF on the platforms that
// compile this branch - confirm against the enclosing #if condition.
constexpr auto decodeUtf8 = [](const unsigned char* data, size_t& i, size_t len) -> wchar_t {
    constexpr uint32_t kReplacement = 0xFFFD;  // Unicode replacement character

    // A continuation (trailing) byte always has the bit pattern 10xxxxxx.
    constexpr auto isContinuation = [](unsigned char b) { return (b & 0xC0) == 0x80; };

    const unsigned char lead = data[i];
    uint32_t codePoint = kReplacement;
    size_t consumed = 1;  // Default for ill-formed input: skip only the lead byte

    if (lead <= 0x7F) {
        // 1-byte sequence (ASCII): 0xxxxxxx
        codePoint = lead;
    } else if ((lead & 0xE0) == 0xC0) {
        // 2-byte sequence: 110xxxxx 10xxxxxx
        if (i + 1 < len && isContinuation(data[i + 1])) {
            const uint32_t value = (static_cast<uint32_t>(lead & 0x1F) << 6) |
                                   static_cast<uint32_t>(data[i + 1] & 0x3F);
            // Values below 0x80 are overlong encodings (lead bytes 0xC0/0xC1).
            if (value >= 0x80) {
                codePoint = value;
                consumed = 2;
            }
        }
    } else if ((lead & 0xF0) == 0xE0) {
        // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
        if (i + 2 < len && isContinuation(data[i + 1]) && isContinuation(data[i + 2])) {
            const uint32_t value = (static_cast<uint32_t>(lead & 0x0F) << 12) |
                                   (static_cast<uint32_t>(data[i + 1] & 0x3F) << 6) |
                                   static_cast<uint32_t>(data[i + 2] & 0x3F);
            // Reject overlong forms (< 0x800) and UTF-16 surrogates
            // (U+D800..U+DFFF), which are not valid scalar values.
            const bool isSurrogate = value >= 0xD800 && value <= 0xDFFF;
            if (value >= 0x800 && !isSurrogate) {
                codePoint = value;
                consumed = 3;
            }
        }
    } else if ((lead & 0xF8) == 0xF0) {
        // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        if (i + 3 < len && isContinuation(data[i + 1]) && isContinuation(data[i + 2]) &&
            isContinuation(data[i + 3])) {
            const uint32_t value = (static_cast<uint32_t>(lead & 0x07) << 18) |
                                   (static_cast<uint32_t>(data[i + 1] & 0x3F) << 12) |
                                   (static_cast<uint32_t>(data[i + 2] & 0x3F) << 6) |
                                   static_cast<uint32_t>(data[i + 3] & 0x3F);
            // Reject overlong forms (< 0x10000) and values beyond the Unicode range.
            if (value >= 0x10000 && value <= 0x10FFFF) {
                codePoint = value;
                consumed = 4;
            }
        }
    }
    // Any other lead byte (stray continuation byte, 0xF8..0xFF) falls through
    // with the replacement character and a one-byte skip.

    i += consumed;
    return static_cast<wchar_t>(codePoint);
};
501+
502+
std::wstring result;
503+
result.reserve(str.size()); // Reserve assuming mostly ASCII
504+
505+
const unsigned char* data = reinterpret_cast<const unsigned char*>(str.data());
506+
const size_t len = str.size();
507+
size_t i = 0;
508+
509+
// Fast path for ASCII-only prefix (most common case)
510+
while (i < len && data[i] <= 0x7F) {
511+
result.push_back(static_cast<wchar_t>(data[i]));
512+
++i;
513+
}
514+
515+
// Handle remaining multi-byte sequences
516+
while (i < len) {
517+
wchar_t wc = decodeUtf8(data, i, len);
518+
if (wc != 0xFFFD || data[i - 1] >= 0x80) { // Skip invalid sequences
519+
result.push_back(wc);
520+
}
521+
}
522+
523+
return result;
463524
#endif
464525
}
465526

0 commit comments

Comments
 (0)