@@ -458,8 +458,69 @@ inline std::wstring Utf8ToWString(const std::string& str) {
458458 return {};
459459 return result;
460460#else
461- std::wstring_convert<std::codecvt_utf8_utf16<wchar_t >> converter;
462- return converter.from_bytes (str);
461+ // Optimized UTF-8 to UTF-32 conversion (wstring on Unix)
462+ if (str.empty ())
463+ return {};
464+
// Decodes one UTF-8 encoded code point starting at data[i].
//
// Parameters:
//   data - pointer to the UTF-8 bytes
//   i    - in/out index of the lead byte; the caller must guarantee i < len.
//          On return it has advanced past the consumed bytes.
//   len  - total number of bytes available in data
//
// Returns the decoded code point. For a malformed sequence it returns
// U+FFFD (replacement character) and advances i by exactly one byte, so
// decoding resynchronizes on the next byte and i always makes progress.
//
// A sequence is rejected when:
//   - the lead byte is not a valid 2/3/4-byte lead (stray continuation, 0xF8+)
//   - the input ends before the sequence is complete
//   - a continuation byte does not match 10xxxxxx
//   - the encoding is overlong (value fits in a shorter sequence)
//   - the value is a UTF-16 surrogate (U+D800..U+DFFF) or above U+10FFFF
//   - the value cannot be represented in a single wchar_t
constexpr auto decodeUtf8 = [](const unsigned char* data, size_t& i, size_t len) -> wchar_t {
    constexpr wchar_t kReplacement = static_cast<wchar_t>(0xFFFD);
    const unsigned char lead = data[i];

    // 1-byte sequence (ASCII): 0xxxxxxx
    if (lead <= 0x7F) {
        ++i;
        return static_cast<wchar_t>(lead);
    }

    // Classify the lead byte: number of continuation bytes, payload bits of
    // the lead byte, and the smallest code point that legitimately needs
    // this sequence length (used to reject overlong encodings).
    size_t extra = 0;
    uint32_t cp = 0;
    uint32_t minCp = 0;
    if ((lead & 0xE0) == 0xC0) {          // 110xxxxx 10xxxxxx
        extra = 1;
        cp = lead & 0x1Fu;
        minCp = 0x80;
    } else if ((lead & 0xF0) == 0xE0) {   // 1110xxxx 10xxxxxx 10xxxxxx
        extra = 2;
        cp = lead & 0x0Fu;
        minCp = 0x800;
    } else if ((lead & 0xF8) == 0xF0) {   // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        extra = 3;
        cp = lead & 0x07u;
        minCp = 0x10000;
    }

    // The whole sequence must fit in the remaining input (i < len holds).
    bool valid = extra != 0 && (len - i) > extra;

    // Every trailing byte must be a continuation byte (10xxxxxx); without
    // this check a following ASCII byte would be swallowed into the result.
    for (size_t k = 1; valid && k <= extra; ++k) {
        const unsigned char cont = data[i + k];
        valid = (cont & 0xC0) == 0x80;
        cp = (cp << 6) | (cont & 0x3Fu);
    }

    valid = valid
        && cp >= minCp                          // not overlong
        && cp <= 0x10FFFF                       // inside the Unicode range
        && (cp < 0xD800 || cp > 0xDFFF)         // not a surrogate
        && (sizeof(wchar_t) >= 4 || cp <= 0xFFFF);  // fits in one wchar_t

    if (!valid) {
        // Invalid sequence - skip only the offending byte
        ++i;
        return kReplacement;
    }

    i += extra + 1;
    return static_cast<wchar_t>(cp);
};
501+
502+ std::wstring result;
503+ result.reserve (str.size ()); // Reserve assuming mostly ASCII
504+
505+ const unsigned char * data = reinterpret_cast <const unsigned char *>(str.data ());
506+ const size_t len = str.size ();
507+ size_t i = 0 ;
508+
509+ // Fast path for ASCII-only prefix (most common case)
510+ while (i < len && data[i] <= 0x7F ) {
511+ result.push_back (static_cast <wchar_t >(data[i]));
512+ ++i;
513+ }
514+
515+ // Handle remaining multi-byte sequences
516+ while (i < len) {
517+ wchar_t wc = decodeUtf8 (data, i, len);
518+ if (wc != 0xFFFD || data[i - 1 ] >= 0x80 ) { // Skip invalid sequences
519+ result.push_back (wc);
520+ }
521+ }
522+
523+ return result;
463524#endif
464525}
465526
0 commit comments