From 2f2aaae4e4fc1c33d6a7db316fcc0cac4df7af3f Mon Sep 17 00:00:00 2001 From: Adeel Mujahid <3840695+am11@users.noreply.github.com> Date: Thu, 22 Jun 2023 16:30:16 +0300 Subject: [PATCH] Share UTF8 converters between coreclr and mono (#85558) * Share UTF8 converters between coreclr and mono - v1 * Revert "Share UTF8 converters between coreclr and mono - v1" This reverts commit f9845ac6f53dc95fb747eb21351dfa9412397217. * Share UTF8 converters between coreclr and mono - v2 * Remove C++ runtime dependency * Initial C++ to C conversion * Delete unused macros * Fix custom alloc in mono * Error on invalid sequences when caller requested * Remove count from convert APIs --- src/coreclr/inc/utilcode.h | 179 -- src/coreclr/pal/src/CMakeLists.txt | 2 +- src/coreclr/pal/src/include/pal/utf8.h | 52 - src/coreclr/pal/src/locale/unicode.cpp | 31 +- src/coreclr/pal/src/locale/utf8.cpp | 2937 ------------------------ src/coreclr/vm/rtlfunctions.cpp | 2 +- src/mono/mono/eglib/CMakeLists.txt | 9 +- src/mono/mono/eglib/giconv.c | 559 +---- src/mono/mono/eglib/glib.h | 4 - src/mono/mono/eglib/test/utf8.c | 6 +- src/mono/mono/metadata/object.c | 4 +- src/mono/mono/mini/CMakeLists.txt | 2 +- src/native/minipal/utf8.c | 2149 +++++++++++++++++ src/native/minipal/utf8.h | 75 + 14 files changed, 2378 insertions(+), 3633 deletions(-) delete mode 100644 src/coreclr/pal/src/include/pal/utf8.h delete mode 100644 src/coreclr/pal/src/locale/utf8.cpp create mode 100644 src/native/minipal/utf8.c create mode 100644 src/native/minipal/utf8.h diff --git a/src/coreclr/inc/utilcode.h b/src/coreclr/inc/utilcode.h index a332a6ccd6692..bc84e71644c9d 100644 --- a/src/coreclr/inc/utilcode.h +++ b/src/coreclr/inc/utilcode.h @@ -185,15 +185,6 @@ typedef LPSTR LPUTF8; // given and ANSI String, copy it into a wide buffer. // be careful about scoping when using this macro! // -// how to use the below two macros: -// -// ... 
-// LPSTR pszA; -// pszA = MyGetAnsiStringRoutine(); -// MAKE_WIDEPTR_FROMANSI(pwsz, pszA); -// MyUseWideStringRoutine(pwsz); -// ... -// // similarily for MAKE_ANSIPTR_FROMWIDE. note that the first param does not // have to be declared, and no clean up must be done. // @@ -211,25 +202,6 @@ typedef LPSTR LPUTF8; #define MAKE_TRANSLATIONFAILED ThrowWin32(ERROR_NO_UNICODE_TRANSLATION) #endif -// This version throws on conversion errors (ie, no best fit character -// mapping to characters that look similar, and no use of the default char -// ('?') when printing out unrepresentable characters. Use this method for -// most development in the EE, especially anything like metadata or class -// names. See the BESTFIT version if you're printing out info to the console. -#define MAKE_MULTIBYTE_FROMWIDE(ptrname, widestr, codepage) \ - int __l##ptrname = (int)u16_strlen(widestr); \ - if (__l##ptrname > MAKE_MAX_LENGTH) \ - MAKE_TOOLONGACTION; \ - __l##ptrname = (int)((__l##ptrname + 1) * 2 * sizeof(char)); \ - CQuickBytes __CQuickBytes##ptrname; \ - __CQuickBytes##ptrname.AllocThrows(__l##ptrname); \ - BOOL __b##ptrname; \ - DWORD __cBytes##ptrname = WszWideCharToMultiByte(codepage, WC_NO_BEST_FIT_CHARS, widestr, -1, (LPSTR)__CQuickBytes##ptrname.Ptr(), __l##ptrname, NULL, &__b##ptrname); \ - if (__b##ptrname || (__cBytes##ptrname == 0 && (widestr[0] != W('\0')))) { \ - MAKE_TRANSLATIONFAILED; \ - } \ - LPSTR ptrname = (LPSTR)__CQuickBytes##ptrname.Ptr() - // This version does best fit character mapping and also allows the use // of the default char ('?') for any Unicode character that isn't // representable. This is reasonable for writing to the console, but @@ -247,40 +219,6 @@ typedef LPSTR LPUTF8; } \ LPSTR ptrname = (LPSTR)__CQuickBytes##ptrname.Ptr() -// Use for anything critical other than output to console, where weird -// character mappings are unacceptable. 
-#define MAKE_ANSIPTR_FROMWIDE(ptrname, widestr) MAKE_MULTIBYTE_FROMWIDE(ptrname, widestr, CP_ACP) - -// Use for output to the console. -#define MAKE_ANSIPTR_FROMWIDE_BESTFIT(ptrname, widestr) MAKE_MULTIBYTE_FROMWIDE_BESTFIT(ptrname, widestr, CP_ACP) - -#define MAKE_WIDEPTR_FROMANSI(ptrname, ansistr) \ - CQuickBytes __qb##ptrname; \ - int __l##ptrname; \ - __l##ptrname = WszMultiByteToWideChar(CP_ACP, 0, ansistr, -1, 0, 0); \ - if (__l##ptrname > MAKE_MAX_LENGTH) \ - MAKE_TOOLONGACTION; \ - LPWSTR ptrname = (LPWSTR) __qb##ptrname.AllocThrows((__l##ptrname+1)*sizeof(WCHAR)); \ - if (WszMultiByteToWideChar(CP_ACP, MB_ERR_INVALID_CHARS, ansistr, -1, ptrname, __l##ptrname) == 0) { \ - MAKE_TRANSLATIONFAILED; \ - } - -#define MAKE_WIDEPTR_FROMANSI_NOTHROW(ptrname, ansistr) \ - CQuickBytes __qb##ptrname; \ - LPWSTR ptrname = 0; \ - int __l##ptrname; \ - __l##ptrname = WszMultiByteToWideChar(CP_ACP, 0, ansistr, -1, 0, 0); \ - if (__l##ptrname <= MAKE_MAX_LENGTH) { \ - ptrname = (LPWSTR) __qb##ptrname.AllocNoThrow((__l##ptrname+1)*sizeof(WCHAR)); \ - if (ptrname) { \ - if (WszMultiByteToWideChar(CP_ACP, MB_ERR_INVALID_CHARS, ansistr, -1, ptrname, __l##ptrname) != 0) { \ - ptrname[__l##ptrname] = 0; \ - } else { \ - ptrname = 0; \ - } \ - } \ - } - #define MAKE_UTF8PTR_FROMWIDE(ptrname, widestr) CQuickBytes _##ptrname; _##ptrname.ConvertUnicode_Utf8(widestr); LPSTR ptrname = (LPSTR) _##ptrname.Ptr(); #define MAKE_UTF8PTR_FROMWIDE_NOTHROW(ptrname, widestr) \ @@ -312,22 +250,8 @@ typedef LPSTR LPUTF8; } \ } \ -#define MAKE_WIDEPTR_FROMUTF8N(ptrname, utf8str, n8chrs) \ - CQuickBytes __qb##ptrname; \ - int __l##ptrname; \ - __l##ptrname = WszMultiByteToWideChar(CP_UTF8, 0, utf8str, n8chrs, 0, 0); \ - if (__l##ptrname > MAKE_MAX_LENGTH) \ - MAKE_TOOLONGACTION; \ - LPWSTR ptrname = (LPWSTR) __qb##ptrname .AllocThrows((__l##ptrname+1)*sizeof(WCHAR)); \ - if (0==WszMultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, utf8str, n8chrs, ptrname, __l##ptrname)) { \ - 
MAKE_TRANSLATIONFAILED; \ - } \ - ptrname[__l##ptrname] = 0; - - #define MAKE_WIDEPTR_FROMUTF8(ptrname, utf8str) CQuickBytes _##ptrname; _##ptrname.ConvertUtf8_Unicode(utf8str); LPCWSTR ptrname = (LPCWSTR) _##ptrname.Ptr(); - #define MAKE_WIDEPTR_FROMUTF8N_NOTHROW(ptrname, utf8str, n8chrs) \ CQuickBytes __qb##ptrname; \ int __l##ptrname; \ @@ -346,42 +270,10 @@ typedef LPSTR LPUTF8; #define MAKE_WIDEPTR_FROMUTF8_NOTHROW(ptrname, utf8str) MAKE_WIDEPTR_FROMUTF8N_NOTHROW(ptrname, utf8str, -1) -// This method takes the number of characters -#define MAKE_MULTIBYTE_FROMWIDEN(ptrname, widestr, _nCharacters, _pCnt, codepage) \ - CQuickBytes __qb##ptrname; \ - int __l##ptrname; \ - __l##ptrname = WszWideCharToMultiByte(codepage, WC_NO_BEST_FIT_CHARS, widestr, _nCharacters, NULL, 0, NULL, NULL); \ - if (__l##ptrname > MAKE_MAX_LENGTH) \ - MAKE_TOOLONGACTION; \ - ptrname = (LPUTF8) __qb##ptrname .AllocThrows(__l##ptrname+1); \ - BOOL __b##ptrname; \ - DWORD _pCnt = WszWideCharToMultiByte(codepage, WC_NO_BEST_FIT_CHARS, widestr, _nCharacters, ptrname, __l##ptrname, NULL, &__b##ptrname); \ - if (__b##ptrname || (_pCnt == 0 && _nCharacters > 0)) { \ - MAKE_TRANSLATIONFAILED; \ - } \ - ptrname[__l##ptrname] = 0; - -#define MAKE_MULTIBYTE_FROMWIDEN_BESTFIT(ptrname, widestr, _nCharacters, _pCnt, codepage) \ - CQuickBytes __qb##ptrname; \ - int __l##ptrname; \ - __l##ptrname = WszWideCharToMultiByte(codepage, 0, widestr, _nCharacters, NULL, 0, NULL, NULL); \ - if (__l##ptrname > MAKE_MAX_LENGTH) \ - MAKE_TOOLONGACTION; \ - ptrname = (LPUTF8) __qb##ptrname .AllocThrows(__l##ptrname+1); \ - DWORD _pCnt = WszWideCharToMultiByte(codepage, 0, widestr, _nCharacters, ptrname, __l##ptrname, NULL, NULL); \ - if (_pCnt == 0 && _nCharacters > 0) { \ - MAKE_TRANSLATIONFAILED; \ - } \ - ptrname[__l##ptrname] = 0; - -#define MAKE_ANSIPTR_FROMWIDEN(ptrname, widestr, _nCharacters, _pCnt) \ - MAKE_MULTIBYTE_FROMWIDEN(ptrname, widestr, _nCharacters, _pCnt, CP_ACP) - const SIZE_T 
MaxSigned32BitDecString = ARRAY_SIZE("-2147483648") - 1; const SIZE_T MaxUnsigned32BitDecString = ARRAY_SIZE("4294967295") - 1; const SIZE_T MaxIntegerDecHexString = ARRAY_SIZE("-9223372036854775808") - 1; -const SIZE_T Max16BitHexString = ARRAY_SIZE("1234") - 1; const SIZE_T Max32BitHexString = ARRAY_SIZE("12345678") - 1; const SIZE_T Max64BitHexString = ARRAY_SIZE("1234567812345678") - 1; @@ -410,77 +302,6 @@ inline WCHAR* FormatInteger(WCHAR* str, size_t strCount, const char* fmt, I v) return str; } -inline -LPWSTR DuplicateString( - LPCWSTR wszString, - size_t cchString) -{ - STATIC_CONTRACT_NOTHROW; - - LPWSTR wszDup = NULL; - if (wszString != NULL) - { - wszDup = new (nothrow) WCHAR[cchString + 1]; - if (wszDup != NULL) - { - wcscpy_s(wszDup, cchString + 1, wszString); - } - } - return wszDup; -} - -inline -LPWSTR DuplicateString( - LPCWSTR wszString) -{ - STATIC_CONTRACT_NOTHROW; - - if (wszString != NULL) - { - return DuplicateString(wszString, u16_strlen(wszString)); - } - else - { - return NULL; - } -} - -void DECLSPEC_NORETURN ThrowOutOfMemory(); - -inline -LPWSTR DuplicateStringThrowing( - LPCWSTR wszString, - size_t cchString) -{ - STATIC_CONTRACT_THROWS; - - if (wszString == NULL) - return NULL; - - LPWSTR wszDup = DuplicateString(wszString, cchString); - if (wszDup == NULL) - ThrowOutOfMemory(); - - return wszDup; -} - -inline -LPWSTR DuplicateStringThrowing( - LPCWSTR wszString) -{ - STATIC_CONTRACT_THROWS; - - if (wszString == NULL) - return NULL; - - LPWSTR wszDup = DuplicateString(wszString); - if (wszDup == NULL) - ThrowOutOfMemory(); - - return wszDup; -} - - //***************************************************************************** // Placement new is used to new and object at an exact location. The pointer // is simply returned to the caller without actually using the heap. 
The diff --git a/src/coreclr/pal/src/CMakeLists.txt b/src/coreclr/pal/src/CMakeLists.txt index 4ec2c2b5783b9..804a71234031d 100644 --- a/src/coreclr/pal/src/CMakeLists.txt +++ b/src/coreclr/pal/src/CMakeLists.txt @@ -152,7 +152,7 @@ set(SOURCES loader/module.cpp locale/unicode.cpp locale/unicodedata.cpp - locale/utf8.cpp + ${CLR_SRC_NATIVE_DIR}/minipal/utf8.c map/common.cpp map/map.cpp map/virtual.cpp diff --git a/src/coreclr/pal/src/include/pal/utf8.h b/src/coreclr/pal/src/include/pal/utf8.h deleted file mode 100644 index fa417c0a021f7..0000000000000 --- a/src/coreclr/pal/src/include/pal/utf8.h +++ /dev/null @@ -1,52 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -/*++ - - - -Module Name: - - include/pal/utf8.h - -Abstract: - Header file for UTF-8 conversion functions. - -Revision History: - - - ---*/ - -#ifndef _PAL_UTF8_H_ -#define _PAL_UTF8_H_ - -#include /* for WCHAR */ - -#ifdef __cplusplus -extern "C" -{ -#endif // __cplusplus - -/*++ -Function : - UTF8ToUnicode - - Convert a string from UTF-8 to UTF-16 (UCS-2) ---*/ -int UTF8ToUnicode(LPCSTR lpSrcStr, int cchSrc, LPWSTR lpDestStr, int cchDest, DWORD dwFlags); - - -/*++ -Function : - UnicodeToUTF8 - - Convert a string from UTF-16 (UCS-2) to UTF-8 ---*/ -int UnicodeToUTF8(LPCWSTR lpSrcStr, int cchSrc, LPSTR lpDestStr, int cchDest); - -#ifdef __cplusplus -} -#endif // __cplusplus - -#endif /* _PAL_UTF8_H_ */ diff --git a/src/coreclr/pal/src/locale/unicode.cpp b/src/coreclr/pal/src/locale/unicode.cpp index f29eabc07d9be..8bfa58608e594 100644 --- a/src/coreclr/pal/src/locale/unicode.cpp +++ b/src/coreclr/pal/src/locale/unicode.cpp @@ -24,7 +24,7 @@ Revision History: #include "pal/palinternal.h" #include "pal/dbgmsg.h" #include "pal/file.h" -#include "pal/utf8.h" +#include #include "pal/cruntime.h" #include "pal/stackstring.hpp" #include "pal/unicodedata.h" @@ -253,16 +253,20 @@ MultiByteToWideChar( goto EXIT; } 
- // Use UTF8ToUnicode on all systems, since it replaces - // invalid characters and Core Foundation doesn't do that. if (CodePage == CP_UTF8 || CodePage == CP_ACP) { - if (cbMultiByte <= -1) + if (cbMultiByte < 0) + cbMultiByte = strlen(lpMultiByteStr) + 1; + + if (!lpWideCharStr || cchWideChar == 0) + retval = minipal_get_length_utf8_to_utf16(lpMultiByteStr, cbMultiByte, dwFlags); + + if (lpWideCharStr) { - cbMultiByte = strlen(lpMultiByteStr) + 1; + if (cchWideChar == 0) cchWideChar = retval; + retval = minipal_convert_utf8_to_utf16(lpMultiByteStr, cbMultiByte, (CHAR16_T*)lpWideCharStr, cchWideChar, dwFlags); } - retval = UTF8ToUnicode(lpMultiByteStr, cbMultiByte, lpWideCharStr, cchWideChar, dwFlags); goto EXIT; } @@ -338,15 +342,20 @@ WideCharToMultiByte( defaultChar = *lpDefaultChar; } - // Use UnicodeToUTF8 on all systems because we use - // UTF8ToUnicode in MultiByteToWideChar() on all systems. if (CodePage == CP_UTF8 || CodePage == CP_ACP) { - if (cchWideChar == -1) - { + if (cchWideChar < 0) cchWideChar = PAL_wcslen(lpWideCharStr) + 1; + + if (!lpMultiByteStr || cbMultiByte == 0) + retval = minipal_get_length_utf16_to_utf8((CHAR16_T*)lpWideCharStr, cchWideChar, dwFlags); + + if (lpMultiByteStr) + { + if (cbMultiByte == 0) cbMultiByte = retval; + retval = minipal_convert_utf16_to_utf8((CHAR16_T*)lpWideCharStr, cchWideChar, lpMultiByteStr, cbMultiByte, dwFlags); } - retval = UnicodeToUTF8(lpWideCharStr, cchWideChar, lpMultiByteStr, cbMultiByte); + goto EXIT; } diff --git a/src/coreclr/pal/src/locale/utf8.cpp b/src/coreclr/pal/src/locale/utf8.cpp deleted file mode 100644 index f07c69ff7e15f..0000000000000 --- a/src/coreclr/pal/src/locale/utf8.cpp +++ /dev/null @@ -1,2937 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -/*++ - -Module Name: - - unicode/utf8.c - -Abstract: - Functions to encode and decode UTF-8 strings. 
This is a port of the C# version from Utf8Encoding.cs. - -Revision History: - ---*/ - -#include "pal/utf8.h" -#include "pal/malloc.hpp" - -using namespace CorUnix; - -#define FASTLOOP - -struct CharUnicodeInfo -{ - static const WCHAR HIGH_SURROGATE_START = 0xd800; - static const WCHAR HIGH_SURROGATE_END = 0xdbff; - static const WCHAR LOW_SURROGATE_START = 0xdc00; - static const WCHAR LOW_SURROGATE_END = 0xdfff; -}; - -struct Char -{ - // Test if the wide character is a high surrogate - static bool IsHighSurrogate(const WCHAR c) - { - return (c & 0xFC00) == CharUnicodeInfo::HIGH_SURROGATE_START; - } - - // Test if the wide character is a low surrogate - static bool IsLowSurrogate(const WCHAR c) - { - return (c & 0xFC00) == CharUnicodeInfo::LOW_SURROGATE_START; - } - - // Test if the wide character is a surrogate half - static bool IsSurrogate(const WCHAR c) - { - return (c & 0xF800) == CharUnicodeInfo::HIGH_SURROGATE_START; - } - - // Test if the wide character is a high surrogate - static bool IsHighSurrogate(const WCHAR* s, int index) - { - return IsHighSurrogate(s[index]); - } - - // Test if the wide character is a low surrogate - static bool IsLowSurrogate(const WCHAR* s, int index) - { - return IsLowSurrogate(s[index]); - } - - // Test if the wide character is a surrogate half - static bool IsSurrogate(const WCHAR* s, int index) - { - return IsSurrogate(s[index]); - } -}; - -class ArgumentException -{ - -public: - ArgumentException(LPCSTR message) - { - } - - ArgumentException(LPCSTR message, LPCSTR argName) - { - } -}; - -class ArgumentNullException : public ArgumentException -{ -public: - ArgumentNullException(LPCSTR argName) - : ArgumentException("Argument is NULL", argName) - { - - } -}; - -class ArgumentOutOfRangeException : public ArgumentException -{ -public: - ArgumentOutOfRangeException(LPCSTR argName, LPCSTR message) - : ArgumentException(message, argName) - { - - } -}; - -class InsufficientBufferException : public ArgumentException -{ -public: - 
InsufficientBufferException(LPCSTR message, LPCSTR argName) - : ArgumentException(message, argName) - { - - } -}; - -class Contract -{ -public: - static void Assert(bool cond, LPCSTR str) - { - if (!cond) - { - throw ArgumentException(str); - } - } - - static void EndContractBlock() - { - } -}; - -class DecoderFallbackException : public ArgumentException -{ - BYTE *bytesUnknown; - int index; - -public: - DecoderFallbackException( - LPCSTR message, BYTE bytesUnknown[], int index) : ArgumentException(message) - { - this->bytesUnknown = bytesUnknown; - this->index = index; - } - - BYTE *BytesUnknown() - { - return (bytesUnknown); - } - - int GetIndex() - { - return index; - } -}; - -class DecoderFallbackBuffer; - -class DecoderFallback -{ -public: - - // Fallback - // - // Return the appropriate unicode string alternative to the character that need to fall back. - - virtual DecoderFallbackBuffer* CreateFallbackBuffer() = 0; - - // Maximum number of characters that this instance of this fallback could return - - virtual int GetMaxCharCount() = 0; -}; - -class DecoderReplacementFallback : public DecoderFallback -{ - // Our variables - WCHAR strDefault[2]; - int strDefaultLength; - -public: - // Construction. Default replacement fallback uses no best fit and ? replacement string - DecoderReplacementFallback() : DecoderReplacementFallback(W("?")) - { - } - - DecoderReplacementFallback(const WCHAR* replacement) - { - // Must not be null - if (replacement == nullptr) - throw ArgumentNullException("replacement"); - Contract::EndContractBlock(); - - // Make sure it doesn't have bad surrogate pairs - bool bFoundHigh = false; - int replacementLength = PAL_wcslen((const WCHAR *)replacement); - for (int i = 0; i < replacementLength; i++) - { - // Found a surrogate? - if (Char::IsSurrogate(replacement, i)) - { - // High or Low? 
- if (Char::IsHighSurrogate(replacement, i)) - { - // if already had a high one, stop - if (bFoundHigh) - break; // break & throw at the bFoundHIgh below - bFoundHigh = true; - } - else - { - // Low, did we have a high? - if (!bFoundHigh) - { - // Didn't have one, make if fail when we stop - bFoundHigh = true; - break; - } - - // Clear flag - bFoundHigh = false; - } - } - // If last was high we're in trouble (not surrogate so not low surrogate, so break) - else if (bFoundHigh) - break; - } - if (bFoundHigh) - throw ArgumentException("String 'replacement' contains invalid Unicode code points.", "replacement"); - - wcscpy_s(strDefault, ARRAY_SIZE(strDefault), replacement); - strDefaultLength = replacementLength; - } - - WCHAR* GetDefaultString() - { - return strDefault; - } - - virtual DecoderFallbackBuffer* CreateFallbackBuffer(); - - // Maximum number of characters that this instance of this fallback could return - virtual int GetMaxCharCount() - { - return strDefaultLength; - } -}; - -class DecoderFallbackBuffer -{ - friend class UTF8Encoding; - // Most implementations will probably need an implementation-specific constructor - - // internal methods that cannot be overridden that let us do our fallback thing - // These wrap the internal methods so that we can check for people doing stuff that's incorrect - -public: - virtual ~DecoderFallbackBuffer() = default; - - virtual bool Fallback(BYTE bytesUnknown[], int index, int size) = 0; - - // Get next character - virtual WCHAR GetNextChar() = 0; - - //Back up a character - virtual bool MovePrevious() = 0; - - // How many chars left in this fallback? - virtual int GetRemaining() = 0; - - // Clear the buffer - virtual void Reset() - { - while (GetNextChar() != (WCHAR)0); - } - - // Internal items to help us figure out what we're doing as far as error messages, etc. 
- // These help us with our performance and messages internally -protected: - BYTE* byteStart; - WCHAR* charEnd; - - // Internal reset - void InternalReset() - { - byteStart = nullptr; - Reset(); - } - - // Set the above values - // This can't be part of the constructor because EncoderFallbacks would have to know how to implement these. - void InternalInitialize(BYTE* byteStart, WCHAR* charEnd) - { - this->byteStart = byteStart; - this->charEnd = charEnd; - } - - // Fallback the current byte by sticking it into the remaining char buffer. - // This can only be called by our encodings (other have to use the public fallback methods), so - // we can use our DecoderNLS here too (except we don't). - // Returns true if we are successful, false if we can't fallback the character (no buffer space) - // So caller needs to throw buffer space if return false. - // Right now this has both bytes and bytes[], since we might have extra bytes, hence the - // array, and we might need the index, hence the byte* - // Don't touch ref chars unless we succeed - virtual bool InternalFallback(BYTE bytes[], BYTE* pBytes, WCHAR** chars, int size) - { - - Contract::Assert(byteStart != nullptr, "[DecoderFallback.InternalFallback]Used InternalFallback without calling InternalInitialize"); - - // See if there's a fallback character and we have an output buffer then copy our string. 
- if (this->Fallback(bytes, (int)(pBytes - byteStart - size), size)) - { - // Copy the chars to our output - WCHAR ch; - WCHAR* charTemp = *chars; - bool bHighSurrogate = false; - while ((ch = GetNextChar()) != 0) - { - // Make sure no mixed up surrogates - if (Char::IsSurrogate(ch)) - { - if (Char::IsHighSurrogate(ch)) - { - // High Surrogate - if (bHighSurrogate) - throw ArgumentException("String 'chars' contains invalid Unicode code points."); - bHighSurrogate = true; - } - else - { - // Low surrogate - if (!bHighSurrogate) - throw ArgumentException("String 'chars' contains invalid Unicode code points."); - bHighSurrogate = false; - } - } - - if (charTemp >= charEnd) - { - // No buffer space - return false; - } - - *(charTemp++) = ch; - } - - // Need to make sure that bHighSurrogate isn't true - if (bHighSurrogate) - throw ArgumentException("String 'chars' contains invalid Unicode code points."); - - // Now we aren't going to be false, so its OK to update chars - *chars = charTemp; - } - - return true; - } - - // This version just counts the fallback and doesn't actually copy anything. - virtual int InternalFallback(BYTE bytes[], BYTE* pBytes, int size) - // Right now this has both bytes[] and BYTE* bytes, since we might have extra bytes, hence the - // array, and we might need the index, hence the byte* - { - - Contract::Assert(byteStart != nullptr, "[DecoderFallback.InternalFallback]Used InternalFallback without calling InternalInitialize"); - - // See if there's a fallback character and we have an output buffer then copy our string. 
- if (this->Fallback(bytes, (int)(pBytes - byteStart - size), size)) - { - int count = 0; - - WCHAR ch; - bool bHighSurrogate = false; - while ((ch = GetNextChar()) != 0) - { - // Make sure no mixed up surrogates - if (Char::IsSurrogate(ch)) - { - if (Char::IsHighSurrogate(ch)) - { - // High Surrogate - if (bHighSurrogate) - throw ArgumentException("String 'chars' contains invalid Unicode code points."); - bHighSurrogate = true; - } - else - { - // Low surrogate - if (!bHighSurrogate) - throw ArgumentException("String 'chars' contains invalid Unicode code points."); - bHighSurrogate = false; - } - } - - count++; - } - - // Need to make sure that bHighSurrogate isn't true - if (bHighSurrogate) - throw ArgumentException("String 'chars' contains invalid Unicode code points."); - - return count; - } - - // If no fallback return 0 - return 0; - } - - // private helper methods - void ThrowLastBytesRecursive(BYTE bytesUnknown[]) - { - throw ArgumentException("Recursive fallback not allowed"); - } -}; - -class DecoderReplacementFallbackBuffer : public DecoderFallbackBuffer -{ - // Store our default string - WCHAR strDefault[2]; - int strDefaultLength; - int fallbackCount = -1; - int fallbackIndex = -1; - -public: - // Construction - DecoderReplacementFallbackBuffer(DecoderReplacementFallback* fallback) - { - wcscpy_s(strDefault, ARRAY_SIZE(strDefault), fallback->GetDefaultString()); - strDefaultLength = PAL_wcslen((const WCHAR *)fallback->GetDefaultString()); - } - - // Fallback Methods - virtual bool Fallback(BYTE bytesUnknown[], int index, int size) - { - // We expect no previous fallback in our buffer - // We can't call recursively but others might (note, we don't test on last char!!!) 
- if (fallbackCount >= 1) - { - ThrowLastBytesRecursive(bytesUnknown); - } - - // Go ahead and get our fallback - if (strDefaultLength == 0) - return false; - - fallbackCount = strDefaultLength; - fallbackIndex = -1; - - return true; - } - - virtual WCHAR GetNextChar() - { - // We want it to get < 0 because == 0 means that the current/last character is a fallback - // and we need to detect recursion. We could have a flag but we already have this counter. - fallbackCount--; - fallbackIndex++; - - // Do we have anything left? 0 is now last fallback char, negative is nothing left - if (fallbackCount < 0) - return '\0'; - - // Need to get it out of the buffer. - // Make sure it didn't wrap from the fast count-- path - if (fallbackCount == INT_MAX) - { - fallbackCount = -1; - return '\0'; - } - - // Now make sure its in the expected range - Contract::Assert(fallbackIndex < strDefaultLength && fallbackIndex >= 0, - "Index exceeds buffer range"); - - return strDefault[fallbackIndex]; - } - - virtual bool MovePrevious() - { - // Back up one, only if we just processed the last character (or earlier) - if (fallbackCount >= -1 && fallbackIndex >= 0) - { - fallbackIndex--; - fallbackCount++; - return true; - } - - // Return false 'cause we couldn't do it. - return false; - } - - // How many characters left to output? - virtual int GetRemaining() - { - // Our count is 0 for 1 character left. - return (fallbackCount < 0) ? 0 : fallbackCount; - } - - // Clear the buffer - virtual void Reset() - { - fallbackCount = -1; - fallbackIndex = -1; - byteStart = nullptr; - } - - // This version just counts the fallback and doesn't actually copy anything. 
- virtual int InternalFallback(BYTE bytes[], BYTE* pBytes, int size) - // Right now this has both bytes and bytes[], since we might have extra bytes, hence the - // array, and we might need the index, hence the byte* - { - // return our replacement string Length - return strDefaultLength; - } -}; - -class DecoderExceptionFallbackBuffer : public DecoderFallbackBuffer -{ -public: - DecoderExceptionFallbackBuffer() - { - } - - virtual bool Fallback(BYTE bytesUnknown[], int index, int size) - { - throw DecoderFallbackException( - "Unable to translate UTF-8 character to Unicode", bytesUnknown, index); - } - - virtual WCHAR GetNextChar() - { - return 0; - } - - virtual bool MovePrevious() - { - // Exception fallback doesn't have anywhere to back up to. - return false; - } - - // Exceptions are always empty - virtual int GetRemaining() - { - return 0; - } - -}; - -class DecoderExceptionFallback : public DecoderFallback -{ - // Construction -public: - DecoderExceptionFallback() - { - } - - virtual DecoderFallbackBuffer* CreateFallbackBuffer() - { - return InternalNew(); - } - - // Maximum number of characters that this instance of this fallback could return - virtual int GetMaxCharCount() - { - return 0; - } -}; - -DecoderFallbackBuffer* DecoderReplacementFallback::CreateFallbackBuffer() -{ - return InternalNew(this); -} - -class EncoderFallbackException : public ArgumentException -{ - WCHAR charUnknown; - WCHAR charUnknownHigh; - WCHAR charUnknownLow; - int index; - -public: - EncoderFallbackException( - LPCSTR message, WCHAR charUnknown, int index) : ArgumentException(message) - { - this->charUnknown = charUnknown; - this->index = index; - } - - EncoderFallbackException( - LPCSTR message, WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) : ArgumentException(message) - { - if (!Char::IsHighSurrogate(charUnknownHigh)) - { - throw ArgumentOutOfRangeException("charUnknownHigh", - "Argument out of range 0xD800..0xDBFF"); - } - if (!Char::IsLowSurrogate(charUnknownLow)) 
- { - throw ArgumentOutOfRangeException("charUnknownLow", - "Argument out of range 0xDC00..0xDFFF"); - } - Contract::EndContractBlock(); - - this->charUnknownHigh = charUnknownHigh; - this->charUnknownLow = charUnknownLow; - this->index = index; - } - - WCHAR GetCharUnknown() - { - return (charUnknown); - } - - WCHAR GetCharUnknownHigh() - { - return (charUnknownHigh); - } - - WCHAR GetCharUnknownLow() - { - return (charUnknownLow); - } - - int GetIndex() - { - return index; - } - - // Return true if the unknown character is a surrogate pair. - bool IsUnknownSurrogate() - { - return (charUnknownHigh != '\0'); - } -}; - -class EncoderFallbackBuffer; - -class EncoderFallback -{ -public: - - // Fallback - // - // Return the appropriate unicode string alternative to the character that need to fall back. - - virtual EncoderFallbackBuffer* CreateFallbackBuffer() = 0; - - // Maximum number of characters that this instance of this fallback could return - virtual int GetMaxCharCount() = 0; -}; - -class EncoderReplacementFallback : public EncoderFallback -{ - // Our variables - WCHAR strDefault[2]; - int strDefaultLength; - -public: - // Construction. Default replacement fallback uses no best fit and ? replacement string - EncoderReplacementFallback() : EncoderReplacementFallback(W("?")) - { - } - - EncoderReplacementFallback(const WCHAR* replacement) - { - // Must not be null - if (replacement == nullptr) - throw ArgumentNullException("replacement"); - Contract::EndContractBlock(); - - // Make sure it doesn't have bad surrogate pairs - bool bFoundHigh = false; - int replacementLength = PAL_wcslen((const WCHAR *)replacement); - for (int i = 0; i < replacementLength; i++) - { - // Found a surrogate? - if (Char::IsSurrogate(replacement, i)) - { - // High or Low? 
- if (Char::IsHighSurrogate(replacement, i)) - { - // if already had a high one, stop - if (bFoundHigh) - break; // break & throw at the bFoundHIgh below - bFoundHigh = true; - } - else - { - // Low, did we have a high? - if (!bFoundHigh) - { - // Didn't have one, make if fail when we stop - bFoundHigh = true; - break; - } - - // Clear flag - bFoundHigh = false; - } - } - // If last was high we're in trouble (not surrogate so not low surrogate, so break) - else if (bFoundHigh) - break; - } - if (bFoundHigh) - throw ArgumentException("String 'replacement' contains invalid Unicode code points.", "replacement"); - - wcscpy_s(strDefault, ARRAY_SIZE(strDefault), replacement); - strDefaultLength = replacementLength; - } - - WCHAR* GetDefaultString() - { - return strDefault; - } - - virtual EncoderFallbackBuffer* CreateFallbackBuffer(); - - // Maximum number of characters that this instance of this fallback could return - virtual int GetMaxCharCount() - { - return strDefaultLength; - } -}; - -class EncoderFallbackBuffer -{ - friend class UTF8Encoding; - // Most implementations will probably need an implementation-specific constructor - - // Public methods that cannot be overridden that let us do our fallback thing - // These wrap the internal methods so that we can check for people doing stuff that is incorrect - -public: - virtual ~EncoderFallbackBuffer() = default; - - virtual bool Fallback(WCHAR charUnknown, int index) = 0; - - virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) = 0; - - // Get next character - virtual WCHAR GetNextChar() = 0; - - // Back up a character - virtual bool MovePrevious() = 0; - - // How many chars left in this fallback? - virtual int GetRemaining() = 0; - - // Not sure if this should be public or not. - // Clear the buffer - virtual void Reset() - { - while (GetNextChar() != (WCHAR)0); - } - - // Internal items to help us figure out what we're doing as far as error messages, etc. 
- // These help us with our performance and messages internally -protected: - WCHAR* charStart; - WCHAR* charEnd; - bool setEncoder; - bool bUsedEncoder; - bool bFallingBack = false; - int iRecursionCount = 0; - static const int iMaxRecursion = 250; - - // Internal Reset - // For example, what if someone fails a conversion and wants to reset one of our fallback buffers? - void InternalReset() - { - charStart = nullptr; - bFallingBack = false; - iRecursionCount = 0; - Reset(); - } - - // Set the above values - // This can't be part of the constructor because EncoderFallbacks would have to know how to implement these. - void InternalInitialize(WCHAR* charStart, WCHAR* charEnd, bool setEncoder) - { - this->charStart = charStart; - this->charEnd = charEnd; - this->setEncoder = setEncoder; - this->bUsedEncoder = false; - this->bFallingBack = false; - this->iRecursionCount = 0; - } - - WCHAR InternalGetNextChar() - { - WCHAR ch = GetNextChar(); - bFallingBack = (ch != 0); - if (ch == 0) iRecursionCount = 0; - return ch; - } - - // Fallback the current character using the remaining buffer and encoder if necessary - // This can only be called by our encodings (other have to use the public fallback methods), so - // we can use our EncoderNLS here too. - // setEncoder is true if we're calling from a GetBytes method, false if we're calling from a GetByteCount - // - // Note that this could also change the contents of this->encoder, which is the same - // object that the caller is using, so the caller could mess up the encoder for us - // if they aren't careful. 
- virtual bool InternalFallback(WCHAR ch, WCHAR** chars) - { - // Shouldn't have null charStart - Contract::Assert(charStart != nullptr, - "[EncoderFallback.InternalFallbackBuffer]Fallback buffer is not initialized"); - - // Get our index, remember chars was preincremented to point at next char, so have to -1 - int index = (int)(*chars - charStart) - 1; - - // See if it was a high surrogate - if (Char::IsHighSurrogate(ch)) - { - // See if there's a low surrogate to go with it - if (*chars >= this->charEnd) - { - // Nothing left in input buffer - // No input, return 0 - } - else - { - // Might have a low surrogate - WCHAR cNext = **chars; - if (Char::IsLowSurrogate(cNext)) - { - // If already falling back then fail - if (bFallingBack && iRecursionCount++ > iMaxRecursion) - ThrowLastCharRecursive(ch, cNext); - - // Next is a surrogate, add it as surrogate pair, and increment chars - (*chars)++; - bFallingBack = Fallback(ch, cNext, index); - return bFallingBack; - } - - // Next isn't a low surrogate, just fallback the high surrogate - } - } - - // If already falling back then fail - if (bFallingBack && iRecursionCount++ > iMaxRecursion) - ThrowLastCharRecursive((int)ch); - - // Fall back our char - bFallingBack = Fallback(ch, index); - - return bFallingBack; - } - - // private helper methods - void ThrowLastCharRecursive(WCHAR highSurrogate, WCHAR lowSurrogate) - { - // Throw it, using our complete character - throw ArgumentException("Recursive fallback not allowed", "chars"); - } - - void ThrowLastCharRecursive(int utf32Char) - { - throw ArgumentException("Recursive fallback not allowed", "chars"); - } - -}; - -class EncoderReplacementFallbackBuffer : public EncoderFallbackBuffer -{ - // Store our default string - WCHAR strDefault[4]; - int strDefaultLength; - int fallbackCount = -1; - int fallbackIndex = -1; -public: - // Construction - EncoderReplacementFallbackBuffer(EncoderReplacementFallback* fallback) - { - // 2X in case we're a surrogate pair - 
wcscpy_s(strDefault, ARRAY_SIZE(strDefault), fallback->GetDefaultString()); - wcscat_s(strDefault, ARRAY_SIZE(strDefault), fallback->GetDefaultString()); - strDefaultLength = 2 * PAL_wcslen((const WCHAR *)fallback->GetDefaultString()); - - } - - // Fallback Methods - virtual bool Fallback(WCHAR charUnknown, int index) - { - // If we had a buffer already we're being recursive, throw, it's probably at the suspect - // character in our array. - if (fallbackCount >= 1) - { - // If we're recursive we may still have something in our buffer that makes this a surrogate - if (Char::IsHighSurrogate(charUnknown) && fallbackCount >= 0 && - Char::IsLowSurrogate(strDefault[fallbackIndex + 1])) - ThrowLastCharRecursive(charUnknown, strDefault[fallbackIndex + 1]); - - // Nope, just one character - ThrowLastCharRecursive((int)charUnknown); - } - - // Go ahead and get our fallback - // Divide by 2 because we aren't a surrogate pair - fallbackCount = strDefaultLength / 2; - fallbackIndex = -1; - - return fallbackCount != 0; - } - - virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) - { - // Double check input surrogate pair - if (!Char::IsHighSurrogate(charUnknownHigh)) - throw ArgumentOutOfRangeException("charUnknownHigh", - "Argument out of range 0xD800..0xDBFF"); - - if (!Char::IsLowSurrogate(charUnknownLow)) - throw ArgumentOutOfRangeException("charUnknownLow", - "Argument out of range 0xDC00..0xDFFF"); - Contract::EndContractBlock(); - - // If we had a buffer already we're being recursive, throw, it's probably at the suspect - // character in our array. - if (fallbackCount >= 1) - ThrowLastCharRecursive(charUnknownHigh, charUnknownLow); - - // Go ahead and get our fallback - fallbackCount = strDefaultLength; - fallbackIndex = -1; - - return fallbackCount != 0; - } - - virtual WCHAR GetNextChar() - { - // We want it to get < 0 because == 0 means that the current/last character is a fallback - // and we need to detect recursion. 
We could have a flag but we already have this counter. - fallbackCount--; - fallbackIndex++; - - // Do we have anything left? 0 is now last fallback char, negative is nothing left - if (fallbackCount < 0) - return '\0'; - - // Need to get it out of the buffer. - // Make sure it didn't wrap from the fast count-- path - if (fallbackCount == INT_MAX) - { - fallbackCount = -1; - return '\0'; - } - - // Now make sure its in the expected range - Contract::Assert(fallbackIndex < strDefaultLength && fallbackIndex >= 0, - "Index exceeds buffer range"); - - return strDefault[fallbackIndex]; - } - - virtual bool MovePrevious() - { - // Back up one, only if we just processed the last character (or earlier) - if (fallbackCount >= -1 && fallbackIndex >= 0) - { - fallbackIndex--; - fallbackCount++; - return true; - } - - // Return false 'cause we couldn't do it. - return false; - } - - // How many characters left to output? - virtual int GetRemaining() - { - // Our count is 0 for 1 character left. - return (fallbackCount < 0) ? 
0 : fallbackCount; - } - - // Clear the buffer - virtual void Reset() - { - fallbackCount = -1; - fallbackIndex = 0; - charStart = nullptr; - bFallingBack = false; - } -}; - -class EncoderExceptionFallbackBuffer : public EncoderFallbackBuffer -{ -public: - EncoderExceptionFallbackBuffer() - { - } - - virtual bool Fallback(WCHAR charUnknown, int index) - { - // Fall back our char - throw EncoderFallbackException("Unable to translate Unicode character to UTF-8", charUnknown, index); - } - - virtual bool Fallback(WCHAR charUnknownHigh, WCHAR charUnknownLow, int index) - { - if (!Char::IsHighSurrogate(charUnknownHigh)) - { - throw ArgumentOutOfRangeException("charUnknownHigh", - "Argument out of range 0xD800..0xDBFF"); - } - if (!Char::IsLowSurrogate(charUnknownLow)) - { - throw ArgumentOutOfRangeException("charUnknownLow", - "Argument out of range 0xDC00..0xDFFF"); - } - Contract::EndContractBlock(); - - //int iTemp = Char::ConvertToUtf32(charUnknownHigh, charUnknownLow); - - // Fall back our char - throw EncoderFallbackException( - "Unable to translate Unicode character to UTF-8", charUnknownHigh, charUnknownLow, index); - } - - virtual WCHAR GetNextChar() - { - return 0; - } - - virtual bool MovePrevious() - { - // Exception fallback doesn't have anywhere to back up to. - return false; - } - - // Exceptions are always empty - virtual int GetRemaining() - { - return 0; - } -}; - -class EncoderExceptionFallback : public EncoderFallback -{ - // Construction -public: - EncoderExceptionFallback() - { - } - - virtual EncoderFallbackBuffer* CreateFallbackBuffer() - { - return InternalNew(); - } - - // Maximum number of characters that this instance of this fallback could return - virtual int GetMaxCharCount() - { - return 0; - } -}; - -EncoderFallbackBuffer* EncoderReplacementFallback::CreateFallbackBuffer() -{ - return InternalNew(this); -} - -class UTF8Encoding -{ - EncoderFallback* encoderFallback; - // Instances of the two possible fallbacks. 
The constructor parameter - // determines which one to use. - EncoderReplacementFallback encoderReplacementFallback; - EncoderExceptionFallback encoderExceptionFallback; - - DecoderFallback* decoderFallback; - // Instances of the two possible fallbacks. The constructor parameter - // determines which one to use. - DecoderReplacementFallback decoderReplacementFallback; - DecoderExceptionFallback decoderExceptionFallback; - - bool InRange(int c, int begin, int end) - { - return begin <= c && c <= end; - } - - size_t PtrDiff(WCHAR* ptr1, WCHAR* ptr2) - { - return ptr1 - ptr2; - } - - size_t PtrDiff(BYTE* ptr1, BYTE* ptr2) - { - return ptr1 - ptr2; - } - - void ThrowBytesOverflow() - { - // Special message to include fallback type in case fallback's GetMaxCharCount is broken - // This happens if user has implemented an encoder fallback with a broken GetMaxCharCount - throw InsufficientBufferException("The output byte buffer is too small to contain the encoded data", "bytes"); - } - - void ThrowBytesOverflow(bool nothingEncoded) - { - // Special message to include fallback type in case fallback's GetMaxCharCount is broken - // This happens if user has implemented an encoder fallback with a broken GetMaxCharCount - if (nothingEncoded){ - ThrowBytesOverflow(); - } - } - - void ThrowCharsOverflow() - { - // Special message to include fallback type in case fallback's GetMaxCharCount is broken - // This happens if user has implemented a decoder fallback with a broken GetMaxCharCount - throw InsufficientBufferException("The output char buffer is too small to contain the encoded data", "chars"); - } - - void ThrowCharsOverflow(bool nothingEncoded) - { - // Special message to include fallback type in case fallback's GetMaxCharCount is broken - // This happens if user has implemented an decoder fallback with a broken GetMaxCharCount - if (nothingEncoded){ - ThrowCharsOverflow(); - } - } - - // During GetChars we had an invalid byte sequence - // pSrc is backed up to the start of 
the bad sequence if we didn't have room to - // fall it back. Otherwise pSrc remains where it is. - bool FallbackInvalidByteSequence(BYTE** pSrc, int ch, DecoderFallbackBuffer* fallback, WCHAR** pTarget) - { - // Get our byte[] - BYTE* pStart = *pSrc; - BYTE bytesUnknown[3]; - int size = GetBytesUnknown(pStart, ch, bytesUnknown); - - // Do the actual fallback - if (!fallback->InternalFallback(bytesUnknown, *pSrc, pTarget, size)) - { - // Oops, it failed, back up to pStart - *pSrc = pStart; - return false; - } - - // It worked - return true; - } - - int FallbackInvalidByteSequence(BYTE* pSrc, int ch, DecoderFallbackBuffer *fallback) - { - // Get our byte[] - BYTE bytesUnknown[3]; - int size = GetBytesUnknown(pSrc, ch, bytesUnknown); - - // Do the actual fallback - int count = fallback->InternalFallback(bytesUnknown, pSrc, size); - - // # of fallback chars expected. - // Note that we only get here for "long" sequences, and have already unreserved - // the count that we prereserved for the input bytes - return count; - } - - int GetBytesUnknown(BYTE* pSrc, int ch, BYTE* bytesUnknown) - { - int size; - - // See if it was a plain char - // (have to check >= 0 because we have all sorts of weird bit flags) - if (ch < 0x100 && ch >= 0) - { - pSrc--; - bytesUnknown[0] = (BYTE)ch; - size = 1; - } - // See if its an unfinished 2 byte sequence - else if ((ch & (SupplimentarySeq | ThreeByteSeq)) == 0) - { - pSrc--; - bytesUnknown[0] = (BYTE)((ch & 0x1F) | 0xc0); - size = 1; - } - // So now we're either 2nd byte of 3 or 4 byte sequence or - // we hit a non-trail byte or we ran out of space for 3rd byte of 4 byte sequence - // 1st check if its a 4 byte sequence - else if ((ch & SupplimentarySeq) != 0) - { - // 3rd byte of 4 byte sequence? 
- if ((ch & (FinalByte >> 6)) != 0) - { - // 3rd byte of 4 byte sequence - pSrc -= 3; - bytesUnknown[0] = (BYTE)(((ch >> 12) & 0x07) | 0xF0); - bytesUnknown[1] = (BYTE)(((ch >> 6) & 0x3F) | 0x80); - bytesUnknown[2] = (BYTE)(((ch)& 0x3F) | 0x80); - size = 3; - } - else if ((ch & (FinalByte >> 12)) != 0) - { - // 2nd byte of a 4 byte sequence - pSrc -= 2; - bytesUnknown[0] = (BYTE)(((ch >> 6) & 0x07) | 0xF0); - bytesUnknown[1] = (BYTE)(((ch)& 0x3F) | 0x80); - size = 2; - } - else - { - // 4th byte of a 4 byte sequence - pSrc--; - bytesUnknown[0] = (BYTE)(((ch)& 0x07) | 0xF0); - size = 1; - } - } - else - { - // 2nd byte of 3 byte sequence? - if ((ch & (FinalByte >> 6)) != 0) - { - // So its 2nd byte of a 3 byte sequence - pSrc -= 2; - bytesUnknown[0] = (BYTE)(((ch >> 6) & 0x0F) | 0xE0); - bytesUnknown[1] = (BYTE)(((ch)& 0x3F) | 0x80); - size = 2; - } - else - { - // 1st byte of a 3 byte sequence - pSrc--; - bytesUnknown[0] = (BYTE)(((ch)& 0x0F) | 0xE0); - size = 1; - } - } - - return size; - } - -public: - - UTF8Encoding(bool isThrowException) - : encoderReplacementFallback(W("\xFFFD")), decoderReplacementFallback(W("\xFFFD")) - { - if (isThrowException) - { - encoderFallback = &encoderExceptionFallback; - decoderFallback = &decoderExceptionFallback; - } - else - { - encoderFallback = &encoderReplacementFallback; - decoderFallback = &decoderReplacementFallback; - } - } - - // These are bitmasks used to maintain the state in the decoder. They occupy the higher bits - // while the actual character is being built in the lower bits. They are shifted together - // with the actual bits of the character. 
- - // bits 30 & 31 are used for pending bits fixup - const int FinalByte = 1 << 29; - const int SupplimentarySeq = 1 << 28; - const int ThreeByteSeq = 1 << 27; - - int GetCharCount(BYTE* bytes, int count) - { - Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetCharCount]bytes!=nullptr"); - Contract::Assert(count >= 0, "[UTF8Encoding.GetCharCount]count >=0"); - - // Initialize stuff - BYTE *pSrc = bytes; - BYTE *pEnd = pSrc + count; - - // Start by assuming we have as many as count, charCount always includes the adjustment - // for the character being decoded - int charCount = count; - int ch = 0; - DecoderFallbackBuffer *fallback = nullptr; - - while (true) - { - // SLOWLOOP: does all range checks, handles all special cases, but it is slow - if (pSrc >= pEnd) { - break; - } - - // read next byte. The JIT optimization seems to be getting confused when - // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead - int cha = *pSrc; - - if (ch == 0) { - // no pending bits - goto ReadChar; - } - - pSrc++; - - // we are expecting to see trailing bytes like 10vvvvvv - if ((cha & 0xC0) != 0x80) { - // This can be a valid starting byte for another UTF8 byte sequence, so let's put - // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence - pSrc--; - charCount += (ch >> 30); - goto InvalidByteSequence; - } - - // fold in the new byte - ch = (ch << 6) | (cha & 0x3F); - - if ((ch & FinalByte) == 0) { - Contract::Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0, - "[UTF8Encoding.GetChars]Invariant volation"); - - if ((ch & SupplimentarySeq) != 0) { - if ((ch & (FinalByte >> 6)) != 0) { - // this is 3rd byte (of 4 byte supplimentary) - nothing to do - continue; - } - - // 2nd byte, check for non-shortest form of supplimentary char and the valid - // supplimentary characters in range 0x010000 - 0x10FFFF at the same time - if (!InRange(ch & 0x1F0, 0x10, 0x100)) { - goto InvalidByteSequence; - } - } - else { - // Must be 
2nd byte of a 3-byte sequence - // check for non-shortest form of 3 byte seq - if ((ch & (0x1F << 5)) == 0 || // non-shortest form - (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate - { - goto InvalidByteSequence; - } - } - continue; - } - - // ready to punch - - // adjust for surrogates in non-shortest form - if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq) { - charCount--; - } - goto EncodeChar; - - InvalidByteSequence: - // this code fragment should be close to the gotos referencing it - // Have to do fallback for invalid bytes - if (fallback == nullptr) - { - fallback = decoderFallback->CreateFallbackBuffer(); - fallback->InternalInitialize(bytes, nullptr); - } - charCount += FallbackInvalidByteSequence(pSrc, ch, fallback); - - ch = 0; - continue; - - ReadChar: - ch = *pSrc; - pSrc++; - - ProcessChar: - if (ch > 0x7F) { - // If its > 0x7F, its start of a new multi-byte sequence - - // Long sequence, so unreserve our char. - charCount--; - - // bit 6 has to be non-zero for start of multibyte chars. - if ((ch & 0x40) == 0) { - // Unexpected trail byte - goto InvalidByteSequence; - } - - // start a new long code - if ((ch & 0x20) != 0) { - if ((ch & 0x10) != 0) { - // 4 byte encoding - supplimentary character (2 surrogates) - - ch &= 0x0F; - - // check that bit 4 is zero and the valid supplimentary character - // range 0x000000 - 0x10FFFF at the same time - if (ch > 0x04) { - ch |= 0xf0; - goto InvalidByteSequence; - } - - // Add bit flags so that when we check new characters & rotate we'll be flagged correctly. - // Final byte flag, count fix if we don't make final byte & supplimentary sequence flag. 
- ch |= (FinalByte >> 3 * 6) | // Final byte is 3 more bytes from now - (1 << 30) | // If it dies on next byte we'll need an extra char - (3 << (30 - 2 * 6)) | // If it dies on last byte we'll need to subtract a char - (SupplimentarySeq) | (SupplimentarySeq >> 6) | - (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6); - - // Our character count will be 2 characters for these 4 bytes, so subtract another char - charCount--; - } - else { - // 3 byte encoding - // Add bit flags so that when we check new characters & rotate we'll be flagged correctly. - ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) | - (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6)); - - // We'll expect 1 character for these 3 bytes, so subtract another char. - charCount--; - } - } - else { - // 2 byte encoding - - ch &= 0x1F; - - // check for non-shortest form - if (ch <= 1) { - ch |= 0xc0; - goto InvalidByteSequence; - } - - // Add bit flags so we'll be flagged correctly - ch |= (FinalByte >> 6); - } - continue; - } - - EncodeChar: - -#ifdef FASTLOOP - int availableBytes = PtrDiff(pEnd, pSrc); - - // don't fall into the fast decoding loop if we don't have enough bytes - if (availableBytes <= 13) { - // try to get over the remainder of the ascii characters fast though - BYTE* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered - while (pSrc < pLocalEnd) { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) - goto ProcessChar; - } - // we are done - ch = 0; - break; - } - - // To compute the upper bound, assume that all characters are ASCII characters at this point, - // the boundary will be decreased for every non-ASCII character we encounter - // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences - BYTE *pStop = pSrc + availableBytes - 7; - - while (pSrc < pStop) { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) { - goto LongCode; - } - - // get pSrc 2-byte aligned - if (((size_t)pSrc & 0x1) != 0) { - ch = *pSrc; - pSrc++; - 
if (ch > 0x7F) { - goto LongCode; - } - } - - // get pSrc 4-byte aligned - if (((size_t)pSrc & 0x2) != 0) { - ch = *(USHORT*)pSrc; - if ((ch & 0x8080) != 0) { - goto LongCodeWithMask16; - } - pSrc += 2; - } - - - // Run 8 + 8 characters at a time! - while (pSrc < pStop) { - ch = *(int*)pSrc; - int chb = *(int*)(pSrc + 4); - if (((ch | chb) & (int)0x80808080) != 0) { - goto LongCodeWithMask32; - } - pSrc += 8; - - // This is a really small loop - unroll it - if (pSrc >= pStop) - break; - - ch = *(int*)pSrc; - chb = *(int*)(pSrc + 4); - if (((ch | chb) & (int)0x80808080) != 0) { - goto LongCodeWithMask32; - } - pSrc += 8; - } - break; - -#if BIGENDIAN - LongCodeWithMask32 : - // be careful about the sign extension - ch = (int)(((uint)ch) >> 16); - LongCodeWithMask16: - ch = (int)(((uint)ch) >> 8); -#else // BIGENDIAN - LongCodeWithMask32: - LongCodeWithMask16: - ch &= 0xFF; -#endif // BIGENDIAN - pSrc++; - if (ch <= 0x7F) { - continue; - } - - LongCode: - int chc = *pSrc; - pSrc++; - - if ( - // bit 6 has to be zero - (ch & 0x40) == 0 || - // we are expecting to see trailing bytes like 10vvvvvv - (chc & 0xC0) != 0x80) - { - goto BadLongCode; - } - - chc &= 0x3F; - - // start a new long code - if ((ch & 0x20) != 0) { - - // fold the first two bytes together - chc |= (ch & 0x0F) << 6; - - if ((ch & 0x10) != 0) { - // 4 byte encoding - surrogate - ch = *pSrc; - if ( - // check that bit 4 is zero, the non-shortest form of surrogate - // and the valid surrogate range 0x000000 - 0x10FFFF at the same time - !InRange(chc >> 4, 0x01, 0x10) || - // we are expecting to see trailing bytes like 10vvvvvv - (ch & 0xC0) != 0x80) - { - goto BadLongCode; - } - - chc = (chc << 6) | (ch & 0x3F); - - ch = *(pSrc + 1); - // we are expecting to see trailing bytes like 10vvvvvv - if ((ch & 0xC0) != 0x80) { - goto BadLongCode; - } - pSrc += 2; - - // extra byte - charCount--; - } - else { - // 3 byte encoding - ch = *pSrc; - if ( - // check for non-shortest form of 3 byte seq - (chc & (0x1F 
<< 5)) == 0 || - // Can't have surrogates here. - (chc & (0xF800 >> 6)) == (0xD800 >> 6) || - // we are expecting to see trailing bytes like 10vvvvvv - (ch & 0xC0) != 0x80) - { - goto BadLongCode; - } - pSrc++; - - // extra byte - charCount--; - } - } - else { - // 2 byte encoding - - // check for non-shortest form - if ((ch & 0x1E) == 0) { - goto BadLongCode; - } - } - - // extra byte - charCount--; - } -#endif // FASTLOOP - - // no pending bits at this point - ch = 0; - continue; - - BadLongCode: - pSrc -= 2; - ch = 0; - continue; - } - - // May have a problem if we have to flush - if (ch != 0) - { - // We were already adjusting for these, so need to unadjust - charCount += (ch >> 30); - // Have to do fallback for invalid bytes - if (fallback == nullptr) - { - fallback = decoderFallback->CreateFallbackBuffer(); - fallback->InternalInitialize(bytes, nullptr); - } - charCount += FallbackInvalidByteSequence(pSrc, ch, fallback); - } - - // Shouldn't have anything in fallback buffer for GetCharCount - // (don't have to check m_throwOnOverflow for count) - Contract::Assert(fallback == nullptr || fallback->GetRemaining() == 0, - "[UTF8Encoding.GetCharCount]Expected empty fallback buffer at end"); - - InternalDelete(fallback); - - return charCount; - - } - - int GetChars(BYTE* bytes, int byteCount, WCHAR* chars, int charCount) - { - Contract::Assert(chars != nullptr, "[UTF8Encoding.GetChars]chars!=nullptr"); - Contract::Assert(byteCount >= 0, "[UTF8Encoding.GetChars]byteCount >=0"); - Contract::Assert(charCount >= 0, "[UTF8Encoding.GetChars]charCount >=0"); - Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetChars]bytes!=nullptr"); - - BYTE *pSrc = bytes; - WCHAR *pTarget = chars; - - BYTE *pEnd = pSrc + byteCount; - WCHAR *pAllocatedBufferEnd = pTarget + charCount; - - int ch = 0; - - DecoderFallbackBuffer *fallback = nullptr; - - while (true) - { - // SLOWLOOP: does all range checks, handles all special cases, but it is slow - - if (pSrc >= pEnd) { - break; - } - - 
// read next byte. The JIT optimization seems to be getting confused when - // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead - int cha = *pSrc; - - if (ch == 0) { - // no pending bits - goto ReadChar; - } - - pSrc++; - - // we are expecting to see trailing bytes like 10vvvvvv - if ((cha & 0xC0) != 0x80) { - // This can be a valid starting byte for another UTF8 byte sequence, so let's put - // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence - pSrc--; - goto InvalidByteSequence; - } - - // fold in the new byte - ch = (ch << 6) | (cha & 0x3F); - - if ((ch & FinalByte) == 0) { - // Not at last byte yet - Contract::Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0, - "[UTF8Encoding.GetChars]Invariant volation"); - - if ((ch & SupplimentarySeq) != 0) { - // Its a 4-byte supplimentary sequence - if ((ch & (FinalByte >> 6)) != 0) { - // this is 3rd byte of 4 byte sequence - nothing to do - continue; - } - - // 2nd byte of 4 bytes - // check for non-shortest form of surrogate and the valid surrogate - // range 0x000000 - 0x10FFFF at the same time - if (!InRange(ch & 0x1F0, 0x10, 0x100)) { - goto InvalidByteSequence; - } - } - else { - // Must be 2nd byte of a 3-byte sequence - // check for non-shortest form of 3 byte seq - if ((ch & (0x1F << 5)) == 0 || // non-shortest form - (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate - { - goto InvalidByteSequence; - } - } - continue; - } - - // ready to punch - - // surrogate in shortest form? - // Might be possible to get rid of this? Already did non-shortest check for 4-byte sequence when reading 2nd byte? 
- if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq) { - // let the range check for the second char throw the exception - if (pTarget < pAllocatedBufferEnd) { - *pTarget = (WCHAR)(((ch >> 10) & 0x7FF) + - (SHORT)((CharUnicodeInfo::HIGH_SURROGATE_START - (0x10000 >> 10)))); - pTarget++; - - ch = (ch & 0x3FF) + - (int)(CharUnicodeInfo::LOW_SURROGATE_START); - } - } - - goto EncodeChar; - - InvalidByteSequence: - // this code fragment should be close to the gotos referencing it - // Have to do fallback for invalid bytes - if (fallback == nullptr) - { - fallback = decoderFallback->CreateFallbackBuffer(); - fallback->InternalInitialize(bytes, pAllocatedBufferEnd); - } - - // That'll back us up the appropriate # of bytes if we didn't get anywhere - if (!FallbackInvalidByteSequence(&pSrc, ch, fallback, &pTarget)) - { - // Ran out of buffer space - // Need to throw an exception? - Contract::Assert(pSrc >= bytes || pTarget == chars, - "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer after fallback"); - fallback->InternalReset(); - ThrowCharsOverflow(pTarget == chars); - ch = 0; - break; - } - Contract::Assert(pSrc >= bytes, - "[UTF8Encoding.GetChars]Expected invalid byte sequence to have remained within the byte array"); - ch = 0; - continue; - - ReadChar: - ch = *pSrc; - pSrc++; - - ProcessChar: - if (ch > 0x7F) { - // If its > 0x7F, its start of a new multi-byte sequence - - // bit 6 has to be non-zero - if ((ch & 0x40) == 0) { - goto InvalidByteSequence; - } - - // start a new long code - if ((ch & 0x20) != 0) { - if ((ch & 0x10) != 0) { - // 4 byte encoding - supplimentary character (2 surrogates) - - ch &= 0x0F; - - // check that bit 4 is zero and the valid supplimentary character - // range 0x000000 - 0x10FFFF at the same time - if (ch > 0x04) { - ch |= 0xf0; - goto InvalidByteSequence; - } - - ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) | - (SupplimentarySeq) | (SupplimentarySeq >> 6) | - (SupplimentarySeq >> 2 * 6) | 
(SupplimentarySeq >> 3 * 6); - } - else { - // 3 byte encoding - ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) | - (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6)); - } - } - else { - // 2 byte encoding - - ch &= 0x1F; - - // check for non-shortest form - if (ch <= 1) { - ch |= 0xc0; - goto InvalidByteSequence; - } - - ch |= (FinalByte >> 6); - } - continue; - } - - EncodeChar: - // write the pending character - if (pTarget >= pAllocatedBufferEnd) - { - // Fix chars so we make sure to throw if we didn't output anything - ch &= 0x1fffff; - if (ch > 0x7f) - { - if (ch > 0x7ff) - { - if (ch >= CharUnicodeInfo::LOW_SURROGATE_START && - ch <= CharUnicodeInfo::LOW_SURROGATE_END) - { - pSrc--; // It was 4 bytes - pTarget--; // 1 was stored already, but we can't remember 1/2, so back up - } - else if (ch > 0xffff) - { - pSrc--; // It was 4 bytes, nothing was stored - } - pSrc--; // It was at least 3 bytes - } - pSrc--; // It was at least 2 bytes - } - pSrc--; - - // Throw that we don't have enough room (pSrc could be < chars if we had started to process - // a 4 byte sequence already) - Contract::Assert(pSrc >= bytes || pTarget == chars, - "[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]"); - ThrowCharsOverflow(pTarget == chars); - - // Don't store ch in decoder, we already backed up to its start - ch = 0; - - // Didn't throw, just use this buffer size. - break; - } - *pTarget = (WCHAR)ch; - pTarget++; - -#ifdef FASTLOOP - int availableChars = PtrDiff(pAllocatedBufferEnd, pTarget); - int availableBytes = PtrDiff(pEnd, pSrc); - - // don't fall into the fast decoding loop if we don't have enough bytes - // Test for availableChars is done because pStop would be <= pTarget. - if (availableBytes <= 13) { - // we may need as many as 1 character per byte - if (availableChars < availableBytes) { - // not enough output room. 
no pending bits at this point - ch = 0; - continue; - } - - // try to get over the remainder of the ascii characters fast though - BYTE* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered - while (pSrc < pLocalEnd) { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) - goto ProcessChar; - - *pTarget = (WCHAR)ch; - pTarget++; - } - // we are done - ch = 0; - break; - } - - // we may need as many as 1 character per byte, so reduce the byte count if necessary. - // If availableChars is too small, pStop will be before pTarget and we won't do fast loop. - if (availableChars < availableBytes) { - availableBytes = availableChars; - } - - // To compute the upper bound, assume that all characters are ASCII characters at this point, - // the boundary will be decreased for every non-ASCII character we encounter - // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences - WCHAR *pStop = pTarget + availableBytes - 7; - - while (pTarget < pStop) { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) { - goto LongCode; - } - *pTarget = (WCHAR)ch; - pTarget++; - - // get pSrc to be 2-byte aligned - if ((((size_t)pSrc) & 0x1) != 0) { - ch = *pSrc; - pSrc++; - if (ch > 0x7F) { - goto LongCode; - } - *pTarget = (WCHAR)ch; - pTarget++; - } - - // get pSrc to be 4-byte aligned - if ((((size_t)pSrc) & 0x2) != 0) { - ch = *(USHORT*)pSrc; - if ((ch & 0x8080) != 0) { - goto LongCodeWithMask16; - } - - // Unfortunately, this is endianness sensitive -#if BIGENDIAN - *pTarget = (WCHAR)((ch >> 8) & 0x7F); - pSrc += 2; - *(pTarget + 1) = (WCHAR)(ch & 0x7F); - pTarget += 2; -#else // BIGENDIAN - *pTarget = (WCHAR)(ch & 0x7F); - pSrc += 2; - *(pTarget + 1) = (WCHAR)((ch >> 8) & 0x7F); - pTarget += 2; -#endif // BIGENDIAN - } - - // Run 8 characters at a time! 
- while (pTarget < pStop) { - ch = *(int*)pSrc; - int chb = *(int*)(pSrc + 4); - if (((ch | chb) & (int)0x80808080) != 0) { - goto LongCodeWithMask32; - } - - // Unfortunately, this is endianness sensitive -#if BIGENDIAN - *pTarget = (WCHAR)((ch >> 24) & 0x7F); - *(pTarget + 1) = (WCHAR)((ch >> 16) & 0x7F); - *(pTarget + 2) = (WCHAR)((ch >> 8) & 0x7F); - *(pTarget + 3) = (WCHAR)(ch & 0x7F); - pSrc += 8; - *(pTarget + 4) = (WCHAR)((chb >> 24) & 0x7F); - *(pTarget + 5) = (WCHAR)((chb >> 16) & 0x7F); - *(pTarget + 6) = (WCHAR)((chb >> 8) & 0x7F); - *(pTarget + 7) = (WCHAR)(chb & 0x7F); - pTarget += 8; -#else // BIGENDIAN - *pTarget = (WCHAR)(ch & 0x7F); - *(pTarget + 1) = (WCHAR)((ch >> 8) & 0x7F); - *(pTarget + 2) = (WCHAR)((ch >> 16) & 0x7F); - *(pTarget + 3) = (WCHAR)((ch >> 24) & 0x7F); - pSrc += 8; - *(pTarget + 4) = (WCHAR)(chb & 0x7F); - *(pTarget + 5) = (WCHAR)((chb >> 8) & 0x7F); - *(pTarget + 6) = (WCHAR)((chb >> 16) & 0x7F); - *(pTarget + 7) = (WCHAR)((chb >> 24) & 0x7F); - pTarget += 8; -#endif // BIGENDIAN - } - break; - -#if BIGENDIAN - LongCodeWithMask32 : - // be careful about the sign extension - ch = (int)(((uint)ch) >> 16); - LongCodeWithMask16: - ch = (int)(((uint)ch) >> 8); -#else // BIGENDIAN - LongCodeWithMask32: - LongCodeWithMask16: - ch &= 0xFF; -#endif // BIGENDIAN - pSrc++; - if (ch <= 0x7F) { - *pTarget = (WCHAR)ch; - pTarget++; - continue; - } - - LongCode: - int chc = *pSrc; - pSrc++; - - if ( - // bit 6 has to be zero - (ch & 0x40) == 0 || - // we are expecting to see trailing bytes like 10vvvvvv - (chc & 0xC0) != 0x80) - { - goto BadLongCode; - } - - chc &= 0x3F; - - // start a new long code - if ((ch & 0x20) != 0) { - - // fold the first two bytes together - chc |= (ch & 0x0F) << 6; - - if ((ch & 0x10) != 0) { - // 4 byte encoding - surrogate - ch = *pSrc; - if ( - // check that bit 4 is zero, the non-shortest form of surrogate - // and the valid surrogate range 0x000000 - 0x10FFFF at the same time - !InRange(chc >> 4, 0x01, 0x10) || 
- // we are expecting to see trailing bytes like 10vvvvvv - (ch & 0xC0) != 0x80) - { - goto BadLongCode; - } - - chc = (chc << 6) | (ch & 0x3F); - - ch = *(pSrc + 1); - // we are expecting to see trailing bytes like 10vvvvvv - if ((ch & 0xC0) != 0x80) { - goto BadLongCode; - } - pSrc += 2; - - ch = (chc << 6) | (ch & 0x3F); - - *pTarget = (WCHAR)(((ch >> 10) & 0x7FF) + - (SHORT)(CharUnicodeInfo::HIGH_SURROGATE_START - (0x10000 >> 10))); - pTarget++; - - ch = (ch & 0x3FF) + - (SHORT)(CharUnicodeInfo::LOW_SURROGATE_START); - - // extra byte, we're already planning 2 chars for 2 of these bytes, - // but the big loop is testing the target against pStop, so we need - // to subtract 2 more or we risk overrunning the input. Subtract - // one here and one below. - pStop--; - } - else { - // 3 byte encoding - ch = *pSrc; - if ( - // check for non-shortest form of 3 byte seq - (chc & (0x1F << 5)) == 0 || - // Can't have surrogates here. - (chc & (0xF800 >> 6)) == (0xD800 >> 6) || - // we are expecting to see trailing bytes like 10vvvvvv - (ch & 0xC0) != 0x80) - { - goto BadLongCode; - } - pSrc++; - - ch = (chc << 6) | (ch & 0x3F); - - // extra byte, we're only expecting 1 char for each of these 3 bytes, - // but the loop is testing the target (not source) against pStop, so - // we need to subtract 2 more or we risk overrunning the input. - // Subtract 1 here and one more below - pStop--; - } - } - else { - // 2 byte encoding - - ch &= 0x1F; - - // check for non-shortest form - if (ch <= 1) { - goto BadLongCode; - } - ch = (ch << 6) | chc; - } - - *pTarget = (WCHAR)ch; - pTarget++; - - // extra byte, we're only expecting 1 char for each of these 2 bytes, - // but the loop is testing the target (not source) against pStop. - // subtract an extra count from pStop so that we don't overrun the input. 
- pStop--; - } -#endif // FASTLOOP - - Contract::Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetChars]pTarget <= pAllocatedBufferEnd"); - - // no pending bits at this point - ch = 0; - continue; - - BadLongCode: - pSrc -= 2; - ch = 0; - continue; - } - - if (ch != 0) - { - // Have to do fallback for invalid bytes - if (fallback == nullptr) - { - fallback = decoderFallback->CreateFallbackBuffer(); - fallback->InternalInitialize(bytes, pAllocatedBufferEnd); - } - - // This'll back us up the appropriate # of bytes if we didn't get anywhere - if (!FallbackInvalidByteSequence(pSrc, ch, fallback)) - { - Contract::Assert(pSrc >= bytes || pTarget == chars, - "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer while flushing"); - - // Ran out of buffer space - // Need to throw an exception? - fallback->InternalReset(); - ThrowCharsOverflow(pTarget == chars); - } - Contract::Assert(pSrc >= bytes, - "[UTF8Encoding.GetChars]Expected flushing invalid byte sequence to have remained within the byte array"); - ch = 0; - } - - // Shouldn't have anything in fallback buffer for GetChars - // (don't have to check m_throwOnOverflow for chars) - Contract::Assert(fallback == nullptr || fallback->GetRemaining() == 0, - "[UTF8Encoding.GetChars]Expected empty fallback buffer at end"); - - InternalDelete(fallback); - - return PtrDiff(pTarget, chars); - } - - int GetBytes(WCHAR* chars, int charCount, BYTE* bytes, int byteCount) - { - Contract::Assert(chars != nullptr, "[UTF8Encoding.GetBytes]chars!=nullptr"); - Contract::Assert(byteCount >= 0, "[UTF8Encoding.GetBytes]byteCount >=0"); - Contract::Assert(charCount >= 0, "[UTF8Encoding.GetBytes]charCount >=0"); - Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetBytes]bytes!=nullptr"); - - // For fallback we may need a fallback buffer. 
- // We wait to initialize it though in case we don't have any broken input unicode - EncoderFallbackBuffer* fallbackBuffer = nullptr; - WCHAR *pSrc = chars; - BYTE *pTarget = bytes; - - WCHAR *pEnd = pSrc + charCount; - BYTE *pAllocatedBufferEnd = pTarget + byteCount; - - int ch = 0; - - // assume that JIT will enregister pSrc, pTarget and ch - - while (true) { - // SLOWLOOP: does all range checks, handles all special cases, but it is slow - - if (pSrc >= pEnd) { - - if (ch == 0) { - // Check if there's anything left to get out of the fallback buffer - ch = fallbackBuffer != nullptr ? fallbackBuffer->InternalGetNextChar() : 0; - if (ch > 0) { - goto ProcessChar; - } - } - else { - // Case of leftover surrogates in the fallback buffer - if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) { - Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF, - "[UTF8Encoding.GetBytes]expected high surrogate"); //, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); - - int cha = ch; - - ch = fallbackBuffer->InternalGetNextChar(); - - if (InRange(ch, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { - ch = ch + (cha << 10) + (0x10000 - CharUnicodeInfo::LOW_SURROGATE_START - (CharUnicodeInfo::HIGH_SURROGATE_START << 10)); - goto EncodeChar; - } - else if (ch > 0){ - goto ProcessChar; - } - else { - break; - } - } - } - - // attempt to encode the partial surrogate (will fail or ignore) - if (ch > 0) - goto EncodeChar; - - // We're done - break; - } - - if (ch > 0) { - // We have a high surrogate left over from a previous loop. 
- Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF, - "[UTF8Encoding.GetBytes]expected high surrogate");//, not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); - - // use separate helper variables for local contexts so that the jit optimizations - // won't get confused about the variable lifetimes - int cha = *pSrc; - - // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here. - // if (IsLowSurrogate(cha)) { - if (InRange(cha, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { - ch = cha + (ch << 10) + - (0x10000 - - CharUnicodeInfo::LOW_SURROGATE_START - - (CharUnicodeInfo::HIGH_SURROGATE_START << 10)); - - pSrc++; - } - // else ch is still high surrogate and encoding will fail - - // attempt to encode the surrogate or partial surrogate - goto EncodeChar; - } - - // If we've used a fallback, then we have to check for it - if (fallbackBuffer != nullptr) - { - ch = fallbackBuffer->InternalGetNextChar(); - if (ch > 0) goto ProcessChar; - } - - // read next char. The JIT optimization seems to be getting confused when - // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead - ch = *pSrc; - pSrc++; - - ProcessChar: - if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::HIGH_SURROGATE_END)) { - continue; - } - // either good char or partial surrogate - - EncodeChar: - // throw exception on partial surrogate if necessary - if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) - { - // Lone surrogates aren't allowed, we have to do fallback for them - // Have to make a fallback buffer if we don't have one - if (fallbackBuffer == nullptr) - { - // wait on fallbacks if we can - // For fallback we may need a fallback buffer - fallbackBuffer = encoderFallback->CreateFallbackBuffer(); - - // Set our internal fallback interesting things. - fallbackBuffer->InternalInitialize(chars, pEnd, true); - } - - // Do our fallback. 
Actually we already know its a mixed up surrogate, - // so the ref pSrc isn't gonna do anything. - fallbackBuffer->InternalFallback((WCHAR)ch, &pSrc); - - // Ignore it if we don't throw - ch = 0; - continue; - } - - // Count bytes needed - int bytesNeeded = 1; - if (ch > 0x7F) { - if (ch > 0x7FF) { - if (ch > 0xFFFF) { - bytesNeeded++; // 4 bytes (surrogate pair) - } - bytesNeeded++; // 3 bytes (800-FFFF) - } - bytesNeeded++; // 2 bytes (80-7FF) - } - - if (pTarget > pAllocatedBufferEnd - bytesNeeded) { - // Left over surrogate from last time will cause pSrc == chars, so we'll throw - if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) - { - fallbackBuffer->MovePrevious(); // Didn't use this fallback char - if (ch > 0xFFFF) - fallbackBuffer->MovePrevious(); // Was surrogate, didn't use 2nd part either - } - else - { - pSrc--; // Didn't use this char - if (ch > 0xFFFF) - pSrc--; // Was surrogate, didn't use 2nd part either - } - Contract::Assert(pSrc >= chars || pTarget == bytes, - "[UTF8Encoding.GetBytes]Expected pSrc to be within buffer or to throw with insufficient room."); - ThrowBytesOverflow(pTarget == bytes); // Throw if we must - ch = 0; // Nothing left over (we backed up to start of pair if supplimentary) - break; - } - - if (ch <= 0x7F) { - *pTarget = (BYTE)ch; - } - else { - // use separate helper variables for local contexts so that the jit optimizations - // won't get confused about the variable lifetimes - int chb; - if (ch <= 0x7FF) { - // 2 BYTE encoding - chb = (BYTE)(0xC0 | (ch >> 6)); - } - else - { - if (ch <= 0xFFFF) { - chb = (BYTE)(0xE0 | (ch >> 12)); - } - else - { - *pTarget = (BYTE)(0xF0 | (ch >> 18)); - pTarget++; - - chb = 0x80 | ((ch >> 12) & 0x3F); - } - *pTarget = (BYTE)chb; - pTarget++; - - chb = 0x80 | ((ch >> 6) & 0x3F); - } - *pTarget = (BYTE)chb; - pTarget++; - - *pTarget = (BYTE)0x80 | (ch & 0x3F); - } - pTarget++; - - -#ifdef FASTLOOP - // If still have fallback don't do fast loop - if (fallbackBuffer != nullptr && 
(ch = fallbackBuffer->InternalGetNextChar()) != 0) - goto ProcessChar; - - int availableChars = PtrDiff(pEnd, pSrc); - int availableBytes = PtrDiff(pAllocatedBufferEnd, pTarget); - - // don't fall into the fast decoding loop if we don't have enough characters - // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop. - if (availableChars <= 13) { - // we are hoping for 1 BYTE per char - if (availableBytes < availableChars) { - // not enough output room. no pending bits at this point - ch = 0; - continue; - } - - // try to get over the remainder of the ascii characters fast though - WCHAR* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered - while (pSrc < pLocalEnd) { - ch = *pSrc; - pSrc++; - - // Not ASCII, need more than 1 BYTE per char - if (ch > 0x7F) - goto ProcessChar; - - *pTarget = (BYTE)ch; - pTarget++; - } - // we are done, let ch be 0 to clear encoder - ch = 0; - break; - } - - // we need at least 1 BYTE per character, but Convert might allow us to convert - // only part of the input, so try as much as we can. Reduce charCount if necessary - if (availableBytes < availableChars) - { - availableChars = availableBytes; - } - - // FASTLOOP: - // - optimistic range checks - // - fallbacks to the slow loop for all special cases, exception throwing, etc. - - // To compute the upper bound, assume that all characters are ASCII characters at this point, - // the boundary will be decreased for every non-ASCII character we encounter - // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates - // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop. 
- WCHAR *pStop = pSrc + availableChars - 5; - - while (pSrc < pStop) { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) { - goto LongCode; - } - *pTarget = (BYTE)ch; - pTarget++; - - // get pSrc aligned - if (((size_t)pSrc & 0x2) != 0) { - ch = *pSrc; - pSrc++; - if (ch > 0x7F) { - goto LongCode; - } - *pTarget = (BYTE)ch; - pTarget++; - } - - // Run 4 characters at a time! - while (pSrc < pStop) { - ch = *(int*)pSrc; - int chc = *(int*)(pSrc + 2); - if (((ch | chc) & (int)0xFF80FF80) != 0) { - goto LongCodeWithMask; - } - - // Unfortunately, this is endianness sensitive -#if BIGENDIAN - *pTarget = (BYTE)(ch >> 16); - *(pTarget + 1) = (BYTE)ch; - pSrc += 4; - *(pTarget + 2) = (BYTE)(chc >> 16); - *(pTarget + 3) = (BYTE)chc; - pTarget += 4; -#else // BIGENDIAN - *pTarget = (BYTE)ch; - *(pTarget + 1) = (BYTE)(ch >> 16); - pSrc += 4; - *(pTarget + 2) = (BYTE)chc; - *(pTarget + 3) = (BYTE)(chc >> 16); - pTarget += 4; -#endif // BIGENDIAN - } - continue; - - LongCodeWithMask: -#if BIGENDIAN - // be careful about the sign extension - ch = (int)(((uint)ch) >> 16); -#else // BIGENDIAN - ch = (WCHAR)ch; -#endif // BIGENDIAN - pSrc++; - - if (ch > 0x7F) { - goto LongCode; - } - *pTarget = (BYTE)ch; - pTarget++; - continue; - - LongCode: - // use separate helper variables for slow and fast loop so that the jit optimizations - // won't get confused about the variable lifetimes - int chd; - if (ch <= 0x7FF) { - // 2 BYTE encoding - chd = 0xC0 | (ch >> 6); - } - else { - if (!InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { - // 3 BYTE encoding - chd = 0xE0 | (ch >> 12); - } - else - { - // 4 BYTE encoding - high surrogate + low surrogate - if (ch > CharUnicodeInfo::HIGH_SURROGATE_END) { - // low without high -> bad, try again in slow loop - pSrc -= 1; - break; - } - - chd = *pSrc; - pSrc++; - - // if (!IsLowSurrogate(chd)) { - if (!InRange(chd, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { - // high not followed 
by low -> bad, try again in slow loop - pSrc -= 2; - break; - } - - ch = chd + (ch << 10) + - (0x10000 - - CharUnicodeInfo::LOW_SURROGATE_START - - (CharUnicodeInfo::HIGH_SURROGATE_START << 10)); - - *pTarget = (BYTE)(0xF0 | (ch >> 18)); - // pStop - this BYTE is compensated by the second surrogate character - // 2 input chars require 4 output bytes. 2 have been anticipated already - // and 2 more will be accounted for by the 2 pStop-- calls below. - pTarget++; - - chd = 0x80 | ((ch >> 12) & 0x3F); - } - *pTarget = (BYTE)chd; - pStop--; // 3 BYTE sequence for 1 char, so need pStop-- and the one below too. - pTarget++; - - chd = 0x80 | ((ch >> 6) & 0x3F); - } - *pTarget = (BYTE)chd; - pStop--; // 2 BYTE sequence for 1 char so need pStop--. - pTarget++; - - *pTarget = (BYTE)(0x80 | (ch & 0x3F)); - // pStop - this BYTE is already included - pTarget++; - } - - Contract::Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd"); - -#endif // FASTLOOP - - // no pending char at this point - ch = 0; - } - - InternalDelete(fallbackBuffer); - - return (int)(pTarget - bytes); - } - - int GetByteCount(WCHAR *chars, int count) - { - // For fallback we may need a fallback buffer. - // We wait to initialize it though in case we don't have any broken input unicode - EncoderFallbackBuffer* fallbackBuffer = nullptr; - WCHAR *pSrc = chars; - WCHAR *pEnd = pSrc + count; - - // Start by assuming we have as many as count - int byteCount = count; - - int ch = 0; - - while (true) { - // SLOWLOOP: does all range checks, handles all special cases, but it is slow - if (pSrc >= pEnd) { - - if (ch == 0) { - // Unroll any fallback that happens at the end - ch = fallbackBuffer != nullptr ? fallbackBuffer->InternalGetNextChar() : 0; - if (ch > 0) { - byteCount++; - goto ProcessChar; - } - } - else { - // Case of surrogates in the fallback. 
- if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) { - Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF, - "[UTF8Encoding.GetBytes]expected high surrogate");// , not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); - - ch = fallbackBuffer->InternalGetNextChar(); - byteCount++; - - if (InRange(ch, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { - ch = 0xfffd; - byteCount++; - goto EncodeChar; - } - else if (ch > 0){ - goto ProcessChar; - } - else { - byteCount--; // ignore last one. - break; - } - } - } - - if (ch <= 0) { - break; - } - - // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1. - byteCount++; - goto EncodeChar; - } - - if (ch > 0) { - Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF, - "[UTF8Encoding.GetBytes]expected high surrogate"); // , not 0x" + ((int)ch).ToString("X4", CultureInfo.InvariantCulture)); - - // use separate helper variables for local contexts so that the jit optimizations - // won't get confused about the variable lifetimes - int cha = *pSrc; - - // count the pending surrogate - byteCount++; - - // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here. - // if (IsLowSurrogate(cha)) { - if (InRange(cha, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { - // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do. 
- ch = 0xfffd; - // ch = cha + (ch << 10) + - // (0x10000 - // - CharUnicodeInfo::LOW_SURROGATE_START - // - (CharUnicodeInfo::HIGH_SURROGATE_START << 10) ); - - // Use this next char - pSrc++; - } - // else ch is still high surrogate and encoding will fail (so don't add count) - - // attempt to encode the surrogate or partial surrogate - goto EncodeChar; - } - - // If we've used a fallback, then we have to check for it - if (fallbackBuffer != nullptr) - { - ch = fallbackBuffer->InternalGetNextChar(); - if (ch > 0) - { - // We have an extra byte we weren't expecting. - byteCount++; - goto ProcessChar; - } - } - - // read next char. The JIT optimization seems to be getting confused when - // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead - ch = *pSrc; - pSrc++; - - ProcessChar: - if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::HIGH_SURROGATE_END)) { - // we will count this surrogate next time around - byteCount--; - continue; - } - // either good char or partial surrogate - - EncodeChar: - // throw exception on partial surrogate if necessary - if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) - { - // Lone surrogates aren't allowed - // Have to make a fallback buffer if we don't have one - if (fallbackBuffer == nullptr) - { - // wait on fallbacks if we can - // For fallback we may need a fallback buffer - fallbackBuffer = encoderFallback->CreateFallbackBuffer(); - - // Set our internal fallback interesting things. - fallbackBuffer->InternalInitialize(chars, chars + count, false); - } - - // Do our fallback. Actually we already know its a mixed up surrogate, - // so the ref pSrc isn't gonna do anything. 
- fallbackBuffer->InternalFallback((WCHAR)ch, &pSrc); - - // Ignore it if we don't throw (we had preallocated this ch) - byteCount--; - ch = 0; - continue; - } - - // Count them - if (ch > 0x7F) { - if (ch > 0x7FF) { - // the extra surrogate byte was compensated by the second surrogate character - // (2 surrogates make 4 bytes. We've already counted 2 bytes, 1 per char) - byteCount++; - } - byteCount++; - } - -#if WIN64 - // check for overflow - if (byteCount < 0) { - break; - } -#endif - -#ifdef FASTLOOP - // If still have fallback don't do fast loop - if (fallbackBuffer != nullptr && (ch = fallbackBuffer->InternalGetNextChar()) != 0) - { - // We're reserving 1 byte for each char by default - byteCount++; - goto ProcessChar; - } - - int availableChars = PtrDiff(pEnd, pSrc); - - // don't fall into the fast decoding loop if we don't have enough characters - if (availableChars <= 13) { - // try to get over the remainder of the ascii characters fast though - WCHAR* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered - while (pSrc < pLocalEnd) { - ch = *pSrc; - pSrc++; - if (ch > 0x7F) - goto ProcessChar; - } - - // we are done - break; - } - -#if WIN64 - // make sure that we won't get a silent overflow inside the fast loop - // (Fall out to slow loop if we have this many characters) - availableChars &= 0x0FFFFFFF; -#endif - - // To compute the upper bound, assume that all characters are ASCII characters at this point, - // the boundary will be decreased for every non-ASCII character we encounter - // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates - WCHAR *pStop = pSrc + availableChars - (3 + 4); - - while (pSrc < pStop) { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) // Not ASCII - { - if (ch > 0x7FF) // Not 2 Byte - { - if ((ch & 0xF800) == 0xD800) // See if its a Surrogate - goto LongCode; - byteCount++; - } - byteCount++; - } - - // get pSrc aligned - if (((size_t)pSrc & 0x2) != 0) { - ch = *pSrc; - pSrc++; - 
if (ch > 0x7F) // Not ASCII - { - if (ch > 0x7FF) // Not 2 Byte - { - if ((ch & 0xF800) == 0xD800) // See if its a Surrogate - goto LongCode; - byteCount++; - } - byteCount++; - } - } - - // Run 2 * 4 characters at a time! - while (pSrc < pStop) { - ch = *(int*)pSrc; - int chc = *(int*)(pSrc + 2); - if (((ch | chc) & (int)0xFF80FF80) != 0) // See if not ASCII - { - if (((ch | chc) & (int)0xF800F800) != 0) // See if not 2 Byte - { - goto LongCodeWithMask; - } - - - if ((ch & (int)0xFF800000) != 0) // Actually 0x07800780 is all we care about (4 bits) - byteCount++; - if ((ch & (int)0xFF80) != 0) - byteCount++; - if ((chc & (int)0xFF800000) != 0) - byteCount++; - if ((chc & (int)0xFF80) != 0) - byteCount++; - } - pSrc += 4; - - ch = *(int*)pSrc; - chc = *(int*)(pSrc + 2); - if (((ch | chc) & (int)0xFF80FF80) != 0) // See if not ASCII - { - if (((ch | chc) & (int)0xF800F800) != 0) // See if not 2 Byte - { - goto LongCodeWithMask; - } - - if ((ch & (int)0xFF800000) != 0) - byteCount++; - if ((ch & (int)0xFF80) != 0) - byteCount++; - if ((chc & (int)0xFF800000) != 0) - byteCount++; - if ((chc & (int)0xFF80) != 0) - byteCount++; - } - pSrc += 4; - } - break; - - LongCodeWithMask: -#if BIGENDIAN - // be careful about the sign extension - ch = (int)(((uint)ch) >> 16); -#else // BIGENDIAN - ch = (WCHAR)ch; -#endif // BIGENDIAN - pSrc++; - - if (ch <= 0x7F) { - continue; - } - - LongCode: - // use separate helper variables for slow and fast loop so that the jit optimizations - // won't get confused about the variable lifetimes - if (ch > 0x7FF) { - if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) { - // 4 byte encoding - high surrogate + low surrogate - - int chd = *pSrc; - if ( - ch > CharUnicodeInfo::HIGH_SURROGATE_END || - !InRange(chd, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) - { - // Back up and drop out to slow loop to figure out error - pSrc--; - break; - } - pSrc++; - - // byteCount - this 
byte is compensated by the second surrogate character - } - byteCount++; - } - byteCount++; - - // byteCount - the last byte is already included - } -#endif // FASTLOOP - - // no pending char at this point - ch = 0; - } - -#if WIN64 - // check for overflow - if (byteCount < 0) { - throw ArgumentException("Conversion buffer overflow."); - } -#endif - - Contract::Assert(fallbackBuffer == nullptr || fallbackBuffer->GetRemaining() == 0, - "[UTF8Encoding.GetByteCount]Expected Empty fallback buffer"); - - InternalDelete(fallbackBuffer); - - return byteCount; - } - -}; - - -//////////////////////////////////////////////////////////////////////////// -// -// UTF8ToUnicode -// -// Maps a UTF-8 character string to its wide character string counterpart. -// -//////////////////////////////////////////////////////////////////////////// - -int UTF8ToUnicode( - LPCSTR lpSrcStr, - int cchSrc, - LPWSTR lpDestStr, - int cchDest, - DWORD dwFlags - ) -{ - int ret; - UTF8Encoding enc(dwFlags & MB_ERR_INVALID_CHARS); - try { - ret = enc.GetCharCount((BYTE*)lpSrcStr, cchSrc); - if (cchDest){ - if (ret > cchDest){ - SetLastError(ERROR_INSUFFICIENT_BUFFER); - ret = 0; - } - enc.GetChars((BYTE*)lpSrcStr, cchSrc, (WCHAR*)lpDestStr, ret); - } - } - catch (const InsufficientBufferException& e){ - SetLastError(ERROR_INSUFFICIENT_BUFFER); - return 0; - } - catch (const DecoderFallbackException& e){ - SetLastError(ERROR_NO_UNICODE_TRANSLATION); - return 0; - } - catch (const ArgumentException& e){ - SetLastError(ERROR_INVALID_PARAMETER); - return 0; - } - return ret; -} - -//////////////////////////////////////////////////////////////////////////// -// -// UnicodeToUTF8 -// -// Maps a Unicode character string to its UTF-8 string counterpart. 
-// -//////////////////////////////////////////////////////////////////////////// - -int UnicodeToUTF8( - LPCWSTR lpSrcStr, - int cchSrc, - LPSTR lpDestStr, - int cchDest) -{ - int ret; - UTF8Encoding enc(false); - try{ - ret = enc.GetByteCount((WCHAR*)lpSrcStr, cchSrc); - if (cchDest){ - if (ret > cchDest){ - SetLastError(ERROR_INSUFFICIENT_BUFFER); - ret = 0; - } - enc.GetBytes((WCHAR*)lpSrcStr, cchSrc, (BYTE*)lpDestStr, ret); - } - } - catch (const InsufficientBufferException& e){ - SetLastError(ERROR_INSUFFICIENT_BUFFER); - return 0; - } - catch (const EncoderFallbackException& e){ - SetLastError(ERROR_NO_UNICODE_TRANSLATION); - return 0; - } - catch (const ArgumentException& e){ - SetLastError(ERROR_INVALID_PARAMETER); - return 0; - } - return ret; -} diff --git a/src/coreclr/vm/rtlfunctions.cpp b/src/coreclr/vm/rtlfunctions.cpp index 23f662b4d600a..f3f80338f3f8e 100644 --- a/src/coreclr/vm/rtlfunctions.cpp +++ b/src/coreclr/vm/rtlfunctions.cpp @@ -103,7 +103,7 @@ VOID InstallEEFunctionTable ( } else { - NewArrayHolder wzTempName(DuplicateStringThrowing(ssTempName.GetUnicode())); + NewArrayHolder wzTempName(ssTempName.GetCopyOfUnicodeString()); // publish result if (InterlockedCompareExchangeT(&wszModuleName, (LPWSTR)wzTempName, nullptr) == nullptr) diff --git a/src/mono/mono/eglib/CMakeLists.txt b/src/mono/mono/eglib/CMakeLists.txt index 3de4a9c83d2f5..09cf32eaa81ad 100644 --- a/src/mono/mono/eglib/CMakeLists.txt +++ b/src/mono/mono/eglib/CMakeLists.txt @@ -33,7 +33,12 @@ set(eglib_common_sources gspawn.c gfile.c gfile-posix.c - gutf8.c) + gutf8.c + ${CLR_SRC_NATIVE_DIR}/minipal/utf8.c) + +if(IS_BIG_ENDIAN) + set_source_files_properties("${CLR_SRC_NATIVE_DIR}/minipal/utf8.c" PROPERTIES COMPILE_FLAGS "-DBIGENDIAN=1") +endif() set(eglib_headers glib.h @@ -41,7 +46,7 @@ set(eglib_headers gmodule.h) if(HAVE_CLOCK_NANOSLEEP) -list(APPEND eglib_common_sources gclock-nanosleep.c) + list(APPEND eglib_common_sources gclock-nanosleep.c) endif() set(eglib_sources 
"${eglib_platform_sources};${eglib_common_sources}") diff --git a/src/mono/mono/eglib/giconv.c b/src/mono/mono/eglib/giconv.c index 664ad31bba258..8ae955c303fe2 100644 --- a/src/mono/mono/eglib/giconv.c +++ b/src/mono/mono/eglib/giconv.c @@ -28,132 +28,20 @@ #include #include "../utils/mono-errno.h" +#include + #ifdef _MSC_VER #define FORCE_INLINE(RET_TYPE) __forceinline RET_TYPE #else #define FORCE_INLINE(RET_TYPE) inline RET_TYPE __attribute__((always_inline)) #endif - -#define UNROLL_DECODE_UTF8 0 -#define UNROLL_ENCODE_UTF8 0 - -static int decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar); -static int encode_utf32be (gunichar c, char *outbuf, size_t outleft); - -static int decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar); -static int encode_utf32le (gunichar c, char *outbuf, size_t outleft); - -static int decode_utf16be (char *inbuf, size_t inleft, gunichar *outchar); -static int encode_utf16be (gunichar c, char *outbuf, size_t outleft); - -static int decode_utf16le (char *inbuf, size_t inleft, gunichar *outchar); -static int encode_utf16le (gunichar c, char *outbuf, size_t outleft); - -static FORCE_INLINE (int) decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar); -static int encode_utf8 (gunichar c, char *outbuf, size_t outleft); - -static int decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar); -static int encode_latin1 (gunichar c, char *outbuf, size_t outleft); - #if G_BYTE_ORDER == G_LITTLE_ENDIAN -#define decode_utf32 decode_utf32le -#define encode_utf32 encode_utf32le #define decode_utf16 decode_utf16le -#define encode_utf16 encode_utf16le #else -#define decode_utf32 decode_utf32be -#define encode_utf32 encode_utf32be #define decode_utf16 decode_utf16be -#define encode_utf16 encode_utf16be #endif -/* - * Unicode encoders and decoders - */ - -static FORCE_INLINE (uint32_t) -read_uint32_endian (unsigned char *inptr, unsigned endian) -{ - if (endian == G_LITTLE_ENDIAN) - return (inptr[3] << 24) | (inptr[2] << 16) 
| (inptr[1] << 8) | inptr[0]; - return (inptr[0] << 24) | (inptr[1] << 16) | (inptr[2] << 8) | inptr[3]; -} - -static int -decode_utf32_endian (char *inbuf, size_t inleft, gunichar *outchar, unsigned endian) -{ - unsigned char *inptr = (unsigned char *) inbuf; - gunichar c; - - if (inleft < 4) { - mono_set_errno (EINVAL); - return -1; - } - - c = read_uint32_endian (inptr, endian); - - if (c >= 0xd800 && c < 0xe000) { - mono_set_errno (EILSEQ); - return -1; - } else if (c >= 0x110000) { - mono_set_errno (EILSEQ); - return -1; - } - - *outchar = c; - - return 4; -} - -static int -decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar) -{ - return decode_utf32_endian (inbuf, inleft, outchar, G_BIG_ENDIAN); -} - -static int -decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar) -{ - return decode_utf32_endian (inbuf, inleft, outchar, G_LITTLE_ENDIAN); -} - -static int -encode_utf32be (gunichar c, char *outbuf, size_t outleft) -{ - unsigned char *outptr = (unsigned char *) outbuf; - - if (outleft < 4) { - mono_set_errno (E2BIG); - return -1; - } - - outptr[0] = (c >> 24) & 0xff; - outptr[1] = (c >> 16) & 0xff; - outptr[2] = (c >> 8) & 0xff; - outptr[3] = c & 0xff; - - return 4; -} - -static int -encode_utf32le (gunichar c, char *outbuf, size_t outleft) -{ - unsigned char *outptr = (unsigned char *) outbuf; - - if (outleft < 4) { - mono_set_errno (E2BIG); - return -1; - } - - outptr[0] = c & 0xff; - outptr[1] = (c >> 8) & 0xff; - outptr[2] = (c >> 16) & 0xff; - outptr[3] = (c >> 24) & 0xff; - - return 4; -} - static FORCE_INLINE (uint16_t) read_uint16_endian (unsigned char *inptr, unsigned endian) { @@ -233,50 +121,6 @@ write_uint16_endian (unsigned char *outptr, uint16_t c, unsigned endian) outptr[1] = c & 0xff; } -static FORCE_INLINE (int) -encode_utf16_endian (gunichar c, char *outbuf, size_t outleft, unsigned endian) -{ - unsigned char *outptr = (unsigned char *) outbuf; - gunichar2 ch; - gunichar c2; - - if (c < 0x10000) { - if (outleft < 2) { - 
mono_set_errno (E2BIG); - return -1; - } - - write_uint16_endian (outptr, GUNICHAR_TO_UINT16 (c), endian); - return 2; - } else { - if (outleft < 4) { - mono_set_errno (E2BIG); - return -1; - } - - c2 = c - 0x10000; - - ch = (gunichar2) ((c2 >> 10) + 0xd800); - write_uint16_endian (outptr, ch, endian); - - ch = (gunichar2) ((c2 & 0x3ff) + 0xdc00); - write_uint16_endian (outptr + 2, ch, endian); - return 4; - } -} - -static int -encode_utf16be (gunichar c, char *outbuf, size_t outleft) -{ - return encode_utf16_endian (c, outbuf, outleft, G_BIG_ENDIAN); -} - -static int -encode_utf16le (gunichar c, char *outbuf, size_t outleft) -{ - return encode_utf16_endian (c, outbuf, outleft, G_LITTLE_ENDIAN); -} - static FORCE_INLINE (int) decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar) { @@ -336,89 +180,6 @@ decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar) return GSIZE_TO_INT(n); } -static int -encode_utf8 (gunichar c, char *outbuf, size_t outleft) -{ - unsigned char *outptr = (unsigned char *) outbuf; - int base; - size_t n; - - if (c < 0x80) { - outptr[0] = GUNICHAR_TO_UINT8 (c); - return 1; - } else if (c < 0x800) { - base = 192; - n = 2; - } else if (c < 0x10000) { - base = 224; - n = 3; - } else if (c < 0x200000) { - base = 240; - n = 4; - } else if (c < 0x4000000) { - base = 248; - n = 5; - } else { - base = 252; - n = 6; - } - - if (outleft < n) { - mono_set_errno (E2BIG); - return -1; - } - -#if UNROLL_ENCODE_UTF8 - switch (n) { - case 6: outptr[5] = (c & 0x3f) | 0x80; c >>= 6; - case 5: outptr[4] = (c & 0x3f) | 0x80; c >>= 6; - case 4: outptr[3] = (c & 0x3f) | 0x80; c >>= 6; - case 3: outptr[2] = (c & 0x3f) | 0x80; c >>= 6; - case 2: outptr[1] = (c & 0x3f) | 0x80; c >>= 6; - case 1: outptr[0] = c | base; - } -#else - for (size_t i = n - 1; i > 0; i--) { - outptr[i] = (c & 0x3f) | 0x80; - c >>= 6; - } - - outptr[0] = GUNICHAR_TO_UINT8 (c | base); -#endif - - return GSIZE_TO_INT(n); -} - -static int -decode_latin1 (char *inbuf, size_t inleft, 
gunichar *outchar) -{ - *outchar = (unsigned char) *inbuf; - return 1; -} - -static int -encode_latin1 (gunichar c, char *outbuf, size_t outleft) -{ - if (outleft < 1) { - mono_set_errno (E2BIG); - return -1; - } - - if (c > 0xff) { - mono_set_errno (EILSEQ); - return -1; - } - - *outbuf = (char) c; - - return 1; -} - - -/* - * Simple conversion API - */ - static gpointer error_quark = (gpointer)"ConvertError"; gpointer @@ -426,9 +187,6 @@ g_convert_error_quark (void) { return error_quark; } -/* - * Unicode conversion - */ /** * An explanation of the conversion can be found at: @@ -559,162 +317,114 @@ g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_written) return outbuf; } -static gunichar2 * -eg_utf8_to_utf16_general (const gchar *str, glong len, glong *items_read, glong *items_written, gboolean include_nuls, gboolean replace_invalid_codepoints, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err, unsigned endian) +static FORCE_INLINE (void) +map_error(GError **err) { - gunichar2 *outbuf, *outptr; - size_t outlen = 0; - size_t inleft; - char *inptr; - gunichar c; - int u, n; - - g_return_val_if_fail (str != NULL, NULL); - - if (len < 0) { - if (include_nuls) { - g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, "Conversions with embedded nulls must pass the string length"); - return NULL; - } - - len = (glong)strlen (str); + if (errno == MINIPAL_ERROR_INSUFFICIENT_BUFFER) { + g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_MEMORY, "Allocation failed."); + } else if (errno == MINIPAL_ERROR_NO_UNICODE_TRANSLATION) { + g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "Illegal byte sequence encountered in the input."); } +} - inptr = (char *) str; - inleft = len; - - while (inleft > 0) { - if ((n = decode_utf8 (inptr, inleft, &c)) < 0) - goto error; - - if (c == 0 && !include_nuls) - break; +static gunichar2 * +g_utf8_to_utf16_impl (const gchar *str, glong len, glong *items_read, glong 
*items_written, GError **err, int flags, bool treatAsLE) +{ + errno = 0; + gunichar2* lpDestStr = NULL; +#if G_BYTE_ORDER == G_BIG_ENDIAN + if (treatAsLE) + flags |= MINIPAL_TREAT_AS_LITTLE_ENDIAN; +#endif - if ((u = g_unichar_to_utf16_endian (c, NULL, endian)) < 0) { - if (replace_invalid_codepoints) { - u = 2; - } else { - mono_set_errno (EILSEQ); - goto error; - } - } + if (len < 0) + len = (glong)strlen(str) + 1; - outlen += u; - inleft -= n; - inptr += n; - } + glong ret = (glong)minipal_get_length_utf8_to_utf16 (str, len, flags); - if (items_read) - *items_read = GPTRDIFF_TO_LONG (inptr - str); + map_error(err); if (items_written) - *items_written = (glong)outlen; + *items_written = errno == 0 ? ret : 0; - if (G_LIKELY (!custom_alloc_func)) - outptr = outbuf = g_malloc ((outlen + 1) * sizeof (gunichar2)); - else - outptr = outbuf = (gunichar2 *)custom_alloc_func ((outlen + 1) * sizeof (gunichar2), custom_alloc_data); + if (ret <= 0) + return NULL; - if (G_UNLIKELY (custom_alloc_func && !outbuf)) { - mono_set_errno (ENOMEM); - goto error; - } + lpDestStr = malloc((ret + 1) * sizeof(gunichar2)); + ret = (glong)minipal_convert_utf8_to_utf16 (str, len, lpDestStr, ret, flags); + lpDestStr[ret] = '\0'; - inptr = (char *) str; - inleft = len; + if (items_written) + *items_written = errno == 0 ? 
ret : 0; - while (inleft > 0) { - if ((n = decode_utf8 (inptr, inleft, &c)) < 0) - break; + map_error(err); + return lpDestStr; +} - if (c == 0 && !include_nuls) - break; +static gunichar2 * +g_utf8_to_utf16le_custom_alloc_impl (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err, bool treatAsLE) +{ + guint flags = 0; + errno = 0; +#if G_BYTE_ORDER == G_BIG_ENDIAN + if (treatAsLE) + flags = MINIPAL_TREAT_AS_LITTLE_ENDIAN; +#endif + if (len < 0) + len = (glong)strlen(str) + 1; - u = g_unichar_to_utf16_endian (c, outptr, endian); - if ((u < 0) && replace_invalid_codepoints) { - outptr[0] = 0xFFFD; - outptr[1] = 0xFFFD; - u = 2; - } + glong ret = (glong)minipal_get_length_utf8_to_utf16 (str, len, flags); - outptr += u; - inleft -= n; - inptr += n; - } + map_error(err); - *outptr = '\0'; + if (items_written) + *items_written = errno == 0 ? ret : 0; - return outbuf; + if (ret <= 0) + return NULL; -error: - if (errno == ENOMEM) { - g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_MEMORY, - "Allocation failed."); - } else if (errno == EILSEQ) { - g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, - "Illegal byte sequence encountered in the input."); - } else if (items_read) { - /* partial input is ok if we can let our caller know... 
*/ - } else { - g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, - "Partial byte sequence encountered in the input."); + gunichar2 *lpDestStr = custom_alloc_func((ret + 1) * sizeof(gunichar2), custom_alloc_data); + if (G_UNLIKELY (!lpDestStr)) { + g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_MEMORY, "Allocation failed."); + return NULL; } - if (items_read) - *items_read = GPTRDIFF_TO_LONG (inptr - str); - - if (items_written) - *items_written = 0; + flags |= MINIPAL_MB_NO_REPLACE_INVALID_CHARS; + ret = (glong)minipal_convert_utf8_to_utf16 (str, len, lpDestStr, ret, flags); + lpDestStr[ret] = '\0'; - return NULL; + map_error(err); + return lpDestStr; } gunichar2 * g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err) { - return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, NULL, NULL, err, G_BYTE_ORDER); -} - -gunichar2 * -g_utf8_to_utf16be (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err) -{ - return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, NULL, NULL, err, G_BIG_ENDIAN); + return g_utf8_to_utf16_impl (str, len, items_read, items_written, err, MINIPAL_MB_NO_REPLACE_INVALID_CHARS, false); } gunichar2 * g_utf8_to_utf16le (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err) { - return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, NULL, NULL, err, G_LITTLE_ENDIAN); + return g_utf8_to_utf16_impl (str, len, items_read, items_written, err, MINIPAL_MB_NO_REPLACE_INVALID_CHARS, true); } gunichar2 * -g_utf8_to_utf16_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err) +eg_wtf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err) { - return eg_utf8_to_utf16_general (str, len, items_read, 
items_written, FALSE, FALSE, custom_alloc_func, custom_alloc_data, err, G_BYTE_ORDER); + return g_utf8_to_utf16_impl (str, len, items_read, items_written, err, 0, false); } gunichar2 * -g_utf8_to_utf16be_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err) +g_utf8_to_utf16_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err) { - return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, custom_alloc_func, custom_alloc_data, err, G_BIG_ENDIAN); + return g_utf8_to_utf16le_custom_alloc_impl (str, len, items_read, items_written, custom_alloc_func, custom_alloc_data, err, false); } gunichar2 * g_utf8_to_utf16le_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err) { - return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, FALSE, custom_alloc_func, custom_alloc_data, err, G_LITTLE_ENDIAN); -} - -gunichar2 * -eg_utf8_to_utf16_with_nuls (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err) -{ - return eg_utf8_to_utf16_general (str, len, items_read, items_written, TRUE, FALSE, NULL, NULL, err, G_BYTE_ORDER); -} - -gunichar2 * -eg_wtf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err) -{ - return eg_utf8_to_utf16_general (str, len, items_read, items_written, TRUE, TRUE, NULL, NULL, err, G_BYTE_ORDER); + return g_utf8_to_utf16le_custom_alloc_impl (str, len, items_read, items_written, custom_alloc_func, custom_alloc_data, err, true); } gunichar * @@ -789,120 +499,89 @@ g_utf8_to_ucs4 (const gchar *str, glong len, glong *items_read, glong *items_wri return outbuf; } -static -gchar * -eg_utf16_to_utf8_general (const gunichar2 *str, glong 
len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err, unsigned endian) +static gchar * +g_utf16_to_utf8_impl (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err, bool treatAsLE) { - char *inptr, *outbuf, *outptr; - size_t outlen = 0; - size_t inleft; - gunichar c; - int n; - - g_return_val_if_fail (str != NULL, NULL); - + guint flags = 0; + errno = 0; + gchar* lpDestStr = NULL; +#if G_BYTE_ORDER == G_BIG_ENDIAN + if (treatAsLE) + flags |= MINIPAL_TREAT_AS_LITTLE_ENDIAN; +#endif if (len < 0) { len = 0; while (str[len]) len++; - } - - inptr = (char *) str; - inleft = len * 2; - - while (inleft > 0) { - if ((n = decode_utf16_endian (inptr, inleft, &c, endian)) < 0) { - if (n == -2 && inleft > 2) { - /* This means that the first UTF-16 char was read, but second failed */ - inleft -= 2; - inptr += 2; - } - if (errno == EILSEQ) { - g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, - "Illegal byte sequence encountered in the input."); - } else if (items_read) { - /* partial input is ok if we can let our caller know... */ - break; - } else { - g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, - "Partial byte sequence encountered in the input."); - } - - if (items_read) - *items_read = GPTRDIFF_TO_LONG ((inptr - (char *) str) / 2); - - if (items_written) - *items_written = 0; - - return NULL; - } else if (c == 0) - break; - - outlen += g_unichar_to_utf8 (c, NULL); - inleft -= n; - inptr += n; + len++; } - if (items_read) - *items_read = GPTRDIFF_TO_LONG ((inptr - (char *) str) / 2); + glong ret = (glong)minipal_get_length_utf16_to_utf8 (str, len, flags); + map_error(err); if (items_written) - *items_written = (glong)outlen; - - if (G_LIKELY (!custom_alloc_func)) - outptr = outbuf = g_malloc (outlen + 1); - else - outptr = outbuf = (char *)custom_alloc_func (outlen + 1, custom_alloc_data); + *items_written = errno == 0 ? 
ret : 0; - if (G_UNLIKELY (custom_alloc_func && !outbuf)) { - g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_MEMORY, "Allocation failed."); - if (items_written) - *items_written = 0; + if (ret <= 0) return NULL; - } - - inptr = (char *) str; - inleft = len * 2; - - while (inleft > 0) { - if ((n = decode_utf16_endian (inptr, inleft, &c, endian)) < 0) - break; - else if (c == 0) - break; - outptr += g_unichar_to_utf8 (c, outptr); - inleft -= n; - inptr += n; - } + lpDestStr = (gchar *)g_malloc((ret + 1) * sizeof(gchar)); + ret = (glong)minipal_convert_utf16_to_utf8 (str, len, lpDestStr, ret, flags); + lpDestStr[ret] = '\0'; - *outptr = '\0'; + if (items_written) + *items_written = errno == 0 ? ret : 0; - return outbuf; + map_error(err); + return lpDestStr; } gchar * g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err) { - return eg_utf16_to_utf8_general (str, len, items_read, items_written, NULL, NULL, err, G_BYTE_ORDER); + return g_utf16_to_utf8_impl (str, len, items_read, items_written, err, /* treatAsLE */ false); } gchar * g_utf16le_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err) { - return eg_utf16_to_utf8_general (str, len, items_read, items_written, NULL, NULL, err, G_LITTLE_ENDIAN); -} - -gchar * -g_utf16be_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err) -{ - return eg_utf16_to_utf8_general (str, len, items_read, items_written, NULL, NULL, err, G_BIG_ENDIAN); + return g_utf16_to_utf8_impl (str, len, items_read, items_written, err, /* treatAsLE */ true); } gchar * g_utf16_to_utf8_custom_alloc (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err) { - return eg_utf16_to_utf8_general (str, len, items_read, items_written, custom_alloc_func, custom_alloc_data, err, G_BYTE_ORDER); + errno = 0; + + if (len < 0) { + 
len = 0; + while (str[len]) + len++; + + len++; + } + + glong ret = (glong)minipal_get_length_utf16_to_utf8 (str, len, 0); + map_error(err); + + if (items_written) + *items_written = errno == 0 ? ret : 0; + + if (ret <= 0) + return NULL; + + gchar *lpDestStr = custom_alloc_func((ret + 1) * sizeof (gunichar2), custom_alloc_data); + if (G_UNLIKELY (!lpDestStr)) { + g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_MEMORY, "Allocation failed."); + return NULL; + } + + ret = (glong)minipal_convert_utf16_to_utf8 (str, len, lpDestStr, ret, 0); + lpDestStr[ret] = '\0'; + + map_error(err); + return lpDestStr; } gunichar * diff --git a/src/mono/mono/eglib/glib.h b/src/mono/mono/eglib/glib.h index e438c00298ec7..fcd8d2e37bdae 100644 --- a/src/mono/mono/eglib/glib.h +++ b/src/mono/mono/eglib/glib.h @@ -882,14 +882,11 @@ gunichar *g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_writte gunichar *g_utf8_to_ucs4 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err); G_EXTERN_C // Used by libtest, at least. gunichar2 *g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err); -gunichar2 *g_utf8_to_utf16be (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err); gunichar2 *g_utf8_to_utf16le (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err); -gunichar2 *eg_utf8_to_utf16_with_nuls (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err); gunichar2 *eg_wtf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err); G_EXTERN_C // Used by libtest, at least. 
gchar *g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err); gchar *g_utf16le_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err); -gchar *g_utf16be_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err); gunichar *g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err); gchar *g_ucs4_to_utf8 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **err); gunichar2 *g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **err); @@ -915,7 +912,6 @@ gpointer g_fixed_buffer_custom_allocator (gsize req_size, gpointer custom_alloc_data); gunichar2 *g_utf8_to_utf16_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err); -gunichar2 *g_utf8_to_utf16be_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err); gunichar2 *g_utf8_to_utf16le_custom_alloc (const gchar *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err); gchar *g_utf16_to_utf8_custom_alloc (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GCustomAllocator custom_alloc_func, gpointer custom_alloc_data, GError **err); diff --git a/src/mono/mono/eglib/test/utf8.c b/src/mono/mono/eglib/test/utf8.c index d36dbfaa54ed0..5602bbcbcb720 100644 --- a/src/mono/mono/eglib/test/utf8.c +++ b/src/mono/mono/eglib/test/utf8.c @@ -155,7 +155,7 @@ compare_utf8_to_utf16_explicit (const gunichar2 *expected, const gchar *utf8, gl gerror = NULL; if (include_nuls) - ret = eg_utf8_to_utf16_with_nuls (utf8, size_spec, &in_read, &out_read, &gerror); + ret = g_utf8_to_utf16 
(utf8, size_spec, &in_read, &out_read, &gerror); else ret = g_utf8_to_utf16 (utf8, size_spec, &in_read, &out_read, &gerror); @@ -271,7 +271,7 @@ test_utf8_to_utf16_with_nuls (void) #endif /* implicit length is forbidden */ - if (eg_utf8_to_utf16_with_nuls (src1, -1, NULL, NULL, NULL) != NULL) + if (g_utf8_to_utf16 (src1, -1, NULL, NULL, NULL) != NULL) return FAILED ("explicit nulls must fail with -1 length\n"); /* empty string */ @@ -699,7 +699,7 @@ utf8_byteslen (const gchar *src) static Test utf8_tests [] = { {"g_utf16_to_utf8", test_utf16_to_utf8}, {"g_utf8_to_utf16", test_utf8_to_utf16}, - {"g_utf8_to_utf16_with_nuls", test_utf8_to_utf16_with_nuls}, + {"g_utf8_to_utf16_nuls", test_utf8_to_utf16_with_nuls}, {"g_utf8_seq", test_utf8_seq}, {"g_ucs4_to_utf16", test_ucs4_to_utf16 }, {"g_utf16_to_ucs4", test_utf16_to_ucs4 }, diff --git a/src/mono/mono/metadata/object.c b/src/mono/mono/metadata/object.c index 8604114fe520f..b0289cebf414a 100644 --- a/src/mono/mono/metadata/object.c +++ b/src/mono/mono/metadata/object.c @@ -327,7 +327,7 @@ get_type_init_exception_for_vtable (MonoVTable *vtable) mono_mem_manager_init_reflection_hashes (mem_manager); - /* + /* * If the initializing thread was rudely aborted, the exception is not stored * in the hash. 
*/ @@ -6361,7 +6361,7 @@ mono_string_new_utf8_len (const char *text, guint length, MonoError *error) gunichar2 *ut = NULL; glong items_written; - ut = eg_utf8_to_utf16_with_nuls (text, length, NULL, &items_written, &eg_error); + ut = g_utf8_to_utf16 (text, length, NULL, &items_written, &eg_error); if (eg_error) { o = NULL_HANDLE_STRING; diff --git a/src/mono/mono/mini/CMakeLists.txt b/src/mono/mono/mini/CMakeLists.txt index 8e60babe7bb83..6f5e8507315db 100644 --- a/src/mono/mono/mini/CMakeLists.txt +++ b/src/mono/mono/mini/CMakeLists.txt @@ -551,7 +551,7 @@ if(NOT DISABLE_EXECUTABLES) target_link_libraries(mono-sgen PRIVATE monoapi eglib_api monosgen-static) if(HAVE_ICU_SHIM) target_link_libraries(mono-sgen PRIVATE icu_shim_objects) - endif() + endif() target_link_libraries(mono-sgen PRIVATE ${OS_LIBS} ${LLVM_LIBS} ${ICU_LIBS} ${Z_LIBS}) # Alpine Linux implements ucontext in a different library if(CLR_CMAKE_HOST_ALPINE_LINUX AND TARGET_S390X) diff --git a/src/native/minipal/utf8.c b/src/native/minipal/utf8.c new file mode 100644 index 0000000000000..a54b805540f89 --- /dev/null +++ b/src/native/minipal/utf8.c @@ -0,0 +1,2149 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +#include + +#include +#include +#include +#include + +#define HIGH_SURROGATE_START 0xd800 +#define HIGH_SURROGATE_END 0xdbff +#define LOW_SURROGATE_START 0xdc00 +#define LOW_SURROGATE_END 0xdfff + +// Test if the wide character is a high surrogate +static bool IsHighSurrogate(const CHAR16_T c) +{ + return (c & 0xFC00) == HIGH_SURROGATE_START; +} + +// Test if the wide character is a low surrogate +static bool IsLowSurrogate(const CHAR16_T c) +{ + return (c & 0xFC00) == LOW_SURROGATE_START; +} + +// Test if the wide character is a surrogate half +static bool IsSurrogate(const CHAR16_T c) +{ + return (c & 0xF800) == HIGH_SURROGATE_START; +} + +typedef struct +{ + // Store our default string + unsigned char* byteStart; + CHAR16_T* charEnd; + const CHAR16_T strDefault[2]; + int strDefaultLength; + int fallbackCount; + int fallbackIndex; +} DecoderBuffer; + +static CHAR16_T DecoderReplacementFallbackBuffer_GetNextChar(DecoderBuffer* self) +{ + // We want it to get < 0 because == 0 means that the current/last character is a fallback + // and we need to detect recursion. We could have a flag but we already have this counter. + self->fallbackCount--; + self->fallbackIndex++; + + // Do we have anything left? 0 is now last fallback char, negative is nothing left + if (self->fallbackCount < 0) + return '\0'; + + // Need to get it out of the buffer. + // Make sure it didn't wrap from the fast count-- path + if (self->fallbackCount == INT_MAX) + { + self->fallbackCount = -1; + return '\0'; + } + + // Now make sure its in the expected range + assert(self->fallbackIndex < self->strDefaultLength && self->fallbackIndex >= 0); + + return self->strDefault[self->fallbackIndex]; +} + +// Fallback Methods +static bool DecoderReplacementFallbackBuffer_Fallback(DecoderBuffer* self) +{ + // We expect no previous fallback in our buffer + // We can't call recursively but others might (note, we don't test on last char!!!) 
+ assert(self->fallbackCount < 1); + + // Go ahead and get our fallback + if (self->strDefaultLength == 0) + return false; + + self->fallbackCount = self->strDefaultLength; + self->fallbackIndex = -1; + + return true; +} + +// Fallback the current byte by sticking it into the remaining char buffer. +// This can only be called by our encodings (other have to use the public fallback methods), so +// we can use our DecoderNLS here too (except we don't). +// Returns true if we are successful, false if we can't fallback the character (no buffer space) +// So caller needs to throw buffer space if return false. +// Right now this has both bytes and bytes[], since we might have extra bytes, hence the +// array, and we might need the index, hence the byte* +// Don't touch ref chars unless we succeed +static bool DecoderReplacementFallbackBuffer_InternalFallback_Copy(DecoderBuffer* self, CHAR16_T** chars, CHAR16_T* pAllocatedBufferEnd) +{ + assert(self->byteStart != NULL); + + bool fallbackResult = DecoderReplacementFallbackBuffer_Fallback(self); + + // See if there's a fallback character and we have an output buffer then copy our string. 
+ if (fallbackResult) + { + // Copy the chars to our output + CHAR16_T ch; + CHAR16_T* charTemp = *chars; + bool bHighSurrogate = false; + (void)bHighSurrogate; // unused in release build + while ((ch = DecoderReplacementFallbackBuffer_GetNextChar(self)) != 0) + { + // Make sure no mixed up surrogates + if (IsSurrogate(ch)) + { + if (IsHighSurrogate(ch)) + { + // High Surrogate + assert(!bHighSurrogate); + bHighSurrogate = true; + } + else + { + // Low surrogate + assert(bHighSurrogate); + bHighSurrogate = false; + } + } + + if (charTemp >= self->charEnd) + { + // No buffer space + return false; + } + + *(charTemp++) = ch; + if (charTemp > pAllocatedBufferEnd) + { + errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; + return false; + } + } + + // Need to make sure that bHighSurrogate isn't true + assert(!bHighSurrogate); + + // Now we aren't going to be false, so its OK to update chars + *chars = charTemp; + } + + return true; +} + +// Clear the buffer +static void DecoderReplacementFallbackBuffer_Reset(DecoderBuffer* self) +{ + self->fallbackCount = -1; + self->fallbackIndex = -1; + self->byteStart = NULL; +} + +typedef struct +{ + const CHAR16_T strDefault[3]; + int strDefaultLength; + CHAR16_T* charStart; + CHAR16_T* charEnd; + bool setEncoder; + bool bUsedEncoder; + bool bFallingBack; + int iRecursionCount; + int fallbackCount; + int fallbackIndex; +} EncoderBuffer; + +#define MAX_RECURSION 250 + +// Set the above values +// This can't be part of the constructor because EncoderFallbacks would have to know how to implement these. 
+static void EncoderReplacementFallbackBuffer_InternalInitialize(EncoderBuffer* self, CHAR16_T* charStart, CHAR16_T* charEnd, bool setEncoder) +{ + self->charStart = charStart; + self->charEnd = charEnd; + self->setEncoder = setEncoder; + self->bUsedEncoder = false; + self->bFallingBack = false; + self->iRecursionCount = 0; +} + +static CHAR16_T EncoderReplacementFallbackBuffer_InternalGetNextChar(EncoderBuffer* self) +{ + // We want it to get < 0 because == 0 means that the current/last character is a fallback + // and we need to detect recursion. We could have a flag but we already have this counter. + self->fallbackCount--; + self->fallbackIndex++; + + // Do we have anything left? 0 is now last fallback char, negative is nothing left + if (self->fallbackCount < 0) + return '\0'; + + // Need to get it out of the buffer. + // Make sure it didn't wrap from the fast count-- path + if (self->fallbackCount == INT_MAX) + { + self->fallbackCount = -1; + return '\0'; + } + + // Now make sure its in the expected range + assert(self->fallbackIndex < self->strDefaultLength && self->fallbackIndex >= 0); + + CHAR16_T ch = self->strDefault[self->fallbackIndex]; + self->bFallingBack = (ch != 0); + if (ch == 0) self->iRecursionCount = 0; + return ch; +} + +// Fallback Methods +static bool EncoderReplacementFallbackBuffer_Fallback(EncoderBuffer* self) +{ + // If we had a buffer already we're being recursive, throw, it's probably at the suspect + // character in our array. + assert(self->fallbackCount < 1); + + // Go ahead and get our fallback + // Divide by 2 because we aren't a surrogate pair + self->fallbackCount = self->strDefaultLength / 2; + self->fallbackIndex = -1; + + return self->fallbackCount != 0; +} + +static bool EncoderReplacementFallbackBuffer_Fallback_Unknown(EncoderBuffer* self) +{ + // If we had a buffer already we're being recursive, throw, it's probably at the suspect + // character in our array. 
+ assert(self->fallbackCount < 1); + + // Go ahead and get our fallback + self->fallbackCount = self->strDefaultLength; + self->fallbackIndex = -1; + + return self->fallbackCount != 0; +} + +// Fallback the current character using the remaining buffer and encoder if necessary +// This can only be called by our encodings (other have to use the public fallback methods), so +// we can use our EncoderNLS here too. +// setEncoder is true if we're calling from a GetBytes method, false if we're calling from a GetByteCount +// +// Note that this could also change the contents of self->buffer.encoder, which is the same +// object that the caller is using, so the caller could mess up the encoder for us +// if they aren't careful. +static bool EncoderReplacementFallbackBuffer_InternalFallback(EncoderBuffer* self, CHAR16_T ch, CHAR16_T** chars) +{ + // Shouldn't have null charStart + assert(self->charStart != NULL); + + // See if it was a high surrogate + if (IsHighSurrogate(ch)) + { + // See if there's a low surrogate to go with it + if (*chars >= self->charEnd) + { + // Nothing left in input buffer + // No input, return 0 + } + else + { + // Might have a low surrogate + CHAR16_T cNext = **chars; + if (IsLowSurrogate(cNext)) + { + // If already falling back then fail + assert(!self->bFallingBack || self->iRecursionCount++ <= MAX_RECURSION); + + // Next is a surrogate, add it as surrogate pair, and increment chars + (*chars)++; + self->bFallingBack = EncoderReplacementFallbackBuffer_Fallback_Unknown(self); + return self->bFallingBack; + } + + // Next isn't a low surrogate, just fallback the high surrogate + } + } + + // If already falling back then fail + assert(!self->bFallingBack || self->iRecursionCount++ <= MAX_RECURSION); + + // Fall back our char + self->bFallingBack = EncoderReplacementFallbackBuffer_Fallback(self); + + return self->bFallingBack; +} + +static bool EncoderReplacementFallbackBuffer_MovePrevious(EncoderBuffer* self) +{ + // Back up one, only if we just 
processed the last character (or earlier) + if (self->fallbackCount >= -1 && self->fallbackIndex >= 0) + { + self->fallbackIndex--; + self->fallbackCount++; + return true; + } + + // Return false 'cause we couldn't do it. + return false; +} + +typedef struct +{ + union + { + DecoderBuffer decoder; + EncoderBuffer encoder; + } buffer; + + bool useFallback; + +#if BIGENDIAN + bool treatAsLE; +#endif +} UTF8Encoding; + +// These are bitmasks used to maintain the state in the decoder. They occupy the higher bits +// while the actual character is being built in the lower bits. They are shifted together +// with the actual bits of the character. + +// bits 30 & 31 are used for pending bits fixup +#define FinalByte (1 << 29) +#define SupplimentarySeq (1 << 28) +#define ThreeByteSeq (1 << 27) + +static bool InRange(int c, int begin, int end) +{ + return begin <= c && c <= end; +} + +// During GetChars we had an invalid byte sequence +// pSrc is backed up to the start of the bad sequence if we didn't have room to +// fall it back. Otherwise pSrc remains where it is. 
+static bool FallbackInvalidByteSequence_Copy(UTF8Encoding* self, unsigned char** pSrc, CHAR16_T** pTarget, CHAR16_T* pAllocatedBufferEnd) +{ + assert(self->useFallback); + + // Get our byte[] + unsigned char* pStart = *pSrc; + bool fallbackResult = DecoderReplacementFallbackBuffer_InternalFallback_Copy(&self->buffer.decoder, pTarget, pAllocatedBufferEnd); + + // Do the actual fallback + if (!fallbackResult) + { + // Oops, it failed, back up to pStart + *pSrc = pStart; + return false; + } + + // It worked + return true; +} + +static size_t GetCharCount(UTF8Encoding* self, unsigned char* bytes, size_t count) +{ + assert(bytes != NULL); + assert(count >= 0); + + // Initialize stuff + unsigned char *pSrc = bytes; + unsigned char *pEnd = pSrc + count; + int availableBytes, chc; + + // Start by assuming we have as many as count, charCount always includes the adjustment + // for the character being decoded + size_t charCount = count; + int ch = 0; + bool fallbackUsed = false; + + while (true) + { + // SLOWLOOP: does all range checks, handles all special cases, but it is slow + if (pSrc >= pEnd) break; + + // read next byte. 
The JIT optimization seems to be getting confused when + // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead + int cha = *pSrc; + + // no pending bits + if (ch == 0) goto ReadChar; + + pSrc++; + + // we are expecting to see trailing bytes like 10vvvvvv + if ((cha & 0xC0) != 0x80) + { + // This can be a valid starting byte for another UTF8 byte sequence, so let's put + // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence + pSrc--; + charCount += (ch >> 30); + goto InvalidByteSequence; + } + + // fold in the new byte + ch = (ch << 6) | (cha & 0x3F); + + if ((ch & FinalByte) == 0) + { + assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0); + + if ((ch & SupplimentarySeq) != 0) + { + if ((ch & (FinalByte >> 6)) != 0) + { + // this is 3rd byte (of 4 byte supplimentary) - nothing to do + continue; + } + + // 2nd byte, check for non-shortest form of supplimentary char and the valid + // supplimentary characters in range 0x010000 - 0x10FFFF at the same time + if (!InRange(ch & 0x1F0, 0x10, 0x100)) + { + goto InvalidByteSequence; + } + } + else + { + // Must be 2nd byte of a 3-byte sequence + // check for non-shortest form of 3 byte seq + if ((ch & (0x1F << 5)) == 0 || // non-shortest form + (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate + { + goto InvalidByteSequence; + } + } + continue; + } + + // ready to punch + + // adjust for surrogates in non-shortest form + if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq) charCount--; + + goto EncodeChar; + + InvalidByteSequence: + if (!self->useFallback) + { + errno = MINIPAL_ERROR_NO_UNICODE_TRANSLATION; + return 0; + } + + if (!fallbackUsed) + { + fallbackUsed = true; + self->buffer.decoder.byteStart = bytes; + self->buffer.decoder.charEnd = NULL; + } + charCount += self->buffer.decoder.strDefaultLength; + + ch = 0; + continue; + + ReadChar: + ch = *pSrc; + pSrc++; + + ProcessChar: + if (ch > 0x7F) + { + // If its > 
0x7F, its start of a new multi-byte sequence + + // Long sequence, so unreserve our char. + charCount--; + + // bit 6 has to be non-zero for start of multibyte chars. + if ((ch & 0x40) == 0) goto InvalidByteSequence; + + // start a new long code + if ((ch & 0x20) != 0) + { + if ((ch & 0x10) != 0) + { + // 4 byte encoding - supplimentary character (2 surrogates) + + ch &= 0x0F; + + // check that bit 4 is zero and the valid supplimentary character + // range 0x000000 - 0x10FFFF at the same time + if (ch > 0x04) + { + ch |= 0xf0; + goto InvalidByteSequence; + } + + // Add bit flags so that when we check new characters & rotate we'll be flagged correctly. + // Final byte flag, count fix if we don't make final byte & supplimentary sequence flag. + ch |= (FinalByte >> 3 * 6) | // Final byte is 3 more bytes from now + (1 << 30) | // If it dies on next byte we'll need an extra char + (3 << (30 - 2 * 6)) | // If it dies on last byte we'll need to subtract a char + (SupplimentarySeq) | (SupplimentarySeq >> 6) | + (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6); + + // Our character count will be 2 characters for these 4 bytes, so subtract another char + charCount--; + } + else + { + // 3 byte encoding + // Add bit flags so that when we check new characters & rotate we'll be flagged correctly. + ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) | + (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6)); + + // We'll expect 1 character for these 3 bytes, so subtract another char. 
+ charCount--; + } + } + else + { + // 2 byte encoding + + ch &= 0x1F; + + // check for non-shortest form + if (ch <= 1) + { + ch |= 0xc0; + goto InvalidByteSequence; + } + + // Add bit flags so we'll be flagged correctly + ch |= (FinalByte >> 6); + } + continue; + } + + EncodeChar: + + availableBytes = pEnd - pSrc; + + // don't fall into the fast decoding loop if we don't have enough bytes + if (availableBytes <= 13) + { + // try to get over the remainder of the ascii characters fast though + unsigned char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered + while (pSrc < pLocalEnd) + { + ch = *pSrc; + pSrc++; + + if (ch > 0x7F) + goto ProcessChar; + } + // we are done + ch = 0; + break; + } + + // To compute the upper bound, assume that all characters are ASCII characters at this point, + // the boundary will be decreased for every non-ASCII character we encounter + // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences + unsigned char *pStop = pSrc + availableBytes - 7; + + while (pSrc < pStop) + { + ch = *pSrc; + pSrc++; + + if (ch > 0x7F) + { + goto LongCode; + } + + // get pSrc 2-byte aligned + if (((size_t)pSrc & 0x1) != 0) + { + ch = *pSrc; + pSrc++; + if (ch > 0x7F) + { + goto LongCode; + } + } + + // get pSrc 4-byte aligned + if (((size_t)pSrc & 0x2) != 0) + { + ch = *(unsigned short*)pSrc; + if ((ch & 0x8080) != 0) + { + goto LongCodeWithMask16; + } + pSrc += 2; + } + + + // Run 8 + 8 characters at a time! 
+ while (pSrc < pStop) + { + ch = *(int*)pSrc; + int chb = *(int*)(pSrc + 4); + if (((ch | chb) & (int)0x80808080) != 0) + { + goto LongCodeWithMask32; + } + pSrc += 8; + + // This is a really small loop - unroll it + if (pSrc >= pStop) + break; + + ch = *(int*)pSrc; + chb = *(int*)(pSrc + 4); + if (((ch | chb) & (int)0x80808080) != 0) + { + goto LongCodeWithMask32; + } + pSrc += 8; + } + break; + + LongCodeWithMask32 : +#if BIGENDIAN + // be careful about the sign extension + if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 16); + else +#endif + ch &= 0xFF; + + LongCodeWithMask16: +#if BIGENDIAN + if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 8); + else +#endif + ch &= 0xFF; + + pSrc++; + if (ch <= 0x7F) + { + continue; + } + + LongCode: + chc = *pSrc; + pSrc++; + + if ( + // bit 6 has to be zero + (ch & 0x40) == 0 || + // we are expecting to see trailing bytes like 10vvvvvv + (chc & 0xC0) != 0x80) + { + goto BadLongCode; + } + + chc &= 0x3F; + + // start a new long code + if ((ch & 0x20) != 0) + { + // fold the first two bytes together + chc |= (ch & 0x0F) << 6; + + if ((ch & 0x10) != 0) + { + // 4 byte encoding - surrogate + ch = *pSrc; + if ( + // check that bit 4 is zero, the non-shortest form of surrogate + // and the valid surrogate range 0x000000 - 0x10FFFF at the same time + !InRange(chc >> 4, 0x01, 0x10) || + // we are expecting to see trailing bytes like 10vvvvvv + (ch & 0xC0) != 0x80) + { + goto BadLongCode; + } + + chc = (chc << 6) | (ch & 0x3F); + + ch = *(pSrc + 1); + // we are expecting to see trailing bytes like 10vvvvvv + if ((ch & 0xC0) != 0x80) + { + goto BadLongCode; + } + pSrc += 2; + + // extra byte + charCount--; + } + else + { + // 3 byte encoding + ch = *pSrc; + if ( + // check for non-shortest form of 3 byte seq + (chc & (0x1F << 5)) == 0 || + // Can't have surrogates here. 
// Advance the enclosing function's local `pTarget` by one element and bail out
// if it moved past `pAllocatedBufferEnd`: errno is set to
// MINIPAL_ERROR_INSUFFICIENT_BUFFER and the *enclosing function* returns 0.
//
// The macro relies on locals named `pTarget` and `pAllocatedBufferEnd` being in
// scope, and it expands to two statements (it is not wrapped in do { } while (0)),
// so it must only be used inside a braced block.
#define ENSURE_BUFFER_INC \
    pTarget++; \
    if (pTarget > pAllocatedBufferEnd) \
    { \
        errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; \
        return 0; \
    }

// Decode `byteCount` UTF-8 bytes starting at `bytes` into UTF-16 code units
// written to `chars`, which has room for `charCount` code units.
//
// Returns the number of UTF-16 code units written (pTarget - chars).
// Returns 0 with errno set when the conversion cannot be completed:
//   - MINIPAL_ERROR_NO_UNICODE_TRANSLATION: an invalid byte sequence was found
//     and self->useFallback is false.
//   - MINIPAL_ERROR_INSUFFICIENT_BUFFER: the output buffer ran out, including
//     the case where input bytes remain unconsumed when the loop exits.
//
// When self->useFallback is true, invalid sequences are handed to
// FallbackInvalidByteSequence_Copy (defined earlier in this file); judging by
// its use here it returns false on failure and may set errno.
//
// State: `ch` carries the bits of a multi-byte sequence that is still being
// assembled, combined with the marker bits FinalByte / SupplimentarySeq /
// ThreeByteSeq (defined earlier in this file). ch == 0 means nothing is pending.
//
// Structure: an outer "slow loop" that validates byte by byte, and an inner
// "fast loop" that is only entered when more than 13 input bytes remain and
// that falls back to the slow loop (BadLongCode) on anything suspicious.
//
// NOTE(review): byteCount and charCount are size_t, so the two `>= 0` asserts
// below are always true.
// NOTE(review): the return type is int and availableChars/availableBytes narrow
// pointer differences to int, while the sibling GetBytes/GetByteCount return
// size_t - confirm inputs larger than INT_MAX are not expected here.
static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, CHAR16_T* chars, size_t charCount)
{
    assert(chars != NULL);
    assert(byteCount >= 0);
    assert(charCount >= 0);
    assert(bytes != NULL);

    unsigned char *pSrc = bytes;
    CHAR16_T *pTarget = chars;

    unsigned char *pEnd = pSrc + byteCount;
    CHAR16_T *pAllocatedBufferEnd = pTarget + charCount;

    int ch = 0;
    int chc;

    bool fallbackUsed = false;

    while (true)
    {
        // SLOWLOOP: does all range checks, handles all special cases, but it is slow

        if (pSrc >= pEnd) break;

        // read next byte. The JIT optimization seems to be getting confused when
        // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
        int cha = *pSrc;

        if (ch == 0)
        {
            // no pending bits
            goto ReadChar;
        }

        pSrc++;

        // we are expecting to see trailing bytes like 10vvvvvv
        if ((cha & 0xC0) != 0x80)
        {
            // This can be a valid starting byte for another UTF8 byte sequence, so let's put
            // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
            pSrc--;
            goto InvalidByteSequence;
        }

        // fold in the new byte
        ch = (ch << 6) | (cha & 0x3F);

        if ((ch & FinalByte) == 0)
        {
            // Not at last byte yet
            assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0);

            if ((ch & SupplimentarySeq) != 0)
            {
                // It's a 4-byte supplementary sequence
                if ((ch & (FinalByte >> 6)) != 0)
                {
                    // this is 3rd byte of 4 byte sequence - nothing to do
                    continue;
                }

                // 2nd byte of 4 bytes
                // check for non-shortest form of surrogate and the valid surrogate
                // range 0x000000 - 0x10FFFF at the same time
                if (!InRange(ch & 0x1F0, 0x10, 0x100))
                {
                    goto InvalidByteSequence;
                }
            }
            else
            {
                // Must be 2nd byte of a 3-byte sequence
                // check for non-shortest form of 3 byte seq
                if ((ch & (0x1F << 5)) == 0 || // non-shortest form
                    (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate
                {
                    goto InvalidByteSequence;
                }
            }
            continue;
        }

        // ready to punch

        // surrogate in shortest form?
        // Might be possible to get rid of this? Already did non-shortest check for 4-byte sequence when reading 2nd byte?
        if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq)
        {
            // Only emit the high surrogate if there is room for it; otherwise leave ch
            // untouched so the buffer check at EncodeChar handles the overflow.
            if (pTarget < pAllocatedBufferEnd)
            {
                *pTarget = (CHAR16_T)(((ch >> 10) & 0x7FF) +
                    (HIGH_SURROGATE_START - (0x10000 >> 10)));

                ENSURE_BUFFER_INC

                // ch now holds the low surrogate, which EncodeChar writes
                ch = (ch & 0x3FF) +
                    (int)(LOW_SURROGATE_START);
            }
        }

        goto EncodeChar;

    InvalidByteSequence:
        // Caller asked for strict conversion: report the invalid sequence and stop.
        if (!self->useFallback)
        {
            errno = MINIPAL_ERROR_NO_UNICODE_TRANSLATION;
            return 0;
        }

        // this code fragment should be close to the gotos referencing it
        // Have to do fallback for invalid bytes
        if (!fallbackUsed)
        {
            // Lazily record the buffer bounds the fallback helper works against
            fallbackUsed = true;
            self->buffer.decoder.byteStart = bytes;
            self->buffer.decoder.charEnd = pAllocatedBufferEnd;
        }

        // That'll back us up the appropriate # of bytes if we didn't get anywhere
        if (!FallbackInvalidByteSequence_Copy(self, &pSrc, &pTarget, pAllocatedBufferEnd))
        {
            if (errno == MINIPAL_ERROR_INSUFFICIENT_BUFFER) return 0;

            // Check if we ran out of buffer space
            assert(pSrc >= bytes);

            DecoderReplacementFallbackBuffer_Reset(&self->buffer.decoder);
            ch = 0;
            break;
        }

        assert(pSrc >= bytes);

        ch = 0;
        continue;

    ReadChar:
        ch = *pSrc;
        pSrc++;

    ProcessChar:
        if (ch > 0x7F)
        {
            // If it's > 0x7F, it's the start of a new multi-byte sequence

            // bit 6 has to be non-zero
            if ((ch & 0x40) == 0) goto InvalidByteSequence;

            // start a new long code
            if ((ch & 0x20) != 0)
            {
                if ((ch & 0x10) != 0)
                {
                    // 4 byte encoding - supplementary character (2 surrogates)

                    ch &= 0x0F;

                    // check that bit 4 is zero and the valid supplementary character
                    // range 0x000000 - 0x10FFFF at the same time
                    if (ch > 0x04)
                    {
                        // restore the original lead byte before reporting it as invalid
                        ch |= 0xf0;
                        goto InvalidByteSequence;
                    }

                    ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) |
                        (SupplimentarySeq) | (SupplimentarySeq >> 6) |
                        (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
                }
                else
                {
                    // 3 byte encoding
                    ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
                        (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
                }
            }
            else
            {
                // 2 byte encoding

                ch &= 0x1F;

                // check for non-shortest form
                if (ch <= 1)
                {
                    // restore the original lead byte before reporting it as invalid
                    ch |= 0xc0;
                    goto InvalidByteSequence;
                }

                ch |= (FinalByte >> 6);
            }
            continue;
        }

    EncodeChar:
        // write the pending character
        if (pTarget >= pAllocatedBufferEnd)
        {
            // Output is full: rewind pSrc to the first byte of the character we could
            // not store (one step per byte of its encoding), then leave the loop.
            ch &= 0x1fffff;
            if (ch > 0x7f)
            {
                if (ch > 0x7ff)
                {
                    if (ch >= LOW_SURROGATE_START &&
                        ch <= LOW_SURROGATE_END)
                    {
                        pSrc--; // It was 4 bytes
                        pTarget--; // 1 was stored already, but we can't remember 1/2, so back up
                    }
                    else if (ch > 0xffff)
                    {
                        pSrc--; // It was 4 bytes, nothing was stored
                    }
                    pSrc--; // It was at least 3 bytes
                }
                pSrc--; // It was at least 2 bytes
            }
            pSrc--;

            assert(pSrc >= bytes);

            // Don't store ch in decoder, we already backed up to its start
            ch = 0;

            // Leave the loop; the pSrc < pEnd check after it reports the short buffer.
            break;
        }
        *pTarget = (CHAR16_T)ch;
        ENSURE_BUFFER_INC

        // NOTE(review): pointer differences are narrowed to int here.
        int availableChars = pAllocatedBufferEnd - pTarget;
        int availableBytes = pEnd - pSrc;

        // don't fall into the fast decoding loop if we don't have enough bytes
        // Test for availableChars is done because pStop would be <= pTarget.
        if (availableBytes <= 13)
        {
            // we may need as many as 1 character per byte
            if (availableChars < availableBytes)
            {
                // not enough output room. no pending bits at this point
                ch = 0;
                continue;
            }

            // try to get over the remainder of the ascii characters fast though
            unsigned char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
            while (pSrc < pLocalEnd)
            {
                ch = *pSrc;
                pSrc++;

                if (ch > 0x7F) goto ProcessChar;

                *pTarget = (CHAR16_T)ch;
                ENSURE_BUFFER_INC
            }
            // we are done
            ch = 0;
            break;
        }

        // we may need as many as 1 character per byte, so reduce the byte count if necessary.
        // If availableChars is too small, pStop will be before pTarget and we won't do fast loop.
        if (availableChars < availableBytes) availableBytes = availableChars;

        // To compute the upper bound, assume that all characters are ASCII characters at this point,
        // the boundary will be decreased for every non-ASCII character we encounter
        // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
        CHAR16_T *pStop = pTarget + availableBytes - 7;

        // FASTLOOP: optimistic decoding; anything unexpected jumps to BadLongCode,
        // which rewinds and lets the slow loop above deal with it.
        while (pTarget < pStop)
        {
            ch = *pSrc;
            pSrc++;

            if (ch > 0x7F) goto LongCode;

            *pTarget = (CHAR16_T)ch;
            ENSURE_BUFFER_INC

            // get pSrc to be 2-byte aligned
            if ((((size_t)pSrc) & 0x1) != 0)
            {
                ch = *pSrc;
                pSrc++;
                if (ch > 0x7F) goto LongCode;

                *pTarget = (CHAR16_T)ch;
                ENSURE_BUFFER_INC
            }

            // get pSrc to be 4-byte aligned
            if ((((size_t)pSrc) & 0x2) != 0)
            {
                // NOTE(review): reads two input bytes through an unsigned short* - confirm
                // this type-punned load is acceptable for the supported compilers.
                ch = *(unsigned short*)pSrc;
                if ((ch & 0x8080) != 0) goto LongCodeWithMask16;


                if (pTarget + 2 > pAllocatedBufferEnd)
                {
                    errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
                    return 0;
                }

                // Unfortunately, this is endianness sensitive
#if BIGENDIAN
                if (!self->treatAsLE)
                {
                    *pTarget = (CHAR16_T)((ch >> 8) & 0x7F);
                    pSrc += 2;
                    *(pTarget + 1) = (CHAR16_T)(ch & 0x7F);
                    pTarget += 2;
                }
                else
#endif
                {
                    *pTarget = (CHAR16_T)(ch & 0x7F);
                    pSrc += 2;
                    *(pTarget + 1) = (CHAR16_T)((ch >> 8) & 0x7F);
                    pTarget += 2;
                }
            }

            // Run 8 characters at a time!
            while (pTarget < pStop)
            {
                // Load 8 input bytes as two ints; any byte with its top bit set is non-ASCII
                ch = *(int*)pSrc;
                int chb = *(int*)(pSrc + 4);
                if (((ch | chb) & (int)0x80808080) != 0) goto LongCodeWithMask32;

                if (pTarget + 8 > pAllocatedBufferEnd)
                {
                    errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
                    return 0;
                }

                // Unfortunately, this is endianness sensitive
#if BIGENDIAN
                if (!self->treatAsLE)
                {
                    *pTarget = (CHAR16_T)((ch >> 24) & 0x7F);
                    *(pTarget + 1) = (CHAR16_T)((ch >> 16) & 0x7F);
                    *(pTarget + 2) = (CHAR16_T)((ch >> 8) & 0x7F);
                    *(pTarget + 3) = (CHAR16_T)(ch & 0x7F);
                    pSrc += 8;
                    *(pTarget + 4) = (CHAR16_T)((chb >> 24) & 0x7F);
                    *(pTarget + 5) = (CHAR16_T)((chb >> 16) & 0x7F);
                    *(pTarget + 6) = (CHAR16_T)((chb >> 8) & 0x7F);
                    *(pTarget + 7) = (CHAR16_T)(chb & 0x7F);
                    pTarget += 8;
                }
                else
#endif
                {
                    *pTarget = (CHAR16_T)(ch & 0x7F);
                    *(pTarget + 1) = (CHAR16_T)((ch >> 8) & 0x7F);
                    *(pTarget + 2) = (CHAR16_T)((ch >> 16) & 0x7F);
                    *(pTarget + 3) = (CHAR16_T)((ch >> 24) & 0x7F);
                    pSrc += 8;
                    *(pTarget + 4) = (CHAR16_T)(chb & 0x7F);
                    *(pTarget + 5) = (CHAR16_T)((chb >> 8) & 0x7F);
                    *(pTarget + 6) = (CHAR16_T)((chb >> 16) & 0x7F);
                    *(pTarget + 7) = (CHAR16_T)((chb >> 24) & 0x7F);
                    pTarget += 8;
                }
            }
            break;

            // The two labels below reduce the multi-byte load in ch to the first input
            // byte it covered, so that byte can be re-examined on its own.
        LongCodeWithMask32 :
#if BIGENDIAN
            // be careful about the sign extension
            if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 16);
            else
#endif
            ch &= 0xFF;

        LongCodeWithMask16:
#if BIGENDIAN
            if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 8);
            else
#endif
            ch &= 0xFF;

            pSrc++;
            if (ch <= 0x7F)
            {
                *pTarget = (CHAR16_T)ch;
                ENSURE_BUFFER_INC
                continue;
            }

        LongCode:
            chc = *pSrc;
            pSrc++;

            if (
                // bit 6 of the lead byte has to be non-zero
                (ch & 0x40) == 0 ||
                // we are expecting to see trailing bytes like 10vvvvvv
                (chc & 0xC0) != 0x80)
            {
                goto BadLongCode;
            }

            chc &= 0x3F;

            // start a new long code
            if ((ch & 0x20) != 0)
            {

                // fold the first two bytes together
                chc |= (ch & 0x0F) << 6;

                if ((ch & 0x10) != 0)
                {
                    // 4 byte encoding - surrogate
                    ch = *pSrc;
                    if (
                        // check that bit 4 is zero, the non-shortest form of surrogate
                        // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
                        !InRange(chc >> 4, 0x01, 0x10) ||
                        // we are expecting to see trailing bytes like 10vvvvvv
                        (ch & 0xC0) != 0x80)
                    {
                        goto BadLongCode;
                    }

                    chc = (chc << 6) | (ch & 0x3F);

                    ch = *(pSrc + 1);
                    // we are expecting to see trailing bytes like 10vvvvvv
                    if ((ch & 0xC0) != 0x80) goto BadLongCode;

                    pSrc += 2;

                    ch = (chc << 6) | (ch & 0x3F);

                    // write the high surrogate now; the low surrogate is written below
                    *pTarget = (CHAR16_T)(((ch >> 10) & 0x7FF) +
                        (HIGH_SURROGATE_START - (0x10000 >> 10)));
                    ENSURE_BUFFER_INC

                    ch = (ch & 0x3FF) + (LOW_SURROGATE_START);

                    // extra byte, we're already planning 2 chars for 2 of these bytes,
                    // but the big loop is testing the target against pStop, so we need
                    // to subtract 2 more or we risk overrunning the input. Subtract
                    // one here and one below.
                    pStop--;
                }
                else
                {
                    // 3 byte encoding
                    ch = *pSrc;
                    if (
                        // check for non-shortest form of 3 byte seq
                        (chc & (0x1F << 5)) == 0 ||
                        // Can't have surrogates here.
                        (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
                        // we are expecting to see trailing bytes like 10vvvvvv
                        (ch & 0xC0) != 0x80)
                    {
                        goto BadLongCode;
                    }
                    pSrc++;

                    ch = (chc << 6) | (ch & 0x3F);

                    // extra byte, we're only expecting 1 char for each of these 3 bytes,
                    // but the loop is testing the target (not source) against pStop, so
                    // we need to subtract 2 more or we risk overrunning the input.
                    // Subtract 1 here and one more below
                    pStop--;
                }
            }
            else
            {
                // 2 byte encoding

                ch &= 0x1F;

                // check for non-shortest form
                if (ch <= 1) goto BadLongCode;

                ch = (ch << 6) | chc;
            }

            *pTarget = (CHAR16_T)ch;
            ENSURE_BUFFER_INC

            // extra byte, we're only expecting 1 char for each of these 2 bytes,
            // but the loop is testing the target (not source) against pStop.
            // subtract an extra count from pStop so that we don't overrun the input.
            pStop--;
        }

        assert(pTarget <= pAllocatedBufferEnd);

        // no pending bits at this point
        ch = 0;
        continue;

    BadLongCode:
        // Rewind to the lead byte (the lead and the first trail byte were consumed)
        // and let the slow loop re-validate the sequence.
        pSrc -= 2;
        ch = 0;
        continue;
    }

    if (ch != 0)
    {
        // The loop ended with a partial sequence still pending in ch
        if (!self->useFallback)
        {
            assert(pSrc >= bytes || pTarget == chars);

            // Nothing was written at all: report the buffer as insufficient
            if (pTarget == chars)
            {
                errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
                return 0;
            }
        }
        assert(pSrc >= bytes);
        ch = 0;
    }

    // Shouldn't have anything in fallback buffer for GetChars
    // (don't have to check m_throwOnOverflow for chars)
    assert(!fallbackUsed || self->buffer.decoder.fallbackCount < 0);

    // Input left unconsumed means the output buffer was too small
    if (pSrc < pEnd)
    {
        errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
        return 0;
    }

    return pTarget - chars;
}
// Encode `charCount` UTF-16 code units starting at `chars` into UTF-8 bytes
// written to `bytes`, which has room for `byteCount` bytes.
//
// Returns the number of bytes written. Returns 0 with errno set to
// MINIPAL_ERROR_INSUFFICIENT_BUFFER when the output buffer is too small
// (either detected while writing, or because input remains when the loop exits).
//
// Lone or mis-ordered surrogates are not encoded directly: they are passed to
// the encoder replacement fallback kept in self->buffer.encoder
// (EncoderReplacementFallbackBuffer_* helpers, defined earlier in this file),
// and the characters that fallback produces are then encoded instead.
//
// State: `ch` holds either 0 (nothing pending) or a high surrogate waiting for
// its low surrogate.
//
// Uses ENSURE_BUFFER_INC, which advances pTarget and returns 0 from this
// function if pTarget moves past pAllocatedBufferEnd.
//
// NOTE(review): byteCount and charCount are size_t, so the two `>= 0` asserts
// below are always true.
// NOTE(review): the final return casts the byte count to int although the
// function returns size_t, and availableChars/availableBytes narrow pointer
// differences to int - confirm outputs larger than INT_MAX are not expected.
static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, unsigned char* bytes, size_t byteCount)
{
    assert(chars != NULL);
    assert(byteCount >= 0);
    assert(charCount >= 0);
    assert(bytes != NULL);

    // For fallback we may need a fallback buffer.
    // We wait to initialize it though in case we don't have any broken input unicode
    bool fallbackUsed = false;
    CHAR16_T *pSrc = chars;
    unsigned char *pTarget = bytes;

    CHAR16_T *pEnd = pSrc + charCount;
    unsigned char *pAllocatedBufferEnd = pTarget + byteCount;

    int ch = 0;
    int chd;

    // assume that JIT will enregister pSrc, pTarget and ch

    while (true)
    {
        // SLOWLOOP: does all range checks, handles all special cases, but it is slow

        if (pSrc >= pEnd)
        {
            // Input exhausted: drain the fallback buffer and any pending surrogate
            if (ch == 0)
            {
                // Check if there's anything left to get out of the fallback buffer
                ch = fallbackUsed ? EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder) : 0;
                if (ch > 0) goto ProcessChar;
            }
            else
            {
                // Case of leftover surrogates in the fallback buffer
                if (fallbackUsed && self->buffer.encoder.bFallingBack)
                {
                    assert(ch >= 0xD800 && ch <= 0xDBFF);

                    int cha = ch;

                    ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder);

                    if (InRange(ch, LOW_SURROGATE_START, LOW_SURROGATE_END))
                    {
                        // combine the pending high surrogate (cha) with the low surrogate (ch)
                        ch = ch + (cha << 10) + (0x10000 - LOW_SURROGATE_START - (HIGH_SURROGATE_START << 10));
                        goto EncodeChar;
                    }
                    else if (ch > 0)
                    {
                        goto ProcessChar;
                    }

                    break;
                }
            }

            // attempt to encode the partial surrogate (will fail or ignore)
            if (ch > 0) goto EncodeChar;

            // We're done
            break;
        }

        if (ch > 0)
        {
            // We have a high surrogate left over from a previous loop.
            assert(ch >= 0xD800 && ch <= 0xDBFF);

            // use separate helper variables for local contexts so that the jit optimizations
            // won't get confused about the variable lifetimes
            int cha = *pSrc;

            // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
            if (InRange(cha, LOW_SURROGATE_START, LOW_SURROGATE_END))
            {
                ch = cha + (ch << 10) +
                    (0x10000
                    - LOW_SURROGATE_START
                    - (HIGH_SURROGATE_START << 10));

                pSrc++;
            }
            // else ch is still high surrogate and encoding will fail

            // attempt to encode the surrogate or partial surrogate
            goto EncodeChar;
        }

        // If we've used a fallback, then we have to check for it
        if (fallbackUsed)
        {
            ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder);
            if (ch > 0) goto ProcessChar;
        }

        // read next char. The JIT optimization seems to be getting confused when
        // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
        ch = *pSrc;
        pSrc++;

    ProcessChar:
        // A high surrogate is kept in ch and paired on the next iteration
        if (InRange(ch, HIGH_SURROGATE_START, HIGH_SURROGATE_END)) continue;

        // either good char or partial surrogate

    EncodeChar:
        // anything still in the surrogate range here is unpaired - send it to the fallback
        if (InRange(ch, HIGH_SURROGATE_START, LOW_SURROGATE_END))
        {
            // Lone surrogates aren't allowed, we have to do fallback for them
            // Have to make a fallback buffer if we don't have one
            if (!fallbackUsed)
            {
                // wait on fallbacks if we can
                // For fallback we may need a fallback buffer
                fallbackUsed = true;

                // Set our internal fallback interesting things.
                EncoderReplacementFallbackBuffer_InternalInitialize(&self->buffer.encoder, chars, pEnd, true);
            }

            // Do our fallback. Actually we already know its a mixed up surrogate,
            // so the ref pSrc isn't gonna do anything.
            EncoderReplacementFallbackBuffer_InternalFallback(&self->buffer.encoder, (CHAR16_T)ch, &pSrc);

            // Drop the bad char; its replacement is fetched from the fallback buffer next iteration
            ch = 0;
            continue;
        }

        // Count bytes needed
        int bytesNeeded = 1;
        if (ch > 0x7F)
        {
            if (ch > 0x7FF)
            {
                if (ch > 0xFFFF)
                {
                    bytesNeeded++; // 4 bytes (surrogate pair)
                }
                bytesNeeded++; // 3 bytes (800-FFFF)
            }
            bytesNeeded++; // 2 bytes (80-7FF)
        }

        if (pTarget > pAllocatedBufferEnd - bytesNeeded)
        {
            // Not enough room for this character: un-consume it (from the fallback
            // buffer or from the input, whichever it came from) and stop.
            if (fallbackUsed && self->buffer.encoder.bFallingBack)
            {
                EncoderReplacementFallbackBuffer_MovePrevious(&self->buffer.encoder); // Didn't use this fallback char
                if (ch > 0xFFFF)
                    EncoderReplacementFallbackBuffer_MovePrevious(&self->buffer.encoder); // Was surrogate, didn't use 2nd part either
            }
            else
            {
                pSrc--; // Didn't use this char
                if (ch > 0xFFFF)
                    pSrc--; // Was surrogate, didn't use 2nd part either
            }

            assert(pSrc >= chars || pTarget == bytes);

            if (pTarget == bytes) // Nothing was written at all
            {
                errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
                return 0;
            }
            ch = 0; // Nothing left over (we backed up to start of pair if supplementary)
            break;
        }

        if (ch <= 0x7F)
        {
            *pTarget = (unsigned char)ch;
        }
        else
        {
            // use separate helper variables for local contexts so that the jit optimizations
            // won't get confused about the variable lifetimes
            int chb;
            if (ch <= 0x7FF)
            {
                // 2 unsigned char encoding
                chb = (unsigned char)(0xC0 | (ch >> 6));
            }
            else
            {
                if (ch <= 0xFFFF)
                {
                    // 3 byte encoding: lead byte
                    chb = (unsigned char)(0xE0 | (ch >> 12));
                }
                else
                {
                    // 4 byte encoding: write the lead byte, then fall through to the shared tail
                    *pTarget = (unsigned char)(0xF0 | (ch >> 18));
                    ENSURE_BUFFER_INC

                    chb = 0x80 | ((ch >> 12) & 0x3F);
                }
                *pTarget = (unsigned char)chb;
                ENSURE_BUFFER_INC

                chb = 0x80 | ((ch >> 6) & 0x3F);
            }
            *pTarget = (unsigned char)chb;
            ENSURE_BUFFER_INC

            // final continuation byte; pTarget is advanced by the ENSURE_BUFFER_INC below
            *pTarget = (unsigned char)0x80 | (ch & 0x3F);
        }

        ENSURE_BUFFER_INC

        // If still have fallback don't do fast loop
        if (fallbackUsed && (ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder)) != 0)
            goto ProcessChar;

        // NOTE(review): pointer differences are narrowed to int here.
        int availableChars = pEnd - pSrc;
        int availableBytes = pAllocatedBufferEnd - pTarget;

        // don't fall into the fast decoding loop if we don't have enough characters
        // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop.
        if (availableChars <= 13)
        {
            // we are hoping for 1 unsigned char per char
            if (availableBytes < availableChars)
            {
                // not enough output room. no pending bits at this point
                ch = 0;
                continue;
            }

            // try to get over the remainder of the ascii characters fast though
            CHAR16_T* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
            while (pSrc < pLocalEnd)
            {
                ch = *pSrc;
                pSrc++;

                // Not ASCII, need more than 1 unsigned char per char
                if (ch > 0x7F) goto ProcessChar;

                *pTarget = (unsigned char)ch;
                ENSURE_BUFFER_INC
            }
            // we are done, let ch be 0 to clear encoder
            ch = 0;
            break;
        }

        // we need at least 1 unsigned char per character, but Convert might allow us to convert
        // only part of the input, so try as much as we can. Reduce charCount if necessary
        if (availableBytes < availableChars)
        {
            availableChars = availableBytes;
        }

        // FASTLOOP:
        // - optimistic range checks
        // - fallbacks to the slow loop for all special cases, exception throwing, etc.

        // To compute the upper bound, assume that all characters are ASCII characters at this point,
        // the boundary will be decreased for every non-ASCII character we encounter
        // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
        // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop.
        CHAR16_T *pStop = pSrc + availableChars - 5;

        while (pSrc < pStop)
        {
            ch = *pSrc;
            pSrc++;

            if (ch > 0x7F) goto LongCode;

            *pTarget = (unsigned char)ch;
            ENSURE_BUFFER_INC

            // get pSrc aligned
            if (((size_t)pSrc & 0x2) != 0)
            {
                ch = *pSrc;
                pSrc++;
                if (ch > 0x7F) goto LongCode;

                *pTarget = (unsigned char)ch;
                ENSURE_BUFFER_INC
            }

            // Run 4 characters at a time!
            while (pSrc < pStop)
            {
                // Load 4 code units as two ints; any bit above 0x7F in a unit means non-ASCII
                ch = *(int*)pSrc;
                int chc = *(int*)(pSrc + 2);

                if (((ch | chc) & (int)0xFF80FF80) != 0) goto LongCodeWithMask;

                if (pTarget + 4 > pAllocatedBufferEnd)
                {
                    errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
                    return 0;
                }

                // Unfortunately, this is endianness sensitive
#if BIGENDIAN
                if (!self->treatAsLE)
                {
                    *pTarget = (unsigned char)(ch >> 16);
                    *(pTarget + 1) = (unsigned char)ch;
                    pSrc += 4;
                    *(pTarget + 2) = (unsigned char)(chc >> 16);
                    *(pTarget + 3) = (unsigned char)chc;
                    pTarget += 4;
                }
                else
#endif
                {
                    *pTarget = (unsigned char)ch;
                    *(pTarget + 1) = (unsigned char)(ch >> 16);
                    pSrc += 4;
                    *(pTarget + 2) = (unsigned char)chc;
                    *(pTarget + 3) = (unsigned char)(chc >> 16);
                    pTarget += 4;
                }
            }
            continue;

        LongCodeWithMask:
            // Reduce the two-unit load in ch to the first code unit it covered
#if BIGENDIAN
            // be careful about the sign extension
            if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 16);
            else
#endif
            ch = (CHAR16_T)ch;
            pSrc++;

            if (ch > 0x7F) goto LongCode;

            *pTarget = (unsigned char)ch;
            ENSURE_BUFFER_INC
            continue;

        LongCode:
            // use separate helper variables for slow and fast loop so that the jit optimizations
            // won't get confused about the variable lifetimes
            if (ch <= 0x7FF)
            {
                // 2 unsigned char encoding
                chd = 0xC0 | (ch >> 6);
            }
            else
            {
                if (!InRange(ch, HIGH_SURROGATE_START, LOW_SURROGATE_END))
                {
                    // 3 unsigned char encoding
                    chd = 0xE0 | (ch >> 12);
                }
                else
                {
                    // 4 unsigned char encoding - high surrogate + low surrogate
                    if (ch > HIGH_SURROGATE_END)
                    {
                        // low without high -> bad, try again in slow loop
                        pSrc -= 1;
                        break;
                    }

                    chd = *pSrc;
                    pSrc++;

                    if (!InRange(chd, LOW_SURROGATE_START, LOW_SURROGATE_END))
                    {
                        // high not followed by low -> bad, try again in slow loop
                        pSrc -= 2;
                        break;
                    }

                    ch = chd + (ch << 10) +
                        (0x10000
                        - LOW_SURROGATE_START
                        - (HIGH_SURROGATE_START << 10));

                    *pTarget = (unsigned char)(0xF0 | (ch >> 18));
                    // pStop - this unsigned char is compensated by the second surrogate character
                    // 2 input chars require 4 output bytes. 2 have been anticipated already
                    // and 2 more will be accounted for by the 2 pStop-- calls below.
                    ENSURE_BUFFER_INC

                    chd = 0x80 | ((ch >> 12) & 0x3F);
                }
                *pTarget = (unsigned char)chd;
                pStop--; // 3 unsigned char sequence for 1 char, so need pStop-- and the one below too.
                ENSURE_BUFFER_INC

                chd = 0x80 | ((ch >> 6) & 0x3F);
            }
            *pTarget = (unsigned char)chd;
            pStop--; // 2 unsigned char sequence for 1 char so need pStop--.
            ENSURE_BUFFER_INC

            *pTarget = (unsigned char)(0x80 | (ch & 0x3F));
            // pStop - this unsigned char is already included
            ENSURE_BUFFER_INC
        }

        assert(pTarget <= pAllocatedBufferEnd);

        // no pending char at this point
        ch = 0;
    }

    // Input left unconsumed means the output buffer was too small
    if (pSrc < pEnd)
    {
        errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER;
        return 0;
    }

    return (int)(pTarget - bytes);
}
// Count how many UTF-8 bytes are needed to encode `count` UTF-16 code units
// starting at `chars`. Nothing is written; only the count is returned.
//
// The count starts at one byte per code unit (byteCount = count) and is then
// adjusted: +1 for a 2-byte character, +2 for a 3-byte character, and a valid
// surrogate pair ends up contributing 4 bytes in total. Lone surrogates are
// passed to the encoder replacement fallback in self->buffer.encoder
// (EncoderReplacementFallbackBuffer_* helpers, defined earlier in this file),
// and the characters it produces are counted instead.
//
// State: `ch` holds either 0 (nothing pending) or a high surrogate waiting for
// its low surrogate.
//
// NOTE(review): byteCount is size_t, so the `byteCount < 0` overflow check and
// the `assert(byteCount >= 0)` under WIN64 can never detect anything.
// NOTE(review): availableChars narrows a pointer difference to int.
static size_t GetByteCount(UTF8Encoding* self, CHAR16_T *chars, size_t count)
{
    // For fallback we may need a fallback buffer.
    // We wait to initialize it though in case we don't have any broken input unicode
    bool fallbackUsed = false;
    CHAR16_T *pSrc = chars;
    CHAR16_T *pEnd = pSrc + count;

    // Start by assuming we have as many as count
    size_t byteCount = count;

    int ch = 0;

    while (true)
    {
        // SLOWLOOP: does all range checks, handles all special cases, but it is slow
        if (pSrc >= pEnd)
        {

            if (ch == 0)
            {
                // Unroll any fallback that happens at the end
                ch = fallbackUsed ? EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder) : 0;
                if (ch > 0)
                {
                    byteCount++;
                    goto ProcessChar;
                }
            }
            else
            {
                // Case of surrogates in the fallback.
                if (fallbackUsed && self->buffer.encoder.bFallingBack)
                {
                    assert(ch >= 0xD800 && ch <= 0xDBFF);

                    ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder);
                    byteCount++;

                    if (InRange(ch, LOW_SURROGATE_START, LOW_SURROGATE_END))
                    {
                        // valid pair: any non-surrogate value > 0x7ff gives the right count
                        ch = 0xfffd;
                        byteCount++;
                        goto EncodeChar;
                    }
                    else if (ch > 0)
                    {
                        goto ProcessChar;
                    }
                    else
                    {
                        byteCount--; // ignore last one.
                        break;
                    }
                }
            }

            if (ch <= 0)
            {
                break;
            }

            // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1.
            byteCount++;
            goto EncodeChar;
        }

        if (ch > 0)
        {
            // A high surrogate is pending from the previous iteration
            assert(ch >= 0xD800 && ch <= 0xDBFF);

            // use separate helper variables for local contexts so that the jit optimizations
            // won't get confused about the variable lifetimes
            int cha = *pSrc;

            // count the pending surrogate
            byteCount++;

            // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
            if (InRange(cha, LOW_SURROGATE_START, LOW_SURROGATE_END))
            {
                // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do.
                ch = 0xfffd;
                // ch = cha + (ch << 10) +
                //      (0x10000
                //      - LOW_SURROGATE_START
                //      - (HIGH_SURROGATE_START << 10) );

                // Use this next char
                pSrc++;
            }
            // else ch is still high surrogate and encoding will fail (so don't add count)

            // attempt to encode the surrogate or partial surrogate
            goto EncodeChar;
        }

        // If we've used a fallback, then we have to check for it
        if (fallbackUsed)
        {
            ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder);
            if (ch > 0)
            {
                // We have an extra byte we weren't expecting.
                byteCount++;
                goto ProcessChar;
            }
        }

        // read next char. The JIT optimization seems to be getting confused when
        // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
        ch = *pSrc;
        pSrc++;

    ProcessChar:
        if (InRange(ch, HIGH_SURROGATE_START, HIGH_SURROGATE_END))
        {
            // we will count this surrogate next time around
            byteCount--;
            continue;
        }
        // either good char or partial surrogate

    EncodeChar:
        // anything still in the surrogate range here is unpaired - send it to the fallback
        if (InRange(ch, HIGH_SURROGATE_START, LOW_SURROGATE_END))
        {
            // Lone surrogates aren't allowed
            // Have to make a fallback buffer if we don't have one
            if (!fallbackUsed)
            {
                // wait on fallbacks if we can
                // For fallback we may need a fallback buffer
                fallbackUsed = true;

                // Set our internal fallback interesting things.
                EncoderReplacementFallbackBuffer_InternalInitialize(&self->buffer.encoder, chars, chars + count, false);
            }

            // Do our fallback. Actually we already know its a mixed up surrogate,
            // so the ref pSrc isn't gonna do anything.
            EncoderReplacementFallbackBuffer_InternalFallback(&self->buffer.encoder, (CHAR16_T)ch, &pSrc);

            // Ignore it if we don't throw (we had preallocated this ch)
            byteCount--;
            ch = 0;
            continue;
        }

        // Count them
        if (ch > 0x7F)
        {
            if (ch > 0x7FF)
            {
                // the extra surrogate byte was compensated by the second surrogate character
                // (2 surrogates make 4 bytes. We've already counted 2 bytes, 1 per char)
                byteCount++;
            }
            byteCount++;
        }

#if WIN64
        // check for overflow
        // NOTE(review): byteCount is size_t, so this condition is never true.
        if (byteCount < 0)
        {
            break;
        }
#endif

        // If still have fallback don't do fast loop
        if (fallbackUsed && (ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder)) != 0)
        {
            // We're reserving 1 byte for each char by default
            byteCount++;
            goto ProcessChar;
        }

        int availableChars = pEnd - pSrc;

        // don't fall into the fast decoding loop if we don't have enough characters
        if (availableChars <= 13)
        {
            // try to get over the remainder of the ascii characters fast though
            CHAR16_T* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
            while (pSrc < pLocalEnd)
            {
                ch = *pSrc;
                pSrc++;
                if (ch > 0x7F) goto ProcessChar;
            }

            // we are done
            break;
        }

#if WIN64
        // make sure that we won't get a silent overflow inside the fast loop
        // (Fall out to slow loop if we have this many characters)
        availableChars &= 0x0FFFFFFF;
#endif

        // To compute the upper bound, assume that all characters are ASCII characters at this point,
        // the boundary will be decreased for every non-ASCII character we encounter
        // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
        CHAR16_T *pStop = pSrc + availableChars - (3 + 4);

        // FASTLOOP: counts without the fallback machinery; surrogate problems
        // drop back to the slow loop above.
        while (pSrc < pStop)
        {
            ch = *pSrc;
            pSrc++;

            if (ch > 0x7F) // Not ASCII
            {
                if (ch > 0x7FF) // Not 2 Byte
                {
                    if ((ch & 0xF800) == 0xD800) // See if it's a surrogate
                        goto LongCode;
                    byteCount++;
                }
                byteCount++;
            }

            // get pSrc aligned
            if (((size_t)pSrc & 0x2) != 0)
            {
                ch = *pSrc;
                pSrc++;
                if (ch > 0x7F) // Not ASCII
                {
                    if (ch > 0x7FF) // Not 2 Byte
                    {
                        if ((ch & 0xF800) == 0xD800) // See if it's a surrogate
                            goto LongCode;
                        byteCount++;
                    }
                    byteCount++;
                }
            }

            // Run 2 * 4 characters at a time!
            while (pSrc < pStop)
            {
                // Load 4 code units as two ints and classify them with bit masks
                ch = *(int*)pSrc;
                int chc = *(int*)(pSrc + 2);
                if (((ch | chc) & (int)0xFF80FF80) != 0) // See if not ASCII
                {
                    if (((ch | chc) & (int)0xF800F800) != 0) // See if not 2 Byte
                    {
                        goto LongCodeWithMask;
                    }


                    // every code unit here is below 0x800: add one byte per non-ASCII unit
                    if ((ch & (int)0xFF800000) != 0) // Actually 0x07800780 is all we care about (4 bits)
                        byteCount++;
                    if ((ch & (int)0xFF80) != 0)
                        byteCount++;
                    if ((chc & (int)0xFF800000) != 0)
                        byteCount++;
                    if ((chc & (int)0xFF80) != 0)
                        byteCount++;
                }
                pSrc += 4;

                // second, unrolled group of 4 code units
                ch = *(int*)pSrc;
                chc = *(int*)(pSrc + 2);
                if (((ch | chc) & (int)0xFF80FF80) != 0) // See if not ASCII
                {
                    if (((ch | chc) & (int)0xF800F800) != 0) // See if not 2 Byte
                    {
                        goto LongCodeWithMask;
                    }

                    if ((ch & (int)0xFF800000) != 0)
                        byteCount++;
                    if ((ch & (int)0xFF80) != 0)
                        byteCount++;
                    if ((chc & (int)0xFF800000) != 0)
                        byteCount++;
                    if ((chc & (int)0xFF80) != 0)
                        byteCount++;
                }
                pSrc += 4;
            }
            break;

        LongCodeWithMask:
            // Reduce the two-unit load in ch to the first code unit it covered
#if BIGENDIAN
            // be careful about the sign extension
            if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 16);
            else
#endif
            ch = (CHAR16_T)ch;

            pSrc++;

            if (ch <= 0x7F)
            {
                continue;
            }

        LongCode:
            // use separate helper variables for slow and fast loop so that the jit optimizations
            // won't get confused about the variable lifetimes
            if (ch > 0x7FF)
            {
                if (InRange(ch, HIGH_SURROGATE_START, LOW_SURROGATE_END))
                {
                    // 4 byte encoding - high surrogate + low surrogate

                    int chd = *pSrc;
                    if (
                        ch > HIGH_SURROGATE_END ||
                        !InRange(chd, LOW_SURROGATE_START, LOW_SURROGATE_END))
                    {
                        // Back up and drop out to slow loop to figure out error
                        pSrc--;
                        break;
                    }
                    pSrc++;

                    // byteCount - this byte is compensated by the second surrogate character
                }
                byteCount++;
            }
            byteCount++;

            // byteCount - the last byte is already included
        }

        // no pending char at this point
        ch = 0;
    }

#if WIN64
    // check for overflow
    assert(byteCount >= 0);
#endif
    assert(!fallbackUsed || self->buffer.encoder.fallbackCount < 0);

    return byteCount;
}
|| self->buffer.encoder.fallbackCount < 0); + + return byteCount; +} + +size_t minipal_get_length_utf8_to_utf16(const char* source, size_t sourceLength, unsigned int flags) +{ + errno = 0; + + if (sourceLength == 0) + return 0; + + UTF8Encoding enc = + { + .buffer = { .decoder = { .fallbackCount = -1, .fallbackIndex = -1, .strDefault = { 0xFFFD, 0 }, .strDefaultLength = 1 } }, + .useFallback = !(flags & MINIPAL_MB_NO_REPLACE_INVALID_CHARS), +#if BIGENDIAN + .treatAsLE = (flags & MINIPAL_TREAT_AS_LITTLE_ENDIAN) +#endif + }; + + return GetCharCount(&enc, (unsigned char*)source, sourceLength); +} + +size_t minipal_get_length_utf16_to_utf8(const CHAR16_T* source, size_t sourceLength, unsigned int flags) +{ + errno = 0; + + if (sourceLength == 0) + return 0; + + UTF8Encoding enc = + { + // repeat replacement char (0xFFFD) twice for a surrogate pair + .buffer = { .encoder = { .fallbackCount = -1, .fallbackIndex = -1, .strDefault = { 0xFFFD, 0xFFFD, 0 }, .strDefaultLength = 2 } }, + .useFallback = true, +#if BIGENDIAN + .treatAsLE = (flags & MINIPAL_TREAT_AS_LITTLE_ENDIAN) +#endif + }; + +#if !BIGENDIAN + (void)flags; // unused +#endif + + return GetByteCount(&enc, (CHAR16_T*)source, sourceLength); +} + +size_t minipal_convert_utf8_to_utf16(const char* source, size_t sourceLength, CHAR16_T* destination, size_t destinationLength, unsigned int flags) +{ + size_t ret; + errno = 0; + + if (sourceLength == 0) + return 0; + + UTF8Encoding enc = + { + .buffer = { .decoder = { .fallbackCount = -1, .fallbackIndex = -1, .strDefault = { 0xFFFD, 0 }, .strDefaultLength = 1 } }, + .useFallback = !(flags & MINIPAL_MB_NO_REPLACE_INVALID_CHARS), +#if BIGENDIAN + .treatAsLE = (flags & MINIPAL_TREAT_AS_LITTLE_ENDIAN) +#endif + }; + + ret = GetChars(&enc, (unsigned char*)source, sourceLength, destination, destinationLength); + if (errno) ret = 0; + + return ret; +} + +size_t minipal_convert_utf16_to_utf8(const CHAR16_T* source, size_t sourceLength, char* destination, size_t 
destinationLength, unsigned int flags) +{ + size_t ret; + errno = 0; + + if (sourceLength == 0) + return 0; + + UTF8Encoding enc = + { + // repeat replacement char (0xFFFD) twice for a surrogate pair + .buffer = { .encoder = { .fallbackCount = -1, .fallbackIndex = -1, .strDefault = { 0xFFFD, 0xFFFD, 0 }, .strDefaultLength = 2 } }, + .useFallback = true, +#if BIGENDIAN + .treatAsLE = (flags & MINIPAL_TREAT_AS_LITTLE_ENDIAN) +#endif + }; + +#if !BIGENDIAN + (void)flags; // unused +#endif + + ret = GetBytes(&enc, (CHAR16_T*)source, sourceLength, (unsigned char*)destination, destinationLength); + if (errno) ret = 0; + + return ret; +} diff --git a/src/native/minipal/utf8.h b/src/native/minipal/utf8.h new file mode 100644 index 0000000000000..bd648f137a2bb --- /dev/null +++ b/src/native/minipal/utf8.h @@ -0,0 +1,75 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +#ifndef HAVE_MINIPAL_UTF8_H +#define HAVE_MINIPAL_UTF8_H + +#include +#include +#include + +#define MINIPAL_MB_NO_REPLACE_INVALID_CHARS 0x00000008 +#define MINIPAL_TREAT_AS_LITTLE_ENDIAN 0x00000016 +#define MINIPAL_ERROR_INSUFFICIENT_BUFFER 122L +#define MINIPAL_ERROR_NO_UNICODE_TRANSLATION 1113L + +#ifdef __cplusplus +extern "C" +{ +#endif // __cplusplus + +#ifdef TARGET_WINDOWS +typedef wchar_t CHAR16_T; +#else +typedef unsigned short CHAR16_T; +#endif + +/** + * Get length of destination needed for UTF-8 to UTF-16 (UCS-2) conversion + * + * @param source The source string in UTF-8 format. + * @param sourceLength Length of the source string. + * @param flags Flags to alter the behavior of converter. Supported flags are MINIPAL_MB_NO_REPLACE_INVALID_CHARS and MINIPAL_TREAT_AS_LITTLE_ENDIAN. + * @return Length of UTF-16 buffer required by the conversion. 
+ */ +size_t minipal_get_length_utf8_to_utf16(const char* source, size_t sourceLength, unsigned int flags); + +/** + * Get length of destination needed for UTF-16 (UCS-2) to UTF-8 conversion + * + * @param source The source string in UTF-16 format. + * @param sourceLength Length of the source string. + * @param flags Flags to alter the behavior of converter. Only MINIPAL_TREAT_AS_LITTLE_ENDIAN is honored, and only on big-endian builds; MINIPAL_MB_NO_REPLACE_INVALID_CHARS is ignored for this direction. + * @return Length of UTF-8 buffer required by the conversion. + */ +size_t minipal_get_length_utf16_to_utf8(const CHAR16_T* source, size_t sourceLength, unsigned int flags); + +/** + * Convert a string from UTF-8 to UTF-16 (UCS-2) with preallocated memory + * + * @param source The source string in UTF-8 format. + * @param sourceLength Length of the source string. + * @param destination Pointer to the destination UTF-16 string. It can be NULL to query number of items required by the conversion. + * @param destinationLength Length of the destination string. + * @param flags Flags to alter the behavior of converter. Supported flags are MINIPAL_MB_NO_REPLACE_INVALID_CHARS and MINIPAL_TREAT_AS_LITTLE_ENDIAN. + * @return Number of items written by the conversion, or 0 if sourceLength is 0 or errno was set during the conversion. + */ +size_t minipal_convert_utf8_to_utf16(const char* source, size_t sourceLength, CHAR16_T* destination, size_t destinationLength, unsigned int flags); + +/** + * Convert a string from UTF-16 (UCS-2) to UTF-8 with preallocated memory + * + * @param source The source string in UTF-16 format. + * @param sourceLength Length of the source string. + * @param destination Pointer to the destination UTF-8 string. It can be NULL to query number of items required by the conversion. + * @param destinationLength Length of the destination string. + * @param flags Flags to alter the behavior of converter. Only MINIPAL_TREAT_AS_LITTLE_ENDIAN is honored, and only on big-endian builds; MINIPAL_MB_NO_REPLACE_INVALID_CHARS is ignored for this direction. + * @return Number of items written by the conversion, or 0 if sourceLength is 0 or errno was set during the conversion.
+ */ +size_t minipal_convert_utf16_to_utf8(const CHAR16_T* source, size_t sourceLength, char* destination, size_t destinationLength, unsigned int flags); + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif /* HAVE_MINIPAL_UTF8_H */