Skip to content

Commit

Permalink
Share UTF8 converters between coreclr and mono (#85558)
Browse files Browse the repository at this point in the history
* Share UTF8 converters between coreclr and mono - v1

* Revert "Share UTF8 converters between coreclr and mono - v1"

This reverts commit f9845ac6f53dc95fb747eb21351dfa9412397217.

* Share UTF8 converters between coreclr and mono - v2

* Remove C++ runtime dependency

* Initial C++ to C conversion

* Delete unused macros

* Fix custom alloc in mono

* Error on invalid sequences when caller requested

* Remove count from convert APIs
  • Loading branch information
am11 committed Jun 22, 2023
1 parent 28151b5 commit 2f2aaae
Show file tree
Hide file tree
Showing 14 changed files with 2,378 additions and 3,633 deletions.
179 changes: 0 additions & 179 deletions src/coreclr/inc/utilcode.h
Original file line number Diff line number Diff line change
Expand Up @@ -185,15 +185,6 @@ typedef LPSTR LPUTF8;
// given and ANSI String, copy it into a wide buffer.
// be careful about scoping when using this macro!
//
// how to use the below two macros:
//
// ...
// LPSTR pszA;
// pszA = MyGetAnsiStringRoutine();
// MAKE_WIDEPTR_FROMANSI(pwsz, pszA);
// MyUseWideStringRoutine(pwsz);
// ...
//
// similarily for MAKE_ANSIPTR_FROMWIDE. note that the first param does not
// have to be declared, and no clean up must be done.
//
Expand All @@ -211,25 +202,6 @@ typedef LPSTR LPUTF8;
#define MAKE_TRANSLATIONFAILED ThrowWin32(ERROR_NO_UNICODE_TRANSLATION)
#endif

// This version throws on conversion errors (ie, no best fit character
// mapping to characters that look similar, and no use of the default char
// ('?') when printing out unrepresentable characters. Use this method for
// most development in the EE, especially anything like metadata or class
// names. See the BESTFIT version if you're printing out info to the console.
#define MAKE_MULTIBYTE_FROMWIDE(ptrname, widestr, codepage) \
int __l##ptrname = (int)u16_strlen(widestr); \
if (__l##ptrname > MAKE_MAX_LENGTH) \
MAKE_TOOLONGACTION; \
__l##ptrname = (int)((__l##ptrname + 1) * 2 * sizeof(char)); \
CQuickBytes __CQuickBytes##ptrname; \
__CQuickBytes##ptrname.AllocThrows(__l##ptrname); \
BOOL __b##ptrname; \
DWORD __cBytes##ptrname = WszWideCharToMultiByte(codepage, WC_NO_BEST_FIT_CHARS, widestr, -1, (LPSTR)__CQuickBytes##ptrname.Ptr(), __l##ptrname, NULL, &__b##ptrname); \
if (__b##ptrname || (__cBytes##ptrname == 0 && (widestr[0] != W('\0')))) { \
MAKE_TRANSLATIONFAILED; \
} \
LPSTR ptrname = (LPSTR)__CQuickBytes##ptrname.Ptr()

// This version does best fit character mapping and also allows the use
// of the default char ('?') for any Unicode character that isn't
// representable. This is reasonable for writing to the console, but
Expand All @@ -247,40 +219,6 @@ typedef LPSTR LPUTF8;
} \
LPSTR ptrname = (LPSTR)__CQuickBytes##ptrname.Ptr()

// Use for anything critical other than output to console, where weird
// character mappings are unacceptable.
#define MAKE_ANSIPTR_FROMWIDE(ptrname, widestr) MAKE_MULTIBYTE_FROMWIDE(ptrname, widestr, CP_ACP)

// Use for output to the console.
#define MAKE_ANSIPTR_FROMWIDE_BESTFIT(ptrname, widestr) MAKE_MULTIBYTE_FROMWIDE_BESTFIT(ptrname, widestr, CP_ACP)

#define MAKE_WIDEPTR_FROMANSI(ptrname, ansistr) \
CQuickBytes __qb##ptrname; \
int __l##ptrname; \
__l##ptrname = WszMultiByteToWideChar(CP_ACP, 0, ansistr, -1, 0, 0); \
if (__l##ptrname > MAKE_MAX_LENGTH) \
MAKE_TOOLONGACTION; \
LPWSTR ptrname = (LPWSTR) __qb##ptrname.AllocThrows((__l##ptrname+1)*sizeof(WCHAR)); \
if (WszMultiByteToWideChar(CP_ACP, MB_ERR_INVALID_CHARS, ansistr, -1, ptrname, __l##ptrname) == 0) { \
MAKE_TRANSLATIONFAILED; \
}

#define MAKE_WIDEPTR_FROMANSI_NOTHROW(ptrname, ansistr) \
CQuickBytes __qb##ptrname; \
LPWSTR ptrname = 0; \
int __l##ptrname; \
__l##ptrname = WszMultiByteToWideChar(CP_ACP, 0, ansistr, -1, 0, 0); \
if (__l##ptrname <= MAKE_MAX_LENGTH) { \
ptrname = (LPWSTR) __qb##ptrname.AllocNoThrow((__l##ptrname+1)*sizeof(WCHAR)); \
if (ptrname) { \
if (WszMultiByteToWideChar(CP_ACP, MB_ERR_INVALID_CHARS, ansistr, -1, ptrname, __l##ptrname) != 0) { \
ptrname[__l##ptrname] = 0; \
} else { \
ptrname = 0; \
} \
} \
}

#define MAKE_UTF8PTR_FROMWIDE(ptrname, widestr) CQuickBytes _##ptrname; _##ptrname.ConvertUnicode_Utf8(widestr); LPSTR ptrname = (LPSTR) _##ptrname.Ptr();

#define MAKE_UTF8PTR_FROMWIDE_NOTHROW(ptrname, widestr) \
Expand Down Expand Up @@ -312,22 +250,8 @@ typedef LPSTR LPUTF8;
} \
} \

#define MAKE_WIDEPTR_FROMUTF8N(ptrname, utf8str, n8chrs) \
CQuickBytes __qb##ptrname; \
int __l##ptrname; \
__l##ptrname = WszMultiByteToWideChar(CP_UTF8, 0, utf8str, n8chrs, 0, 0); \
if (__l##ptrname > MAKE_MAX_LENGTH) \
MAKE_TOOLONGACTION; \
LPWSTR ptrname = (LPWSTR) __qb##ptrname .AllocThrows((__l##ptrname+1)*sizeof(WCHAR)); \
if (0==WszMultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, utf8str, n8chrs, ptrname, __l##ptrname)) { \
MAKE_TRANSLATIONFAILED; \
} \
ptrname[__l##ptrname] = 0;


#define MAKE_WIDEPTR_FROMUTF8(ptrname, utf8str) CQuickBytes _##ptrname; _##ptrname.ConvertUtf8_Unicode(utf8str); LPCWSTR ptrname = (LPCWSTR) _##ptrname.Ptr();


#define MAKE_WIDEPTR_FROMUTF8N_NOTHROW(ptrname, utf8str, n8chrs) \
CQuickBytes __qb##ptrname; \
int __l##ptrname; \
Expand All @@ -346,42 +270,10 @@ typedef LPSTR LPUTF8;

#define MAKE_WIDEPTR_FROMUTF8_NOTHROW(ptrname, utf8str) MAKE_WIDEPTR_FROMUTF8N_NOTHROW(ptrname, utf8str, -1)

// This method takes the number of characters
#define MAKE_MULTIBYTE_FROMWIDEN(ptrname, widestr, _nCharacters, _pCnt, codepage) \
CQuickBytes __qb##ptrname; \
int __l##ptrname; \
__l##ptrname = WszWideCharToMultiByte(codepage, WC_NO_BEST_FIT_CHARS, widestr, _nCharacters, NULL, 0, NULL, NULL); \
if (__l##ptrname > MAKE_MAX_LENGTH) \
MAKE_TOOLONGACTION; \
ptrname = (LPUTF8) __qb##ptrname .AllocThrows(__l##ptrname+1); \
BOOL __b##ptrname; \
DWORD _pCnt = WszWideCharToMultiByte(codepage, WC_NO_BEST_FIT_CHARS, widestr, _nCharacters, ptrname, __l##ptrname, NULL, &__b##ptrname); \
if (__b##ptrname || (_pCnt == 0 && _nCharacters > 0)) { \
MAKE_TRANSLATIONFAILED; \
} \
ptrname[__l##ptrname] = 0;

#define MAKE_MULTIBYTE_FROMWIDEN_BESTFIT(ptrname, widestr, _nCharacters, _pCnt, codepage) \
CQuickBytes __qb##ptrname; \
int __l##ptrname; \
__l##ptrname = WszWideCharToMultiByte(codepage, 0, widestr, _nCharacters, NULL, 0, NULL, NULL); \
if (__l##ptrname > MAKE_MAX_LENGTH) \
MAKE_TOOLONGACTION; \
ptrname = (LPUTF8) __qb##ptrname .AllocThrows(__l##ptrname+1); \
DWORD _pCnt = WszWideCharToMultiByte(codepage, 0, widestr, _nCharacters, ptrname, __l##ptrname, NULL, NULL); \
if (_pCnt == 0 && _nCharacters > 0) { \
MAKE_TRANSLATIONFAILED; \
} \
ptrname[__l##ptrname] = 0;

#define MAKE_ANSIPTR_FROMWIDEN(ptrname, widestr, _nCharacters, _pCnt) \
MAKE_MULTIBYTE_FROMWIDEN(ptrname, widestr, _nCharacters, _pCnt, CP_ACP)

const SIZE_T MaxSigned32BitDecString = ARRAY_SIZE("-2147483648") - 1;
const SIZE_T MaxUnsigned32BitDecString = ARRAY_SIZE("4294967295") - 1;
const SIZE_T MaxIntegerDecHexString = ARRAY_SIZE("-9223372036854775808") - 1;

const SIZE_T Max16BitHexString = ARRAY_SIZE("1234") - 1;
const SIZE_T Max32BitHexString = ARRAY_SIZE("12345678") - 1;
const SIZE_T Max64BitHexString = ARRAY_SIZE("1234567812345678") - 1;

Expand Down Expand Up @@ -410,77 +302,6 @@ inline WCHAR* FormatInteger(WCHAR* str, size_t strCount, const char* fmt, I v)
return str;
}

inline
LPWSTR DuplicateString(
LPCWSTR wszString,
size_t cchString)
{
STATIC_CONTRACT_NOTHROW;

LPWSTR wszDup = NULL;
if (wszString != NULL)
{
wszDup = new (nothrow) WCHAR[cchString + 1];
if (wszDup != NULL)
{
wcscpy_s(wszDup, cchString + 1, wszString);
}
}
return wszDup;
}

inline
LPWSTR DuplicateString(
LPCWSTR wszString)
{
STATIC_CONTRACT_NOTHROW;

if (wszString != NULL)
{
return DuplicateString(wszString, u16_strlen(wszString));
}
else
{
return NULL;
}
}

void DECLSPEC_NORETURN ThrowOutOfMemory();

inline
LPWSTR DuplicateStringThrowing(
LPCWSTR wszString,
size_t cchString)
{
STATIC_CONTRACT_THROWS;

if (wszString == NULL)
return NULL;

LPWSTR wszDup = DuplicateString(wszString, cchString);
if (wszDup == NULL)
ThrowOutOfMemory();

return wszDup;
}

inline
LPWSTR DuplicateStringThrowing(
LPCWSTR wszString)
{
STATIC_CONTRACT_THROWS;

if (wszString == NULL)
return NULL;

LPWSTR wszDup = DuplicateString(wszString);
if (wszDup == NULL)
ThrowOutOfMemory();

return wszDup;
}


//*****************************************************************************
// Placement new is used to new and object at an exact location. The pointer
// is simply returned to the caller without actually using the heap. The
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/pal/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ set(SOURCES
loader/module.cpp
locale/unicode.cpp
locale/unicodedata.cpp
locale/utf8.cpp
${CLR_SRC_NATIVE_DIR}/minipal/utf8.c
map/common.cpp
map/map.cpp
map/virtual.cpp
Expand Down
52 changes: 0 additions & 52 deletions src/coreclr/pal/src/include/pal/utf8.h

This file was deleted.

31 changes: 20 additions & 11 deletions src/coreclr/pal/src/locale/unicode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ Revision History:
#include "pal/palinternal.h"
#include "pal/dbgmsg.h"
#include "pal/file.h"
#include "pal/utf8.h"
#include <minipal/utf8.h>
#include "pal/cruntime.h"
#include "pal/stackstring.hpp"
#include "pal/unicodedata.h"
Expand Down Expand Up @@ -253,16 +253,20 @@ MultiByteToWideChar(
goto EXIT;
}

// Use UTF8ToUnicode on all systems, since it replaces
// invalid characters and Core Foundation doesn't do that.
if (CodePage == CP_UTF8 || CodePage == CP_ACP)
{
if (cbMultiByte <= -1)
if (cbMultiByte < 0)
cbMultiByte = strlen(lpMultiByteStr) + 1;

if (!lpWideCharStr || cchWideChar == 0)
retval = minipal_get_length_utf8_to_utf16(lpMultiByteStr, cbMultiByte, dwFlags);

if (lpWideCharStr)
{
cbMultiByte = strlen(lpMultiByteStr) + 1;
if (cchWideChar == 0) cchWideChar = retval;
retval = minipal_convert_utf8_to_utf16(lpMultiByteStr, cbMultiByte, (CHAR16_T*)lpWideCharStr, cchWideChar, dwFlags);
}

retval = UTF8ToUnicode(lpMultiByteStr, cbMultiByte, lpWideCharStr, cchWideChar, dwFlags);
goto EXIT;
}

Expand Down Expand Up @@ -338,15 +342,20 @@ WideCharToMultiByte(
defaultChar = *lpDefaultChar;
}

// Use UnicodeToUTF8 on all systems because we use
// UTF8ToUnicode in MultiByteToWideChar() on all systems.
if (CodePage == CP_UTF8 || CodePage == CP_ACP)
{
if (cchWideChar == -1)
{
if (cchWideChar < 0)
cchWideChar = PAL_wcslen(lpWideCharStr) + 1;

if (!lpMultiByteStr || cbMultiByte == 0)
retval = minipal_get_length_utf16_to_utf8((CHAR16_T*)lpWideCharStr, cchWideChar, dwFlags);

if (lpMultiByteStr)
{
if (cbMultiByte == 0) cbMultiByte = retval;
retval = minipal_convert_utf16_to_utf8((CHAR16_T*)lpWideCharStr, cchWideChar, lpMultiByteStr, cbMultiByte, dwFlags);
}
retval = UnicodeToUTF8(lpWideCharStr, cchWideChar, lpMultiByteStr, cbMultiByte);

goto EXIT;
}

Expand Down
Loading

0 comments on commit 2f2aaae

Please sign in to comment.