From f27d33729518f5aa478aa818b7b4f54a4d50bef1 Mon Sep 17 00:00:00 2001 From: Levi Broderick Date: Tue, 16 Feb 2021 21:12:02 +0000 Subject: [PATCH] Tighten bounds checks around TextEncoder logic - Replaces unsafe code with safe code where possible - Fixes some surrogate pairs being misinterpreted - Fixes https://github.com/dotnet/runtime/issues/45994 - Ref: MSRC 62749 (CVE-2021-26701) --- NuGet.config | 2 + eng/restore/harvestPackages.targets | 1 + .../Directory.Build.props | 3 + .../ref/System.Text.Encodings.Web.csproj | 1 + .../src/System.Text.Encodings.Web.csproj | 6 + .../src/System/IO/TextWriterExtensions.cs | 43 ++ .../System/Text/Encodings/Web/TextEncoder.cs | 481 ++++++++---------- .../src/System/Text/Unicode/UnicodeHelpers.cs | 167 ------ .../tests/AllowedCharsBitmapTests.cs | 2 +- .../tests/ConfigurableScalarTextEncoder.cs | 54 +- .../tests/HtmlEncoderTests.cs | 2 +- .../JavaScriptStringEncoderTests.Relaxed.cs | 2 +- .../tests/JavaScriptStringEncoderTests.cs | 2 +- .../tests/ScalarTestEncoder.cs | 20 +- .../System.Text.Encodings.Web.Tests.csproj | 3 + .../tests/TextEncoderBatteryTests.cs | 241 +++++++++ .../tests/TextEncoderTests.cs | 84 ++- .../tests/UnicodeEncoderBase.cs | 17 +- .../tests/UnicodeEncoderBaseTests.cs | 2 +- .../tests/UnicodeHelpersTests.cs | 64 +-- .../tests/UnicodeTestHelpers.cs | 22 + .../tests/UrlEncoderTests.cs | 2 +- src/libraries/libraries-packages.proj | 1 + src/libraries/pkg/baseline/packageIndex.json | 17 +- 24 files changed, 710 insertions(+), 529 deletions(-) create mode 100644 src/libraries/System.Text.Encodings.Web/src/System/IO/TextWriterExtensions.cs create mode 100644 src/libraries/System.Text.Encodings.Web/tests/TextEncoderBatteryTests.cs create mode 100644 src/libraries/System.Text.Encodings.Web/tests/UnicodeTestHelpers.cs diff --git a/NuGet.config b/NuGet.config index 272a9bec4fd23..34300a9188a76 100644 --- a/NuGet.config +++ b/NuGet.config @@ -16,6 +16,8 @@ + + diff --git a/eng/restore/harvestPackages.targets b/eng/restore/harvestPackages.targets index 2e9a8155be678..0d238bddc87ba 100644 --- a/eng/restore/harvestPackages.targets +++ b/eng/restore/harvestPackages.targets @@ -23,6 +23,7 @@ + <_OverridenPackageDownloads Include="@(_PackageDownload)" Condition="'@(PackageDownload)' == '@(_PackageDownload)' and %(Identity) != ''" /> <_PackageDownload Remove="@(_OverridenPackageDownloads)" /> <_PackageDownload Include="@(PackageDownload)" /> diff --git a/src/libraries/System.Text.Encodings.Web/Directory.Build.props b/src/libraries/System.Text.Encodings.Web/Directory.Build.props index bdcfca3b543cb..10888235eab33 100644 --- a/src/libraries/System.Text.Encodings.Web/Directory.Build.props +++ b/src/libraries/System.Text.Encodings.Web/Directory.Build.props @@ -1,6 +1,9 @@  + 5.0.0.1 + 5.0.1 + 4.5.1 Open \ No newline at end of file diff --git a/src/libraries/System.Text.Encodings.Web/ref/System.Text.Encodings.Web.csproj b/src/libraries/System.Text.Encodings.Web/ref/System.Text.Encodings.Web.csproj index eff1dfbb548ca..24b4be39e3164 100644 --- a/src/libraries/System.Text.Encodings.Web/ref/System.Text.Encodings.Web.csproj +++ b/src/libraries/System.Text.Encodings.Web/ref/System.Text.Encodings.Web.csproj @@ -17,6 +17,7 @@ + \ No newline at end of file diff --git a/src/libraries/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj b/src/libraries/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj index 906f6d9dc252a..dd490865d205c 100644 --- a/src/libraries/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj +++ b/src/libraries/System.Text.Encodings.Web/src/System.Text.Encodings.Web.csproj @@ -10,6 +10,7 @@ $(NoWarn);CS3019 + @@ -40,6 +41,7 @@ + @@ -51,8 +53,12 @@ + + + + diff --git a/src/libraries/System.Text.Encodings.Web/src/System/IO/TextWriterExtensions.cs b/src/libraries/System.Text.Encodings.Web/src/System/IO/TextWriterExtensions.cs new file mode 100644 index 0000000000000..c2ace13699a48 --- /dev/null +++ b/src/libraries/System.Text.Encodings.Web/src/System/IO/TextWriterExtensions.cs @@ -0,0 +1,43 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Diagnostics; + +#if !(NETCOREAPP || NETSTANDARD2_1) +using System.Buffers; +#endif + +namespace System.IO +{ + internal static class TextWriterExtensions + { + /// + /// Writes a partial string (given offset and count) to the underlying TextWriter. + /// + public static void WritePartialString(this TextWriter writer, string value, int offset, int count) + { + Debug.Assert(writer != null); + Debug.Assert(value != null); + + if (offset == 0 && count == value.Length) + { + // on all platforms, prefer TextWriter.Write(string) if no slicing is required + writer.Write(value); + } + else + { + // if slicing is required, call TextWriter.Write(ROS) if available; + // otherwise rent an array and implement the Write routine ourselves + ReadOnlySpan sliced = value.AsSpan(offset, count); +#if NETCOREAPP || NETSTANDARD2_1 + writer.Write(sliced); +#else + char[] rented = ArrayPool.Shared.Rent(sliced.Length); + sliced.CopyTo(rented); + writer.Write(rented, 0, sliced.Length); + ArrayPool.Shared.Return(rented); +#endif + } + } + } +} diff --git a/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/TextEncoder.cs b/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/TextEncoder.cs index d8c228e79202f..91902c84117a9 100644 --- a/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/TextEncoder.cs +++ b/src/libraries/System.Text.Encodings.Web/src/System/Text/Encodings/Web/TextEncoder.cs @@ -26,6 +26,8 @@ namespace System.Text.Encodings.Web /// public abstract class TextEncoder { + private const int EncodeStartingOutputBufferSize = 1024; // bytes or chars, depending + // Fast cache for Ascii private readonly byte[][] _asciiEscape = new byte[0x80][]; @@ -107,154 +109,47 @@ public virtual string Encode(string value) throw new ArgumentNullException(nameof(value)); } - unsafe - { - fixed (char* valuePointer = value) - { - int firstCharacterToEncode = FindFirstCharacterToEncode(valuePointer, value.Length); - - if (firstCharacterToEncode == -1) - { - return value; - } - - int bufferSize = MaxOutputCharactersPerInputCharacter * value.Length; - - string result; - if (bufferSize < 1024) - { - char* wholebuffer = stackalloc char[bufferSize]; - OperationStatus status = EncodeIntoBuffer(wholebuffer, bufferSize, valuePointer, value.Length, out int _, out int totalWritten, firstCharacterToEncode); - if (status != OperationStatus.Done) - { - ThrowArgumentException_MaxOutputCharsPerInputChar(); - } - - result = new string(wholebuffer, 0, totalWritten); - } - else - { - char[] wholebuffer = new char[bufferSize]; - fixed (char* buffer = &wholebuffer[0]) - { - OperationStatus status = EncodeIntoBuffer(buffer, bufferSize, valuePointer, value.Length, out int _, out int totalWritten, firstCharacterToEncode); - if (status != OperationStatus.Done) - { - ThrowArgumentException_MaxOutputCharsPerInputChar(); - } - - result = new string(wholebuffer, 0, totalWritten); - } - } - - return result; - } - } - } - - private unsafe OperationStatus EncodeIntoBuffer( - char* buffer, - int bufferLength, - char* value, - int valueLength, - out int charsConsumed, - out int charsWritten, - int firstCharacterToEncode, - bool isFinalBlock = true) - { - Debug.Assert(value != null); - Debug.Assert(firstCharacterToEncode >= 0); - - char* originalBuffer = buffer; - charsWritten = 0; - - if (firstCharacterToEncode > 0) + int indexOfFirstCharToEncode = FindFirstCharacterToEncode(value.AsSpan()); + if (indexOfFirstCharToEncode < 0) { - Debug.Assert(firstCharacterToEncode <= valueLength); - Buffer.MemoryCopy(source: value, - destination: buffer, - destinationSizeInBytes: sizeof(char) * bufferLength, - sourceBytesToCopy: sizeof(char) * firstCharacterToEncode); - - charsWritten += firstCharacterToEncode; - bufferLength -= firstCharacterToEncode; - buffer += firstCharacterToEncode; + return value; // shortcut: there's no work to perform } - int valueIndex = firstCharacterToEncode; - - char firstChar = value[valueIndex]; - char secondChar = firstChar; - bool wasSurrogatePair = false; - - // this loop processes character pairs (in case they are surrogates). - // there is an if block below to process single last character. - int secondCharIndex; - for (secondCharIndex = valueIndex + 1; secondCharIndex < valueLength; secondCharIndex++) - { - if (!wasSurrogatePair) - { - firstChar = secondChar; - } - else - { - firstChar = value[secondCharIndex - 1]; - } - - secondChar = value[secondCharIndex]; - - if (!WillEncode(firstChar)) - { - wasSurrogatePair = false; - *buffer = firstChar; - buffer++; - bufferLength--; - charsWritten++; - } - else - { - int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(firstChar, secondChar, out wasSurrogatePair, out bool _); - if (!TryEncodeUnicodeScalar(nextScalar, buffer, bufferLength, out int charsWrittenThisTime)) - { - charsConsumed = (int)(originalBuffer - buffer); - return OperationStatus.DestinationTooSmall; - } + ReadOnlySpan remainingInput = value.AsSpan(indexOfFirstCharToEncode); + ValueStringBuilder stringBuilder = new ValueStringBuilder(stackalloc char[EncodeStartingOutputBufferSize]); - if (wasSurrogatePair) - { - secondCharIndex++; - } +#if !NETCOREAPP + // Can't call string.Concat later in the method, so memcpy now. + stringBuilder.Append(value.AsSpan(0, indexOfFirstCharToEncode)); +#endif - buffer += charsWrittenThisTime; - bufferLength -= charsWrittenThisTime; - charsWritten += charsWrittenThisTime; - } - } + // On each iteration of the main loop, we'll make sure we have at least this many chars left in the + // destination buffer. This should prevent us from making very chatty calls where we only make progress + // one char at a time. + int minBufferBumpEachIteration = Math.Max(MaxOutputCharactersPerInputCharacter, EncodeStartingOutputBufferSize); - if (secondCharIndex == valueLength) + do { - firstChar = value[valueLength - 1]; - int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(firstChar, null, out wasSurrogatePair, out bool needMoreData); - if (!isFinalBlock && needMoreData) - { - Debug.Assert(wasSurrogatePair == false); - charsConsumed = (int)(buffer - originalBuffer); - return OperationStatus.NeedMoreData; - } - - if (!TryEncodeUnicodeScalar(nextScalar, buffer, bufferLength, out int charsWrittenThisTime)) + // AppendSpan mutates the VSB length to include the newly-added span. This potentially overallocates. + Span destBuffer = stringBuilder.AppendSpan(Math.Max(remainingInput.Length, minBufferBumpEachIteration)); + Encode(remainingInput, destBuffer, out int charsConsumedJustNow, out int charsWrittenJustNow); + if (charsWrittenJustNow == 0 || (uint)charsWrittenJustNow > (uint)destBuffer.Length) { - charsConsumed = (int)(buffer - originalBuffer); - return OperationStatus.DestinationTooSmall; + ThrowArgumentException_MaxOutputCharsPerInputChar(); // couldn't make forward progress or returned bogus data } + remainingInput = remainingInput.Slice(charsConsumedJustNow); + // It's likely we didn't populate the entire span. If this is the case, adjust the VSB length + // to reflect that there's unused buffer at the end of the VSB instance. + stringBuilder.Length -= destBuffer.Length - charsWrittenJustNow; + } while (!remainingInput.IsEmpty); - buffer += charsWrittenThisTime; - bufferLength -= charsWrittenThisTime; - charsWritten += charsWrittenThisTime; - } - - charsConsumed = valueLength; - return OperationStatus.Done; +#if NETCOREAPP + string retVal = string.Concat(value.AsSpan(0, indexOfFirstCharToEncode), stringBuilder.AsSpan()); + stringBuilder.Dispose(); + return retVal; +#else + return stringBuilder.ToString(); +#endif } /// @@ -286,37 +181,18 @@ public virtual void Encode(TextWriter output, string value, int startIndex, int } ValidateRanges(startIndex, characterCount, actualInputLength: value.Length); - unsafe + int indexOfFirstCharToEncode = FindFirstCharacterToEncode(value.AsSpan(startIndex, characterCount)); + if (indexOfFirstCharToEncode < 0) { - fixed (char* valuePointer = value) - { - char* substring = valuePointer + startIndex; - int firstIndexToEncode = FindFirstCharacterToEncode(substring, characterCount); - - if (firstIndexToEncode == -1) // nothing to encode; - { - if (startIndex == 0 && characterCount == value.Length) // write whole string - { - output.Write(value); - return; - } - for (int i = 0; i < characterCount; i++) // write substring - { - output.Write(*substring); - substring++; - } - return; - } + indexOfFirstCharToEncode = characterCount; + } - // write prefix, then encode - for (int i = 0; i < firstIndexToEncode; i++) - { - output.Write(*substring); - substring++; - } + // memcpy all characters that don't require encoding, then encode any remaining chars - EncodeCore(output, substring, characterCount - firstIndexToEncode); - } + output.WritePartialString(value, startIndex, indexOfFirstCharToEncode); + if (indexOfFirstCharToEncode != characterCount) + { + Encode(output, value.AsSpan(startIndex + indexOfFirstCharToEncode, characterCount - indexOfFirstCharToEncode)); } } @@ -339,37 +215,16 @@ public virtual void Encode(TextWriter output, char[] value, int startIndex, int } ValidateRanges(startIndex, characterCount, actualInputLength: value.Length); - unsafe + int indexOfFirstCharToEncode = FindFirstCharacterToEncode(value.AsSpan(startIndex, characterCount)); + if (indexOfFirstCharToEncode < 0) { - fixed (char* valuePointer = value) - { - char* substring = valuePointer + startIndex; - int firstIndexToEncode = FindFirstCharacterToEncode(substring, characterCount); - - if (firstIndexToEncode == -1) // nothing to encode; - { - if (startIndex == 0 && characterCount == value.Length) // write whole string - { - output.Write(value); - return; - } - for (int i = 0; i < characterCount; i++) // write substring - { - output.Write(*substring); - substring++; - } - return; - } - - // write prefix, then encode - for (int i = 0; i < firstIndexToEncode; i++) - { - output.Write(*substring); - substring++; - } + indexOfFirstCharToEncode = characterCount; + } + output.Write(value, startIndex, indexOfFirstCharToEncode); - EncodeCore(output, substring, characterCount - firstIndexToEncode); - } + if (indexOfFirstCharToEncode != characterCount) + { + Encode(output, value.AsSpan(startIndex + indexOfFirstCharToEncode, characterCount - indexOfFirstCharToEncode)); } } @@ -584,99 +439,185 @@ public virtual OperationStatus Encode( out int charsWritten, bool isFinalBlock = true) { - unsafe + if (source.IsEmpty) { - fixed (char* sourcePtr = source) - { - int firstCharacterToEncode; - if (source.IsEmpty || (firstCharacterToEncode = FindFirstCharacterToEncode(sourcePtr, source.Length)) == -1) - { - if (source.TryCopyTo(destination)) - { - charsConsumed = source.Length; - charsWritten = source.Length; - return OperationStatus.Done; - } + // There's nothing to do. + charsConsumed = 0; + charsWritten = 0; + return OperationStatus.Done; + } - charsConsumed = 0; - charsWritten = 0; - return OperationStatus.DestinationTooSmall; - } - else if (destination.IsEmpty) - { - // Guards against passing a null destinationPtr to EncodeIntoBuffer (pinning an empty Span will return a null pointer). - charsConsumed = 0; - charsWritten = 0; - return OperationStatus.DestinationTooSmall; - } + // The Encode method is intended to be called in a loop, potentially where the source buffer + // is much larger than the destination buffer. We don't want to walk the entire source buffer + // on each invocation of this method, so we'll slice the source buffer to be no larger than + // the destination buffer to avoid performing unnecessary work. The potential exists for us to + // split the source in the middle of a UTF-16 surrogate pair. If this happens, + // FindFirstCharacterToEncode will report the split surrogate as "needs encoding", we'll fall + // back down the slow path, and the slow path will handle the surrogate appropriately. - fixed (char* destinationPtr = destination) - { - return EncodeIntoBuffer(destinationPtr, destination.Length, sourcePtr, source.Length, out charsConsumed, out charsWritten, firstCharacterToEncode, isFinalBlock); - } - } + ReadOnlySpan sourceSearchSpace = source; + if (destination.Length < source.Length) + { + sourceSearchSpace = source.Slice(0, destination.Length); } - } - private unsafe void EncodeCore(TextWriter output, char* value, int valueLength) - { - Debug.Assert(value != null && output != null); - Debug.Assert(valueLength >= 0); + int idxOfFirstCharToEncode = FindFirstCharacterToEncode(sourceSearchSpace); + if (idxOfFirstCharToEncode < 0) + { + idxOfFirstCharToEncode = sourceSearchSpace.Length; + } + + source.Slice(0, idxOfFirstCharToEncode).CopyTo(destination); // memcpy data that doesn't need to be encoded + if (idxOfFirstCharToEncode == source.Length) + { + charsConsumed = source.Length; + charsWritten = source.Length; + return OperationStatus.Done; // memcopied all chars, nothing more to do + } - int bufferLength = MaxOutputCharactersPerInputCharacter; - char* buffer = stackalloc char[bufferLength]; + // If we got to this point, we couldn't memcpy the entire source buffer into the destination. + // Either the destination was too short or we found data that needs to be encoded. - char firstChar = *value; - char secondChar = firstChar; - bool wasSurrogatePair = false; - int charsWritten; + OperationStatus opStatus = EncodeCore(source.Slice(idxOfFirstCharToEncode), destination.Slice(idxOfFirstCharToEncode), out int remainingCharsConsumed, out int remainingCharsWritten, isFinalBlock); + charsConsumed = idxOfFirstCharToEncode + remainingCharsConsumed; + charsWritten = idxOfFirstCharToEncode + remainingCharsWritten; + return opStatus; - // this loop processes character pairs (in case they are surrogates). - // there is an if block below to process single last character. - int secondCharIndex; - for (secondCharIndex = 1; secondCharIndex < valueLength; secondCharIndex++) + OperationStatus EncodeCore(ReadOnlySpan source, Span destination, out int charsConsumed, out int charsWritten, bool isFinalBlock) { - if (!wasSurrogatePair) - { - firstChar = secondChar; - } - else - { - firstChar = value[secondCharIndex - 1]; - } - secondChar = value[secondCharIndex]; + Debug.Assert(!source.IsEmpty, "Caller should've handled fully-consumed source in fast path."); - if (!WillEncode(firstChar)) + if (destination.IsEmpty) { - wasSurrogatePair = false; - output.Write(firstChar); + destination = Array.Empty(); // normalize empty destination buffers to non-nullptr reference; TryEncodeUnicodeScalar requires this } - else + + int destinationOffset = 0; + int sourceOffset = 0; + while ((uint)sourceOffset < (uint)source.Length) { - int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(firstChar, secondChar, out wasSurrogatePair, out bool _); - if (!TryEncodeUnicodeScalar(nextScalar, buffer, bufferLength, out charsWritten)) + int scalarValue = source[sourceOffset]; + + if (!UnicodeUtility.IsSurrogateCodePoint((uint)scalarValue)) + { + if (!WillEncode(scalarValue)) + { + // single input char -> single output char (no escaping needed) + if ((uint)destinationOffset >= (uint)destination.Length) + { + goto DestinationTooSmall; + } + + destination[destinationOffset++] = (char)scalarValue; + sourceOffset++; + continue; + } + } + else { - ThrowArgumentException_MaxOutputCharsPerInputChar(); + uint firstCodePoint = (uint)scalarValue; + scalarValue = '\uFFFD'; // replacement char, just in case we can't read a full surrogate pair + if (UnicodeUtility.IsHighSurrogateCodePoint(firstCodePoint)) + { + int nextSourceIdx = sourceOffset + 1; + if ((uint)nextSourceIdx >= (uint)source.Length) + { + if (!isFinalBlock) + { + goto NeedMoreData; + } + } + else + { + uint nextCodePoint = source[nextSourceIdx]; + if (UnicodeUtility.IsLowSurrogateCodePoint(nextCodePoint)) + { + scalarValue = (int)UnicodeUtility.GetScalarFromUtf16SurrogatePair(firstCodePoint, nextCodePoint); + if (!WillEncode(scalarValue)) + { + // 2 input chars -> 2 output chars (no escaping needed) + if ((uint)(destinationOffset + 1) >= (uint)destination.Length) + { + goto DestinationTooSmall; + } + + destination[destinationOffset] = (char)firstCodePoint; + destination[destinationOffset + 1] = (char)nextCodePoint; + destinationOffset += 2; + sourceOffset += 2; + continue; + } + } + } + } } - Write(output, buffer, charsWritten); - if (wasSurrogatePair) + // If we got to this point, we need to encode. + + int numCharsWrittenJustNow; + unsafe { - secondCharIndex++; + fixed (char* pDest = &MemoryMarshal.GetReference(destination)) + { + Debug.Assert(pDest != null); // should've been handled on method entry + Debug.Assert((uint)destinationOffset <= (uint)destination.Length); + + if (!TryEncodeUnicodeScalar(scalarValue, pDest + destinationOffset, destination.Length - destinationOffset, out numCharsWrittenJustNow)) + { + goto DestinationTooSmall; + } + } } + + Debug.Assert(numCharsWrittenJustNow <= destination.Length - destinationOffset, "TryEncodeUnicodeScalar wrote past end of buffer?"); + sourceOffset += UnicodeUtility.GetUtf16SequenceLength((uint)scalarValue); + destinationOffset += numCharsWrittenJustNow; } + + OperationStatus retVal = OperationStatus.Done; + + ReturnCommon: + Debug.Assert(sourceOffset <= source.Length); + Debug.Assert(destinationOffset <= destination.Length); + charsConsumed = sourceOffset; + charsWritten = destinationOffset; + return retVal; + + NeedMoreData: + retVal = OperationStatus.NeedMoreData; + goto ReturnCommon; + + DestinationTooSmall: + retVal = OperationStatus.DestinationTooSmall; + goto ReturnCommon; } + } + + private void Encode(TextWriter output, ReadOnlySpan value) + { + Debug.Assert(output != null); + Debug.Assert(!value.IsEmpty, "Caller should've special-cased 'no encoding needed'."); - if (!wasSurrogatePair || (secondCharIndex == valueLength)) + // On each iteration of the main loop, we'll make sure we have at least this many chars left in the + // destination buffer. This should prevent us from making very chatty calls where we only make progress + // one char at a time. + int minBufferBumpEachIteration = Math.Max(MaxOutputCharactersPerInputCharacter, EncodeStartingOutputBufferSize); + char[] rentedArray = ArrayPool.Shared.Rent(Math.Max(value.Length, minBufferBumpEachIteration)); + Span scratchBuffer = rentedArray; + + do { - firstChar = value[valueLength - 1]; - int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(firstChar, null, out wasSurrogatePair, out bool _); - if (!TryEncodeUnicodeScalar(nextScalar, buffer, bufferLength, out charsWritten)) + Encode(value, scratchBuffer, out int charsConsumedJustNow, out int charsWrittenJustNow); + if (charsWrittenJustNow == 0 || (uint)charsWrittenJustNow > (uint)scratchBuffer.Length) { - ThrowArgumentException_MaxOutputCharsPerInputChar(); + ThrowArgumentException_MaxOutputCharsPerInputChar(); // couldn't make forward progress or returned bogus data } - Write(output, buffer, charsWritten); - } + + output.Write(rentedArray, 0, charsWrittenJustNow); // write char[], not Span, for best compat & performance + value = value.Slice(charsConsumedJustNow); + } while (!value.IsEmpty); + + ArrayPool.Shared.Return(rentedArray); } private unsafe int FindFirstCharacterToEncode(ReadOnlySpan text) @@ -945,17 +886,6 @@ private static void ValidateRanges(int startIndex, int characterCount, int actua } } - private static unsafe void Write(TextWriter output, char* input, int inputLength) - { - Debug.Assert(output != null && input != null && inputLength >= 0); - - while (inputLength-- > 0) - { - output.Write(*input); - input++; - } - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] private byte[]? GetAsciiEncoding(byte value) { @@ -995,6 +925,7 @@ private unsafe void InitializeAsciiCache() } _bitMaskLookupAsciiNeedsEscaping = vector; + _isAsciiCacheInitialized = true; return; } #endif diff --git a/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.cs b/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.cs index add7a295b6094..544a941017629 100644 --- a/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.cs +++ b/src/libraries/System.Text.Encodings.Web/src/System/Text/Unicode/UnicodeHelpers.cs @@ -15,11 +15,6 @@ namespace System.Text.Unicode /// internal static unsafe partial class UnicodeHelpers { - /// - /// Used for invalid Unicode sequences or other unrepresentable values. - /// - private const char UNICODE_REPLACEMENT_CHAR = '\uFFFD'; - /// /// The last code point defined by the Unicode specification. /// @@ -239,134 +234,6 @@ internal static ReadOnlySpan GetDefinedCharacterBitmap() } } - /// - /// Given a UTF-16 character stream, reads the next scalar value from the stream. - /// Set 'endOfString' to true if 'pChar' points to the last character in the stream. - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static int GetScalarValueFromUtf16(char first, char? second, out bool wasSurrogatePair, out bool needsMoreData) - { - if (!char.IsSurrogate(first)) - { - wasSurrogatePair = false; - needsMoreData = false; - return first; - } - - return GetScalarValueFromUtf16Slow(first, second, out wasSurrogatePair, out needsMoreData); - } - - private static int GetScalarValueFromUtf16Slow(char first, char? second, out bool wasSurrogatePair, out bool needMoreData) - { -#if DEBUG - if (!char.IsSurrogate(first)) - { - Debug.Assert(false, "This case should've been handled by the fast path."); - wasSurrogatePair = false; - needMoreData = false; - return first; - } -#endif - if (char.IsHighSurrogate(first)) - { - if (second != null) - { - if (char.IsLowSurrogate(second.Value)) - { - // valid surrogate pair - extract codepoint - wasSurrogatePair = true; - needMoreData = false; - return GetScalarValueFromUtf16SurrogatePair(first, second.Value); - } - else - { - // unmatched surrogate - substitute - wasSurrogatePair = false; - needMoreData = false; - return UNICODE_REPLACEMENT_CHAR; - } - } - else - { - // unmatched surrogate - substitute - wasSurrogatePair = false; - needMoreData = true; // Last character was high surrogate; we need more data. - return UNICODE_REPLACEMENT_CHAR; - } - } - else - { - // unmatched surrogate - substitute - Debug.Assert(char.IsLowSurrogate(first)); - wasSurrogatePair = false; - needMoreData = false; - return UNICODE_REPLACEMENT_CHAR; - } - } - - /// - /// Given a UTF-16 character stream, reads the next scalar value from the stream. - /// Set 'endOfString' to true if 'pChar' points to the last character in the stream. - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static int GetScalarValueFromUtf16(char* pChar, bool endOfString) - { - // This method is marked as AggressiveInlining to handle the common case of a non-surrogate - // character. The surrogate case is handled in the slower fallback code path. - char thisChar = *pChar; - return (char.IsSurrogate(thisChar)) ? GetScalarValueFromUtf16Slow(pChar, endOfString) : thisChar; - } - - private static int GetScalarValueFromUtf16Slow(char* pChar, bool endOfString) - { - char firstChar = pChar[0]; - - if (!char.IsSurrogate(firstChar)) - { - Debug.Assert(false, "This case should've been handled by the fast path."); - return firstChar; - } - else if (char.IsHighSurrogate(firstChar)) - { - if (endOfString) - { - // unmatched surrogate - substitute - return UNICODE_REPLACEMENT_CHAR; - } - else - { - char secondChar = pChar[1]; - if (char.IsLowSurrogate(secondChar)) - { - // valid surrogate pair - extract codepoint - return GetScalarValueFromUtf16SurrogatePair(firstChar, secondChar); - } - else - { - // unmatched surrogate - substitute - return UNICODE_REPLACEMENT_CHAR; - } - } - } - else - { - // unmatched surrogate - substitute - Debug.Assert(char.IsLowSurrogate(firstChar)); - return UNICODE_REPLACEMENT_CHAR; - } - } - - private static int GetScalarValueFromUtf16SurrogatePair(char highSurrogate, char lowSurrogate) - { - Debug.Assert(char.IsHighSurrogate(highSurrogate)); - Debug.Assert(char.IsLowSurrogate(lowSurrogate)); - - // See https://www.unicode.org/versions/Unicode6.2.0/ch03.pdf, Table 3.5 for the - // details of this conversion. We don't use Char.ConvertToUtf32 because its exception - // handling shows up on the hot path, and our caller has already sanitized the inputs. - return (lowSurrogate & 0x3ff) | (((highSurrogate & 0x3ff) + (1 << 6)) << 10); - } - internal static void GetUtf16SurrogatePairFromAstralScalarValue(int scalar, out char highSurrogate, out char lowSurrogate) { Debug.Assert(0x10000 <= scalar && scalar <= UNICODE_LAST_CODEPOINT); @@ -426,21 +293,6 @@ internal static int GetUtf8RepresentationForScalarValue(uint scalar) } } - /// - /// Returns a value stating whether a character is defined per the checked-in version - /// of the Unicode specification. Certain classes of characters (control chars, - /// private use, surrogates, some whitespace) are considered "undefined" for - /// our purposes. - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static bool IsCharacterDefined(char c) - { - uint codePoint = (uint)c; - int index = (int)(codePoint >> 5); - int offset = (int)(codePoint & 0x1FU); - return ((GetDefinedCharacterBitmap()[index] >> offset) & 0x1U) != 0; - } - /// /// Determines whether the given scalar value is in the supplementary plane and thus /// requires 2 characters to be represented in UTF-16 (as a surrogate pair). @@ -450,24 +302,5 @@ internal static bool IsSupplementaryCodePoint(int scalar) { return ((scalar & ~((int)char.MaxValue)) != 0); } - - /// - /// Returns iff is a UTF-8 continuation byte; - /// i.e., has binary representation 10xxxxxx, where x is any bit. - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static bool IsUtf8ContinuationByte(in byte value) - { - // This API takes its input as a readonly ref so that the JIT can emit "cmp ModRM" statements - // directly rather than bounce a temporary through a register. That is, we want the JIT to be - // able to emit a single "cmp byte ptr [data], C0h" statement if we're querying a memory location - // to see if it's a continuation byte. Data that's already enregistered will go through the - // normal "cmp reg, C0h" code paths, perhaps with some extra unnecessary "movzx" instructions. - // - // The below check takes advantage of the two's complement representation of negative numbers. - // [ 0b1000_0000, 0b1011_1111 ] is [ -127 (sbyte.MinValue), -65 ] - - return ((sbyte)value < -64); - } } } diff --git a/src/libraries/System.Text.Encodings.Web/tests/AllowedCharsBitmapTests.cs b/src/libraries/System.Text.Encodings.Web/tests/AllowedCharsBitmapTests.cs index 8e8b426c87c61..2dbf5b2f4c415 100644 --- a/src/libraries/System.Text.Encodings.Web/tests/AllowedCharsBitmapTests.cs +++ b/src/libraries/System.Text.Encodings.Web/tests/AllowedCharsBitmapTests.cs @@ -119,7 +119,7 @@ public void ForbidUndefinedCharacters_RemovesUndefinedChars() } else { - Assert.Equal(UnicodeHelpers.IsCharacterDefined((char)i), bitmap.IsCharacterAllowed((char)i)); + Assert.Equal(UnicodeTestHelpers.IsCharacterDefined((char)i), bitmap.IsCharacterAllowed((char)i)); } } } diff --git a/src/libraries/System.Text.Encodings.Web/tests/ConfigurableScalarTextEncoder.cs b/src/libraries/System.Text.Encodings.Web/tests/ConfigurableScalarTextEncoder.cs index e592f25c1441e..0f948fcff2405 100644 --- a/src/libraries/System.Text.Encodings.Web/tests/ConfigurableScalarTextEncoder.cs +++ b/src/libraries/System.Text.Encodings.Web/tests/ConfigurableScalarTextEncoder.cs @@ -17,9 +17,59 @@ public ConfigurableScalarTextEncoder(Predicate isScalarAllowed) _isScalarAllowed = isScalarAllowed; } - public override int MaxOutputCharactersPerInputCharacter => throw new NotImplementedException(); + public override int MaxOutputCharactersPerInputCharacter => 8; // "[10FFFF]".Length - public override unsafe int FindFirstCharacterToEncode(char* text, int textLength) => throw new NotImplementedException(); + public override unsafe int FindFirstCharacterToEncode(char* text, int textLength) + => FindFirstCharacterToEncode(new ReadOnlySpan(text, textLength)); + + private int FindFirstCharacterToEncode(ReadOnlySpan span) + { + int originalLength = span.Length; + + while (!span.IsEmpty) + { + if (!TryGetNextScalarValue(span, out int scalarValue) || !_isScalarAllowed(scalarValue)) + { + return originalLength - span.Length; // couldn't extract scalar or failed predicate + } + + span = span.Slice(UnicodeUtility.GetUtf16SequenceLength((uint)scalarValue)); + } + + return -1; // entire span was consumed + } + + private static bool TryGetNextScalarValue(ReadOnlySpan span, out int scalarValue) + { + if (!span.IsEmpty) + { + // non-surrogate char? + char firstChar = span[0]; + if (!char.IsSurrogate(firstChar)) + { + scalarValue = firstChar; + return true; + } + + // well-formed surrogate pair? + if (char.IsHighSurrogate(firstChar)) + { + if (span.Length > 1) + { + char secondChar = span[1]; + if (char.IsLowSurrogate(secondChar)) + { + scalarValue = char.ConvertToUtf32(firstChar, secondChar); + return true; + } + } + } + } + + // if we got to this point, span was empty or ill-formed surrogate found + scalarValue = default; + return false; + } public override bool WillEncode(int unicodeScalar) => !_isScalarAllowed(unicodeScalar); diff --git a/src/libraries/System.Text.Encodings.Web/tests/HtmlEncoderTests.cs b/src/libraries/System.Text.Encodings.Web/tests/HtmlEncoderTests.cs index 074bb3977cd30..36eff3f080a7e 100644 --- a/src/libraries/System.Text.Encodings.Web/tests/HtmlEncoderTests.cs +++ b/src/libraries/System.Text.Encodings.Web/tests/HtmlEncoderTests.cs @@ -140,7 +140,7 @@ public void HtmlEncode_AllRangesAllowed_StillEncodesForbiddenChars_Extended() { mustEncode = true; // control char } - else if (!UnicodeHelpers.IsCharacterDefined((char)i)) + else if (!UnicodeTestHelpers.IsCharacterDefined((char)i)) { mustEncode = true; // undefined (or otherwise disallowed) char } diff --git a/src/libraries/System.Text.Encodings.Web/tests/JavaScriptStringEncoderTests.Relaxed.cs b/src/libraries/System.Text.Encodings.Web/tests/JavaScriptStringEncoderTests.Relaxed.cs index b4a9f7d682f09..84b53ae56bcb9 100644 --- a/src/libraries/System.Text.Encodings.Web/tests/JavaScriptStringEncoderTests.Relaxed.cs +++ b/src/libraries/System.Text.Encodings.Web/tests/JavaScriptStringEncoderTests.Relaxed.cs @@ -129,7 +129,7 @@ public void JavaScriptStringEncode_Relaxed_StillEncodesForbiddenChars_Extended() { mustEncode = true; // control char } - else if (!UnicodeHelpers.IsCharacterDefined((char)i)) + else if (!UnicodeTestHelpers.IsCharacterDefined((char)i)) { mustEncode = true; // undefined (or otherwise disallowed) char } diff --git a/src/libraries/System.Text.Encodings.Web/tests/JavaScriptStringEncoderTests.cs b/src/libraries/System.Text.Encodings.Web/tests/JavaScriptStringEncoderTests.cs index e918461f79b38..d88ab88b0c25b 100644 --- a/src/libraries/System.Text.Encodings.Web/tests/JavaScriptStringEncoderTests.cs +++ b/src/libraries/System.Text.Encodings.Web/tests/JavaScriptStringEncoderTests.cs @@ -694,7 +694,7 @@ public void JavaScriptStringEncode_AllRangesAllowed_StillEncodesForbiddenChars_E { mustEncode = true; // control char } - else if (!UnicodeHelpers.IsCharacterDefined((char)i)) + else if (!UnicodeTestHelpers.IsCharacterDefined((char)i)) { mustEncode = true; // undefined (or otherwise disallowed) char } diff --git a/src/libraries/System.Text.Encodings.Web/tests/ScalarTestEncoder.cs b/src/libraries/System.Text.Encodings.Web/tests/ScalarTestEncoder.cs index 3481e50b31ace..2502a3d292ed1 100644 --- a/src/libraries/System.Text.Encodings.Web/tests/ScalarTestEncoder.cs +++ b/src/libraries/System.Text.Encodings.Web/tests/ScalarTestEncoder.cs @@ -3,8 +3,6 @@ using System; using System.Globalization; -using System.IO; -using System.Runtime.CompilerServices; namespace System.Text.Encodings.Web.Tests { @@ -20,7 +18,7 @@ public sealed class ScalarTestEncoder : TextEncoder /// public override unsafe int FindFirstCharacterToEncode(char* text, int textLength) { - return text == null ? -1 : 0; + return (textLength == 0) ? -1 : 0; } /// @@ -44,12 +42,16 @@ public override int MaxOutputCharactersPerInputCharacter /// public override unsafe bool TryEncodeUnicodeScalar(int unicodeScalar, char* buffer, int bufferLength, out int numberOfCharactersWritten) { - fixed (char* chars = unicodeScalar.ToString("X8")) - for (int i = 0; i < Int32Length; i++) - buffer[i] = chars[i]; - - numberOfCharactersWritten = Int32Length; - return true; + if (unicodeScalar.ToString("X8", CultureInfo.InvariantCulture).AsSpan().TryCopyTo(new Span(buffer, bufferLength))) + { + numberOfCharactersWritten = Int32Length; + return true; + } + else + { + numberOfCharactersWritten = 0; + return false; + } } } } diff --git a/src/libraries/System.Text.Encodings.Web/tests/System.Text.Encodings.Web.Tests.csproj b/src/libraries/System.Text.Encodings.Web/tests/System.Text.Encodings.Web.Tests.csproj index 5ad0343d6c04e..ae495e75a0c09 100644 --- a/src/libraries/System.Text.Encodings.Web/tests/System.Text.Encodings.Web.Tests.csproj +++ b/src/libraries/System.Text.Encodings.Web/tests/System.Text.Encodings.Web.Tests.csproj @@ -14,6 +14,7 @@ + @@ -38,6 +39,7 @@ + @@ -54,6 +56,7 @@ Link="Common\System\HexConverter.cs" /> + diff --git a/src/libraries/System.Text.Encodings.Web/tests/TextEncoderBatteryTests.cs b/src/libraries/System.Text.Encodings.Web/tests/TextEncoderBatteryTests.cs new file mode 100644 index 0000000000000..5a8f3e6a2df4d --- /dev/null +++ b/src/libraries/System.Text.Encodings.Web/tests/TextEncoderBatteryTests.cs @@ -0,0 +1,241 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Globalization; +using System.IO; +using Xunit; + +namespace System.Text.Encodings.Web.Tests +{ + public class TextEncoderBatteryTests + { + private static TextEncoder GetBatteryTextEncoder() + { + // only even-valued scalars are allowed; odd-valued scalars are disallowed + return new ConfigurableScalarTextEncoder(scalarValue => scalarValue % 2 == 0); + } + + // 2 elements: [0] = input data (string), [1] = expected output data (string) + public static IEnumerable TestData() + { + static IEnumerable<(string input, string output)> RealTestData() + { + yield return ("", ""); + yield return ("xyz", "x[0079]z"); + yield return ("bdf", "bdf"); + yield return ("bdfbdfbdfbdfbdf", "bdfbdfbdfbdfbdf"); + yield return ("\U0001F600" /* grinning face */, "\U0001F600"); // not escaped since scalar value is even + yield return ("\U0001F601" /* grinning face with smiling eyes */, "[1F601]"); // escaped since scalar value is odd + yield return ("\U0001F3C0\U0001F3C1\U0001F3C2\U0001F3C3\U0001F3C4" /* various sports emoji */, + "\U0001F3C0[1F3C1]\U0001F3C2[1F3C3]\U0001F3C4"); + yield return ("bd\ud800fh", "bd[FFFD]fh"); // standalone high surrogate char + yield return ("bd\udffffh", "bd[FFFD]fh"); // standalone low surrogate char + yield return ("bd\ue000fh", "bd\ue000fh"); + yield return ("bd\ue001fh", "bd[E001]fh"); + yield return ("bd\udfd0\ud83c\udfd0\ud83cfh", "bd[FFFD]\U0001F3D0[FFFD]fh"); // U+1F3D0 VOLLEYBALL + yield return ("bd\udfd1\ud83c\udfd1\ud83cfh", "bd[FFFD][1F3D1][FFFD]fh"); // U+1F3D1 FIELD HOCKEY STICK AND BALL + yield return ("\ufffd\ud800\ufffd", "[FFFD][FFFD][FFFD]"); // U+FFFD is escaped since is odd + yield return ("xyz\ud800", "x[0079]z[FFFD]"); // ends with standalone high surrogate char + yield return ("xyz\udfff", "x[0079]z[FFFD]"); // ends with standalone low surrogate char + yield return ("xyz\U0001F3C0", "x[0079]z\U0001F3C0"); // ends with valid surrogate pair + + // really long input which does not need to be escaped + { + StringBuilder sb = new StringBuilder(); + + for (int i = 0x40; i < 0x4000; i += 2) + { + sb.Append((char)i); + } + + yield return (sb.ToString(), sb.ToString()); + } + + // really long input which needs to be escaped + { + StringBuilder sbInput = new StringBuilder(); + StringBuilder sbOutput = new StringBuilder(); + + for (int i = 0x40; i < 0x4000; i++) + { + sbInput.Append((char)i); + if (i % 2 == 0) + { + sbOutput.Append((char)i); + } + else + { + sbOutput.AppendFormat(CultureInfo.InvariantCulture, "[{0:X4}]", i); + } + } + + yield return (sbInput.ToString(), sbOutput.ToString()); + } + + // really long input which contains surrogate chars (no escape needed) + // also offset everything by 1 to account for the TextEncoder inner loop's + // "needs more data" handling logic. + { + StringBuilder sb = new StringBuilder(); + + for (int i = 0x10000; i < 0x14000; i += 2) + { + sb.Append(char.ConvertFromUtf32(i)); + } + + yield return (sb.ToString(), sb.ToString()); + yield return ("x" + sb.ToString(), "x" + sb.ToString()); + } + } + + foreach ((string input, string output) in RealTestData()) + { + yield return new[] { Escape(input), Escape(output) }; + } + } + + [Theory] + [MemberData(nameof(TestData))] + public void Encode_String(string input, string expectedOutput) + { + input = Unescape(input); + expectedOutput = Unescape(expectedOutput); + + // Arrange + + TextEncoder encoder = GetBatteryTextEncoder(); + + // Act + + string actualOutput = encoder.Encode(input); + + // Assert + + Assert.Equal(expectedOutput, actualOutput); + } + + [Theory] + [MemberData(nameof(TestData))] + public void Encode_TextWriter_String(string input, string expectedOutput) + { + input = Unescape(input); + expectedOutput = Unescape(expectedOutput); + + // Arrange + + TextEncoder encoder = GetBatteryTextEncoder(); + StringWriter writer = new StringWriter(); + + // Act + + encoder.Encode(writer, input); + + // Assert + + Assert.Equal(expectedOutput, writer.ToString()); + } + + [Theory] + [MemberData(nameof(TestData))] + public void Encode_TextWriter_String_WithOffset(string input, string expectedOutput) + { + input = Unescape(input); + expectedOutput = Unescape(expectedOutput); + + // Arrange + + TextEncoder encoder = GetBatteryTextEncoder(); + StringWriter writer; + + // Act & assert - 1 + + writer = new StringWriter(); + encoder.Encode(writer, input, 0, input.Length); + Assert.Equal(expectedOutput, writer.ToString()); + + // Act & assert - 2 + + writer = new StringWriter(); + encoder.Encode(writer, "xxx" + input + "yyy", 3, input.Length); + Assert.Equal(expectedOutput, writer.ToString()); + + // Act & assert - 3 + + writer = new StringWriter(); + encoder.Encode(writer, "\ud800" + input + "\udfff", 1, input.Length); + Assert.Equal(expectedOutput, writer.ToString()); + } + + [Theory] + [MemberData(nameof(TestData))] + public void Encode_TextWriter_CharArray_WithOffset(string input, string expectedOutput) + { + input = Unescape(input); + expectedOutput = Unescape(expectedOutput); + + // Arrange + + TextEncoder encoder = GetBatteryTextEncoder(); + StringWriter writer; + + // Act & assert - 1 + + writer = new StringWriter(); + encoder.Encode(writer, input.ToCharArray(), 0, input.Length); + Assert.Equal(expectedOutput, writer.ToString()); + + // Act & assert - 2 + + writer = new StringWriter(); + encoder.Encode(writer, ("xxx" + input + "yyy").ToCharArray(), 3, input.Length); + Assert.Equal(expectedOutput, writer.ToString()); + + // Act & assert - 3 + + writer = new StringWriter(); + encoder.Encode(writer, ("\ud800" + input + "\udfff").ToCharArray(), 1, input.Length); + Assert.Equal(expectedOutput, writer.ToString()); + } + + /* + * ESCAPING & UNESCAPING + * ===================== + * + * The xunit runner doesn't like strings that contain malformed UTF-16 data. + * To smuggle malformed UTF-16 data across the test runner, we'll encode all surrogate + * chars (not supplementary chars) as @XXXX. A supplementary char is thus represented + * as @XXXX@YYYY (10 chars total) in the stream. + */ + + private static string Escape(string value) + { + value = value.Replace(@"@", @"@0040"); + StringBuilder sb = new StringBuilder(value.Length); + foreach (char ch in value) + { + sb.Append(char.IsSurrogate(ch) ? FormattableString.Invariant($@"@{(int)ch:X4}") : ch); + } + return sb.ToString(); + } + + private static string Unescape(string value) + { + StringBuilder sb = new StringBuilder(value.Length); + for (int i = 0; i < value.Length; i++) + { + char ch = value[i]; + if (ch != '@') + { + sb.Append(ch); + } + else + { + sb.Append((char)ushort.Parse(value.Substring(i + 1, 4), NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture)); + i += 4; + } + } + return sb.ToString(); + } + } +} diff --git a/src/libraries/System.Text.Encodings.Web/tests/TextEncoderTests.cs b/src/libraries/System.Text.Encodings.Web/tests/TextEncoderTests.cs index e841e746c505a..7c49984450456 100644 --- a/src/libraries/System.Text.Encodings.Web/tests/TextEncoderTests.cs +++ b/src/libraries/System.Text.Encodings.Web/tests/TextEncoderTests.cs @@ -179,7 +179,7 @@ public void EncodeUtf8_MixedInputWhichRequiresEncodingOrReplacement() { destination = new byte[destinationLength]; - Assert.Equal(OperationStatus.Done, encoder.EncodeUtf8(aggregateInputBytesSoFar.ToArray(), destination, out bytesConsumed, out bytesWritten, isFinalBlock: false)); + Assert.Equal(OperationStatus.Done, encoder.EncodeUtf8(aggregateInputBytesSoFar.ToArray(), destination, out bytesConsumed, out bytesWritten, isFinalBlock: false)); Assert.Equal(aggregateInputBytesSoFar.Count, bytesConsumed); Assert.Equal(expectedOutputBytesSoFar.Count, bytesWritten); Assert.Equal(expectedOutputBytesSoFar.ToArray(), new Span(destination, 0, expectedOutputBytesSoFar.Count).ToArray()); @@ -275,5 +275,87 @@ public void FindFirstCharToEncodeUtf8_IllFormedData_ReturnsIndexOfIllFormedSubse Assert.Equal(expectedIndex, actualIndex); } + + [Theory] + [InlineData("", 0, "", 0, OperationStatus.Done)] + [InlineData("", 20, "", 0, OperationStatus.Done)] + [InlineData("ABC", 0, "", 0, OperationStatus.DestinationTooSmall)] + [InlineData("ABC", 2, "AB", 2, OperationStatus.DestinationTooSmall)] + [InlineData("ABC", 3, "ABC", 3, OperationStatus.Done)] + [InlineData("ABC", 30, "ABC", 3, OperationStatus.Done)] + [InlineData("ABC+DEF", 3, "ABC", 3, OperationStatus.DestinationTooSmall)] + [InlineData("ABC+DEF", 8, "ABC", 3, OperationStatus.DestinationTooSmall)] + [InlineData("ABC+DEF", 9, "ABC[002B]", 4, OperationStatus.DestinationTooSmall)] + [InlineData("ABC+DEF", 12, "ABC[002B]DEF", 7, OperationStatus.Done)] + public void EncodeUtf16_OperationStatus_AlphaNumericOnly(string input, int destBufferSize, string expectedOutput, int expectedCharsConsumed, OperationStatus expectedResult) + { + // Arrange + + var encoder = new ConfigurableScalarTextEncoder(scalar => UnicodeUtility.IsInRangeInclusive((uint)scalar | 0x20, 'a', 'z')); // allow only [A-Za-z] unescaped + using BoundedMemory boundedInput = BoundedMemory.AllocateFromExistingData(input.AsSpan()); + using BoundedMemory boundedOutput = BoundedMemory.Allocate(destBufferSize); + + // Act + + OperationStatus actualResult = encoder.Encode(boundedInput.Span, boundedOutput.Span, out int actualCharsConsumed, out int actualCharsWritten); + + // Assert + + Assert.Equal(expectedResult, actualResult); + Assert.Equal(expectedCharsConsumed, actualCharsConsumed); + Assert.Equal(expectedOutput, boundedOutput.Span.Slice(0, actualCharsWritten).ToString()); + } + + [Theory] + [InlineData("ABC\U0001F600", 4, "ABC", 3, OperationStatus.DestinationTooSmall)] // don't allow breaking across a surrogate + [InlineData("ABC\U0001F600", 5, "ABC\U0001F600", 5, OperationStatus.Done)] + public void EncodeUtf16_OperationStatus_AllowEverything(string input, int destBufferSize, string expectedOutput, int expectedCharsConsumed, OperationStatus expectedResult) + { + // Arrange + + var encoder = new ConfigurableScalarTextEncoder(_ => true); // allow all well-formed scalars + using BoundedMemory boundedInput = BoundedMemory.AllocateFromExistingData(input.AsSpan()); + using BoundedMemory boundedOutput = BoundedMemory.Allocate(destBufferSize); + + // Act + + OperationStatus actualResult = encoder.Encode(boundedInput.Span, boundedOutput.Span, out int actualCharsConsumed, out int actualCharsWritten); + + // Assert + + Assert.Equal(expectedResult, actualResult); + Assert.Equal(expectedCharsConsumed, actualCharsConsumed); + Assert.Equal(expectedOutput, boundedOutput.Span.Slice(0, actualCharsWritten).ToString()); + } + + [Theory] + [InlineData(new[] { 'A', 'B', '\ud83d' }, 2, true, "AB", 2, OperationStatus.DestinationTooSmall)] + [InlineData(new[] { 'A', 'B', '\ud83d' }, 2, false, "AB", 2, OperationStatus.NeedMoreData)] + [InlineData(new[] { 'A', 'B', '\ud83d' }, 3, true, "AB", 2, OperationStatus.DestinationTooSmall)] + [InlineData(new[] { 'A', 'B', '\ud83d' }, 3, false, "AB", 2, OperationStatus.NeedMoreData)] + [InlineData(new[] { 'A', 'B', '\ud83d' }, 10, true, "AB[FFFD]", 3, OperationStatus.Done)] + [InlineData(new[] { 'A', 'B', '\ud83d' }, 10, false, "AB", 2, OperationStatus.NeedMoreData)] + [InlineData(new[] { 'A', 'B', '\ud83d', '\ude00' }, 2, true, "AB", 2, OperationStatus.DestinationTooSmall)] + [InlineData(new[] { 'A', 'B', '\ud83d', '\ude00' }, 2, false, "AB", 2, OperationStatus.DestinationTooSmall)] + [InlineData(new[] { 'A', 'B', '\ud83d', '\ude00' }, 4, true, "AB\U0001F600", 4, OperationStatus.Done)] + [InlineData(new[] { 'A', 'B', '\ud83d', '\ude00' }, 4, false, "AB\U0001F600", 4, OperationStatus.Done)] + public void EncodeUtf16_OperationStatus_SurrogateHandlingEdgeCases(char[] input, int destBufferSize, bool isFinalBlock, string expectedOutput, int expectedCharsConsumed, OperationStatus expectedResult) + { + // Arrange + + var encoder = new ConfigurableScalarTextEncoder(_ => true); // allow all well-formed scalars + using BoundedMemory boundedInput = BoundedMemory.AllocateFromExistingData(input); + using BoundedMemory boundedOutput = BoundedMemory.Allocate(destBufferSize); + + // Act + + OperationStatus actualResult = encoder.Encode(boundedInput.Span, boundedOutput.Span, out int actualCharsConsumed, out int actualCharsWritten, isFinalBlock); + + // Assert + + Assert.Equal(expectedResult, actualResult); + Assert.Equal(expectedCharsConsumed, actualCharsConsumed); + Assert.Equal(expectedOutput, boundedOutput.Span.Slice(0, actualCharsWritten).ToString()); + } } } diff --git a/src/libraries/System.Text.Encodings.Web/tests/UnicodeEncoderBase.cs b/src/libraries/System.Text.Encodings.Web/tests/UnicodeEncoderBase.cs index 7e9a30b6aa184..77bb8410c74bb 100644 --- a/src/libraries/System.Text.Encodings.Web/tests/UnicodeEncoderBase.cs +++ b/src/libraries/System.Text.Encodings.Web/tests/UnicodeEncoderBase.cs @@ -190,7 +190,22 @@ private void EncodeCore(ref Writer writer, char* input, uint charsRemaining) { while (charsRemaining != 0) { - int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(input, endOfString: (charsRemaining == 1)); + int nextScalar = 0xFFFD; // Unicode replacement char + + char nextChar = input[0]; + if (!char.IsSurrogate(nextChar)) + { + nextScalar = nextChar; + } + else if (char.IsHighSurrogate(nextChar) && charsRemaining > 1) + { + char followingChar = input[1]; + if (char.IsLowSurrogate(followingChar)) + { + nextScalar = char.ConvertToUtf32(nextChar, followingChar); + } + } + if (UnicodeHelpers.IsSupplementaryCodePoint(nextScalar)) { // Supplementary characters should always be encoded numerically. diff --git a/src/libraries/System.Text.Encodings.Web/tests/UnicodeEncoderBaseTests.cs b/src/libraries/System.Text.Encodings.Web/tests/UnicodeEncoderBaseTests.cs index c72cc67b668ba..1db46b53e03e8 100644 --- a/src/libraries/System.Text.Encodings.Web/tests/UnicodeEncoderBaseTests.cs +++ b/src/libraries/System.Text.Encodings.Web/tests/UnicodeEncoderBaseTests.cs @@ -89,7 +89,7 @@ public void Encode_AllRangesAllowed_StillEncodesForbiddenChars_Extended() { mustEncode = true; // control char } - else if (!UnicodeHelpers.IsCharacterDefined((char)i)) + else if (!UnicodeTestHelpers.IsCharacterDefined((char)i)) { mustEncode = true; // undefined (or otherwise disallowed) char } diff --git a/src/libraries/System.Text.Encodings.Web/tests/UnicodeHelpersTests.cs b/src/libraries/System.Text.Encodings.Web/tests/UnicodeHelpersTests.cs index 7b7f115640174..a20703f50241a 100644 --- a/src/libraries/System.Text.Encodings.Web/tests/UnicodeHelpersTests.cs +++ b/src/libraries/System.Text.Encodings.Web/tests/UnicodeHelpersTests.cs @@ -20,68 +20,6 @@ public unsafe class UnicodeHelpersTests private static readonly UTF8Encoding _utf8EncodingThrowOnInvalidBytes = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true); - // To future refactorers: - // The following GetScalarValueFromUtf16_* tests must not be done as a [Theory]. If done via [InlineData], the invalid - // code points will get sanitized with replacement characters before they even reach the test, as the strings are parsed - // from the attributes in reflection. And if done via [MemberData], the XmlWriter used by xunit will throw exceptions - // when it attempts to write out the test arguments, due to the invalid text. - - [Fact] - public void GetScalarValueFromUtf16_NormalBMPChar_EndOfString() - { - GetScalarValueFromUtf16("a", 'a'); - } - - [Fact] - public void GetScalarValueFromUtf16_NormalBMPChar_NotEndOfString() - { - GetScalarValueFromUtf16("ab", 'a'); - } - - [Fact] - public void GetScalarValueFromUtf16_TrailingSurrogate_EndOfString() - { - GetScalarValueFromUtf16("\uDFFF", UnicodeReplacementChar); - } - - [Fact] - public void GetScalarValueFromUtf16_TrailingSurrogate_NotEndOfString() - { - GetScalarValueFromUtf16("\uDFFFx", UnicodeReplacementChar); - } - - [Fact] - public void GetScalarValueFromUtf16_LeadingSurrogate_EndOfString() - { - GetScalarValueFromUtf16("\uD800", UnicodeReplacementChar); - } - - [Fact] - public void GetScalarValueFromUtf16_LeadingSurrogate_NotEndOfString() - { - GetScalarValueFromUtf16("\uD800x", UnicodeReplacementChar); - } - - [Fact] - public void GetScalarValueFromUtf16_LeadingSurrogate_NotEndOfString_FollowedByLeadingSurrogate() - { - GetScalarValueFromUtf16("\uD800\uD800", UnicodeReplacementChar); - } - - [Fact] - public void GetScalarValueFromUtf16_LeadingSurrogate_NotEndOfString_FollowedByTrailingSurrogate() - { - GetScalarValueFromUtf16("\uD800\uDFFF", 0x103FF); - } - - private void GetScalarValueFromUtf16(string input, int expectedResult) - { - fixed (char* pInput = input) - { - Assert.Equal(expectedResult, UnicodeHelpers.GetScalarValueFromUtf16(pInput, endOfString: (input.Length == 1))); - } - } - [Fact] public void GetUtf8RepresentationForScalarValue() { @@ -111,7 +49,7 @@ public void GetUtf8RepresentationForScalarValue() [Fact] public void IsCharacterDefined() { - Assert.All(ReadListOfDefinedCharacters().Select((defined, idx) => new { defined, idx }), c => Assert.Equal(c.defined, UnicodeHelpers.IsCharacterDefined((char)c.idx))); + Assert.All(ReadListOfDefinedCharacters().Select((defined, idx) => new { defined, idx }), c => Assert.Equal(c.defined, UnicodeTestHelpers.IsCharacterDefined((char)c.idx))); } private static bool[] ReadListOfDefinedCharacters() diff --git a/src/libraries/System.Text.Encodings.Web/tests/UnicodeTestHelpers.cs b/src/libraries/System.Text.Encodings.Web/tests/UnicodeTestHelpers.cs new file mode 100644 index 0000000000000..e69d7156b2221 --- /dev/null +++ b/src/libraries/System.Text.Encodings.Web/tests/UnicodeTestHelpers.cs @@ -0,0 +1,22 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +namespace System.Text.Unicode +{ + internal static class UnicodeTestHelpers + { + /// + /// Returns a value stating whether a character is defined per the checked-in version + /// of the Unicode specification. Certain classes of characters (control chars, + /// private use, surrogates, some whitespace) are considered "undefined" for + /// our purposes. + /// + internal static bool IsCharacterDefined(char c) + { + uint codePoint = (uint)c; + int index = (int)(codePoint >> 5); + int offset = (int)(codePoint & 0x1FU); + return ((UnicodeHelpers.GetDefinedCharacterBitmap()[index] >> offset) & 0x1U) != 0; + } + } +} diff --git a/src/libraries/System.Text.Encodings.Web/tests/UrlEncoderTests.cs b/src/libraries/System.Text.Encodings.Web/tests/UrlEncoderTests.cs index 46bb504eaeec6..70fc8ad62c10f 100644 --- a/src/libraries/System.Text.Encodings.Web/tests/UrlEncoderTests.cs +++ b/src/libraries/System.Text.Encodings.Web/tests/UrlEncoderTests.cs @@ -114,7 +114,7 @@ public void UrlEncode_AllRangesAllowed_StillEncodesForbiddenChars() } else if ((0x00A0 <= i && i <= 0xD7FF) | (0xF900 <= i && i <= 0xFDCF) | (0xFDF0 <= i && i <= 0xFFEF)) { - mustEncode = !UnicodeHelpers.IsCharacterDefined((char)i); // 'ucschar' + mustEncode = !UnicodeTestHelpers.IsCharacterDefined((char)i); // 'ucschar' } else { diff --git a/src/libraries/libraries-packages.proj b/src/libraries/libraries-packages.proj index 4bd094c79f66a..4cba0746c5bfc 100644 --- a/src/libraries/libraries-packages.proj +++ b/src/libraries/libraries-packages.proj @@ -23,6 +23,7 @@ + diff --git a/src/libraries/pkg/baseline/packageIndex.json b/src/libraries/pkg/baseline/packageIndex.json index d129922956ab9..49111acbfe0d4 100644 --- a/src/libraries/pkg/baseline/packageIndex.json +++ b/src/libraries/pkg/baseline/packageIndex.json @@ -6440,21 +6440,28 @@ "4.3.1", "4.4.0", "4.5.0", - "4.6.0", - "5.0.0" + "4.5.1", + "4.7.1", + "4.7.2", + "5.0.0", + "5.0.1" ], - "BaselineVersion": "5.0.0", + "BaselineVersion": "5.0.1", "InboxOn": { "netcoreapp3.0": "4.0.4.0", - "net5.0": "5.0.0.0" + "net5.0": "5.0.0.1" }, "AssemblyVersionInPackageVersion": { "4.0.0.0": "4.0.0", "4.0.1.0": "4.3.0", "4.0.2.0": "4.4.0", "4.0.3.0": "4.5.0", + "4.0.3.1": "4.5.1", "4.0.4.0": "4.6.0", - "5.0.0.0": "5.0.0" + "4.0.5.0": "4.7.0", + "4.0.5.1": "4.7.2", + "5.0.0.0": "5.0.0", + "5.0.0.1": "5.0.1" } }, "System.Text.Json": {