From 9e986b5704949f3d41d55a71738440645a06947f Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Thu, 18 Nov 2021 09:30:24 -0500 Subject: [PATCH] Add single char lazy loop support to simplified Regex code gen (#61698) * Reduce atomic single char lazy loops * Add single char lazy loop support to simplified code gen --- .../gen/RegexGenerator.Emitter.cs | 106 +++++++++++++++- .../Text/RegularExpressions/RegexCompiler.cs | 117 +++++++++++++++++- .../Text/RegularExpressions/RegexNode.cs | 44 ++++--- .../tests/RegexReductionTests.cs | 11 +- 4 files changed, 251 insertions(+), 27 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index fd8309602a0cf..87a1e7cf21213 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -1198,7 +1198,7 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck case RegexNode.Onelazy: case RegexNode.Notonelazy: case RegexNode.Setlazy: - EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired); + EmitSingleCharLazy(node, subsequent, emitLengthChecksIfRequired); break; case RegexNode.Concatenate: @@ -1615,7 +1615,6 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL string endLoop = DefineLabel("EndLoop"); string startingPos = NextLocalName("startingRunTextPos"); string endingPos = NextLocalName("endingRunTextPos"); - string crawlPos = NextLocalName("crawlPos"); // We're about to enter a loop, so ensure our text position is 0. TransferTextSpanPosToRunTextPos(); @@ -1629,7 +1628,12 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? 
subsequent = null, bool emitL EmitSingleCharAtomicLoop(node); TransferTextSpanPosToRunTextPos(); writer.WriteLine($"int {endingPos} = runtextpos;"); - writer.WriteLine($"int {crawlPos} = base.Crawlpos();"); + string? crawlPos = null; + if (expressionHasCaptures) + { + crawlPos = NextLocalName("crawlPos"); + writer.WriteLine($"int {crawlPos} = base.Crawlpos();"); + } if (node.M > 0) { writer.WriteLine($"{startingPos} += {node.M};"); @@ -1678,6 +1682,102 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL // It's left pointing to the backtracking label for everything subsequent in the expression. } + void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitLengthChecksIfRequired = true) + { + // Emit the min iterations as a repeater. Any failures here don't necessitate backtracking, + // as the lazy itself failed to match. + if (node.M > 0) + { + EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired); + } + + // If the whole thing was actually that repeater, we're done. + if (node.M == node.N) + { + return; + } + + Debug.Assert(node.M < node.N); + + // We now need to match one character at a time, each time allowing the remainder of the expression + // to try to match, and only matching another character if the subsequent expression fails to match. + + // We're about to enter a loop, so ensure our text position is 0. + TransferTextSpanPosToRunTextPos(); + + // If the loop isn't unbounded, track the number of iterations and the max number to allow. + string? iterationCount = null; + string? maxIterations = null; + if (node.N != int.MaxValue) + { + iterationCount = NextLocalName("i"); + maxIterations = NextLocalName("maxIterations"); + writer.WriteLine($"int {iterationCount} = 0;"); + writer.WriteLine($"int {maxIterations} = {node.N - node.M};"); + } + + // Track the current crawl position. Upon backtracking, we'll unwind any captures beyond this point. + string? 
crawlPos = null; + if (expressionHasCaptures) + { + crawlPos = NextLocalName("crawlPos"); + writer.WriteLine($"int {crawlPos} = base.Crawlpos();"); + } + + // Track the current runtextpos. Each time we backtrack, we'll reset to the stored position, which + // is also incremented each time we match another character in the loop. + string nextPos = NextLocalName("nextPos"); + writer.WriteLine($"int {nextPos} = runtextpos;"); + + // Skip the backtracking section for the initial subsequent matching. We've already matched the + // minimum number of iterations, which means we can successfully match with zero additional iterations. + string endLoopLabel = DefineLabel("endLoop"); + writer.WriteLine($"goto {endLoopLabel};"); + writer.WriteLine(); + + // Backtracking section. Subsequent failures will jump to here. + string backtrackingLabel = DefineLabel("Backtrack"); + MarkLabel(backtrackingLabel); + + // Uncapture any captures if the expression has any. It's possible the captures it has + // are before this node, in which case this is wasted effort, but still functionally correct. + if (expressionHasCaptures) + { + EmitUncaptureUntil(crawlPos); + } + + // If there's a max number of iterations, see if we've exceeded the maximum number of characters + // to match. If we haven't, increment the iteration count. + if (maxIterations is not null) + { + using (EmitBlock(writer, $"if ({iterationCount} >= {maxIterations})")) + { + writer.WriteLine($"goto {doneLabel};"); + } + writer.WriteLine($"{iterationCount}++;"); + } + + // Now match the next character in the lazy loop. We need to reset the runtextpos to the position + // just after the last character in this loop was matched, and we need to store the resulting position + // for the next time we backtrack. 
+ writer.WriteLine($"runtextpos = {nextPos};"); + LoadTextSpanLocal(writer); + EmitSingleChar(node); + TransferTextSpanPosToRunTextPos(); + writer.WriteLine($"{nextPos} = runtextpos;"); + + // Update the done label for everything that comes after this node. This is done after we emit the single char + // matching, as that failing indicates the loop itself has failed to match. + string originalDoneLabel = doneLabel; + doneLabel = backtrackingLabel; // leave set to the backtracking label for all subsequent nodes + + writer.WriteLine(); + MarkLabel(endLoopLabel); + + // We explicitly do not reset doneLabel back to originalDoneLabel. + // It's left pointing to the backtracking label for everything subsequent in the expression. + } + // Emits the code to handle a loop (repeater) with a fixed number of iterations. // RegexNode.M is used for the number of iterations; RegexNode.N is ignored. void EmitSingleCharFixedRepeater(RegexNode node, bool emitLengthCheck = true) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 49b88d3954cab..3a57f1840516c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -2122,7 +2122,7 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck case RegexNode.Onelazy: case RegexNode.Notonelazy: case RegexNode.Setlazy: - EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired); + EmitSingleCharLazy(node, subsequent, emitLengthChecksIfRequired); break; case RegexNode.Concatenate: @@ -2558,6 +2558,121 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL MarkLabel(endLoop); } + void EmitSingleCharLazy(RegexNode node, RegexNode? 
subsequent = null, bool emitLengthChecksIfRequired = true) + { + // Emit the min iterations as a repeater. Any failures here don't necessitate backtracking, + // as the lazy itself failed to match. + if (node.M > 0) + { + EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired); + } + + // If the whole thing was actually that repeater, we're done. + if (node.M == node.N) + { + return; + } + + Debug.Assert(node.M < node.N); + + // We now need to match one character at a time, each time allowing the remainder of the expression + // to try to match, and only matching another character if the subsequent expression fails to match. + + // We're about to enter a loop, so ensure our text position is 0. + TransferTextSpanPosToRunTextPos(); + + // If the loop isn't unbounded, track the number of iterations and the max number to allow. + LocalBuilder? iterationCount = null; + LocalBuilder? maxIterations = null; + if (node.N != int.MaxValue) + { + // int iterationCount = 0; + // int maxIterations = node.N - node.M; + iterationCount = DeclareInt32(); + maxIterations = DeclareInt32(); + Ldc(0); + Stloc(iterationCount); + Ldc(node.N - node.M); + Stloc(maxIterations); + } + + // Track the current crawl position. Upon backtracking, we'll unwind any captures beyond this point. + LocalBuilder? crawlPos = null; + if (expressionHasCaptures) + { + // int crawlPos = base.Crawlpos(); + crawlPos = DeclareInt32(); + Ldthis(); + Call(s_crawlposMethod); + Stloc(crawlPos); + } + + // Track the current runtextpos. Each time we backtrack, we'll reset to the stored position, which + // is also incremented each time we match another character in the loop. + // int nextPos = runtextpos; + LocalBuilder nextPos = DeclareInt32(); + Ldloc(runtextposLocal); + Stloc(nextPos); + + // Skip the backtracking section for the initial subsequent matching. We've already matched the + // minimum number of iterations, which means we can successfully match with zero additional iterations. 
+ // goto endLoopLabel; + Label endLoopLabel = DefineLabel(); + BrFar(endLoopLabel); + + // Backtracking section. Subsequent failures will jump to here. + Label backtrackingLabel = DefineLabel(); + MarkLabel(backtrackingLabel); + + // Uncapture any captures if the expression has any. It's possible the captures it has + // are before this node, in which case this is wasted effort, but still functionally correct. + if (expressionHasCaptures) + { + EmitUncaptureUntil(crawlPos!); + } + + // If there's a max number of iterations, see if we've exceeded the maximum number of characters + // to match. If we haven't, increment the iteration count. + if (maxIterations is not null) + { + // if (iterationCount >= maxIterations) goto doneLabel; + Ldloc(iterationCount!); + Ldloc(maxIterations); + BgeFar(doneLabel); + + // iterationCount++; + Ldloc(iterationCount!); + Ldc(1); + Add(); + Stloc(iterationCount!); + } + + // Now match the next character in the lazy loop. We need to reset the runtextpos to the position + // just after the last character in this loop was matched, and we need to store the resulting position + // for the next time we backtrack. + + // runtextpos = nextPos; + // MatchSingleChar(); + // nextPos = runtextpos; + Ldloc(nextPos); + Stloc(runtextposLocal); + LoadTextSpanLocal(); + EmitSingleChar(node); + TransferTextSpanPosToRunTextPos(); + Ldloc(runtextposLocal); + Stloc(nextPos); + + // Update the done label for everything that comes after this node. This is done after we emit the single char + // matching, as that failing indicates the loop itself has failed to match. + Label originalDoneLabel = doneLabel; + doneLabel = backtrackingLabel; // leave set to the backtracking label for all subsequent nodes + + MarkLabel(endLoopLabel); + + // We explicitly do not reset doneLabel back to originalDoneLabel. + // It's left pointing to the backtracking label for everything subsequent in the expression.
+ } + // Emits the code to handle a loop (repeater) with a fixed number of iterations. // RegexNode.M is used for the number of iterations; RegexNode.N is ignored. void EmitSingleCharFixedRepeater(RegexNode node, bool emitLengthChecksIfRequired = true) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index e7b0e71076df1..88a2800f8f96b 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -216,17 +216,21 @@ private void MakeLoopAtomic() { switch (Type) { - case Oneloop: - Type = Oneloopatomic; + case Oneloop or Notoneloop or Setloop: + // For loops, we simply change the Type to the atomic variant. + // Atomic greedy loops should consume as many values as they can. + Type += Oneloopatomic - Oneloop; break; - case Notoneloop: - Type = Notoneloopatomic; + + case Onelazy or Notonelazy or Setlazy: + // For lazy, we not only change the Type, we also lower the max number of iterations + // to the minimum number of iterations, as they should end up matching as little as possible. + Type += Oneloopatomic - Onelazy; + N = M; break; + default: -#if DEBUG - Debug.Assert(Type == Setloop, $"Unexpected type: {TypeName}"); -#endif - Type = Setloopatomic; + Debug.Fail($"Unexpected type: {Type}"); break; } } @@ -445,11 +449,15 @@ private void EliminateEndingBacktracking() { switch (node.Type) { - // {One/Notone/Set}loops can be upgraded to {One/Notone/Set}loopatomic nodes, - // e.g. [abc]* => (?>[abc]*) + // {One/Notone/Set}loops can be upgraded to {One/Notone/Set}loopatomic nodes, e.g. [abc]* => (?>[abc]*). + // And {One/Notone/Set}lazys can similarly be upgraded to be atomic, which really makes them into repeaters + // or even empty nodes. 
case Oneloop: case Notoneloop: case Setloop: + case Onelazy: + case Notonelazy: + case Setlazy: node.MakeLoopAtomic(); break; @@ -642,11 +650,14 @@ private RegexNode ReduceAtomic() case Setloopatomic: return child; - // If an atomic subexpression contains only a {one/notone/set}loop, + // If an atomic subexpression contains only a {one/notone/set}{loop/lazy}, // change it to be an {one/notone/set}loopatomic and remove the atomic node. case Oneloop: case Notoneloop: case Setloop: + case Onelazy: + case Notonelazy: + case Setlazy: child.MakeLoopAtomic(); return child; @@ -2229,11 +2240,14 @@ internal bool SupportsSimplifiedCodeGenerationImplementation() supported = true; break; - // Single character greedy loops are supported if they're either they're actually a repeater + // Single character greedy/lazy loops are supported if either they're actually a repeater // or they're not contained in any construct other than simple nesting (e.g. concat, capture). case Oneloop: case Notoneloop: case Setloop: + case Onelazy: + case Notonelazy: + case Setlazy: Debug.Assert(Next == null || Next.Type != Atomic, "Loop should have been transformed into an atomic type."); supported = M == N || AncestorsAllowBacktracking(Next); static bool AncestorsAllowBacktracking(RegexNode? node) @@ -2257,12 +2271,6 @@ static bool AncestorsAllowBacktracking(RegexNode? node) } break; - case Onelazy: - case Notonelazy: - case Setlazy: - supported = M == N || (Next != null && Next.Type == Atomic); - break; - // {Lazy}Loop repeaters are the same, except their child also needs to be supported. // We also support such loops being atomic. 
case Loop: diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs index fc62a3d56c664..d541964bfb6f0 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs @@ -410,8 +410,6 @@ public void PatternsReduceIdentically(string pattern1, string pattern2) { throw new Xunit.Sdk.EqualException(result2, result1); } - - Assert.NotEqual(GetRegexCodes(new Regex(pattern1, RegexOptions.RightToLeft)), GetRegexCodes(new Regex(pattern2))); } [Theory] @@ -476,9 +474,12 @@ public void PatternsReduceIdentically(string pattern1, string pattern2) [InlineData("(?:ab|cd|ae)f", "(?>ab|cd|ae)f")] public void PatternsReduceDifferently(string pattern1, string pattern2) { - var r1 = new Regex(pattern1); - var r2 = new Regex(pattern2); - Assert.NotEqual(GetRegexCodes(r1), GetRegexCodes(r2)); + string result1 = GetRegexCodes(new Regex(pattern1)); + string result2 = GetRegexCodes(new Regex(pattern2)); + if (result1 == result2) // identical codes mean the patterns did not reduce differently, so fail as a not-equal assertion + { + throw new Xunit.Sdk.NotEqualException(result2, result1); + } } [Theory]