Skip to content

Commit

Permalink
Add single char lazy loop support to simplified Regex code gen (#61698)
Browse files Browse the repository at this point in the history
* Reduce atomic single char lazy loops

* Add single char lazy loop support to simplified code gen
  • Loading branch information
stephentoub committed Nov 18, 2021
1 parent 4db3531 commit 9e986b5
Show file tree
Hide file tree
Showing 4 changed files with 251 additions and 27 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1198,7 +1198,7 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
case RegexNode.Onelazy:
case RegexNode.Notonelazy:
case RegexNode.Setlazy:
EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired);
EmitSingleCharLazy(node, subsequent, emitLengthChecksIfRequired);
break;

case RegexNode.Concatenate:
Expand Down Expand Up @@ -1615,7 +1615,6 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL
string endLoop = DefineLabel("EndLoop");
string startingPos = NextLocalName("startingRunTextPos");
string endingPos = NextLocalName("endingRunTextPos");
string crawlPos = NextLocalName("crawlPos");

// We're about to enter a loop, so ensure our text position is 0.
TransferTextSpanPosToRunTextPos();
Expand All @@ -1629,7 +1628,12 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL
EmitSingleCharAtomicLoop(node);
TransferTextSpanPosToRunTextPos();
writer.WriteLine($"int {endingPos} = runtextpos;");
writer.WriteLine($"int {crawlPos} = base.Crawlpos();");
string? crawlPos = null;
if (expressionHasCaptures)
{
crawlPos = NextLocalName("crawlPos");
writer.WriteLine($"int {crawlPos} = base.Crawlpos();");
}
if (node.M > 0)
{
writer.WriteLine($"{startingPos} += {node.M};");
Expand Down Expand Up @@ -1678,6 +1682,102 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL
// It's left pointing to the backtracking label for everything subsequent in the expression.
}

// Emits the code to handle a lazy single-character loop (Onelazy, Notonelazy, Setlazy).
// The first node.M iterations are emitted as a fixed repeater. After that, up to node.N - node.M
// additional characters are matched one at a time, and only when the code emitted after this node
// fails and jumps to the backtracking label emitted here.
// On return (unless node.M == node.N), doneLabel is left pointing at that backtracking label.
// The "subsequent" parameter mirrors EmitSingleCharLoop's signature and is not used by this method.
void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitLengthChecksIfRequired = true)
{
    // Emit the min iterations as a repeater. Any failures here don't necessitate backtracking,
    // as the lazy itself failed to match.
    if (node.M > 0)
    {
        EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired);
    }

    // If the whole thing was actually that repeater, we're done.
    if (node.M == node.N)
    {
        return;
    }

    Debug.Assert(node.M < node.N);

    // We now need to match one character at a time, each time allowing the remainder of the expression
    // to try to match, and only matching another character if the subsequent expression fails to match.

    // We're about to enter a loop, so ensure our text position is 0.
    TransferTextSpanPosToRunTextPos();

    // If the loop isn't unbounded, track the number of iterations and the max number to allow.
    // Both names stay null for an unbounded loop (node.N == int.MaxValue).
    string? iterationCount = null;
    string? maxIterations = null;
    if (node.N != int.MaxValue)
    {
        iterationCount = NextLocalName("i");
        maxIterations = NextLocalName("maxIterations");
        writer.WriteLine($"int {iterationCount} = 0;");
        writer.WriteLine($"int {maxIterations} = {node.N - node.M};");
    }

    // Track the current crawl position. Upon backtracking, we'll unwind any captures beyond this point.
    // The local is only declared when the expression has captures; otherwise crawlPos stays null.
    string? crawlPos = null;
    if (expressionHasCaptures)
    {
        crawlPos = NextLocalName("crawlPos");
        writer.WriteLine($"int {crawlPos} = base.Crawlpos();");
    }

    // Track the current runtextpos. Each time we backtrack, we'll reset to the stored position, which
    // is also incremented each time we match another character in the loop.
    string nextPos = NextLocalName("nextPos");
    writer.WriteLine($"int {nextPos} = runtextpos;");

    // Skip the backtracking section for the initial subsequent matching. We've already matched the
    // minimum number of iterations, which means we can successfully match with zero additional iterations.
    string endLoopLabel = DefineLabel("endLoop");
    writer.WriteLine($"goto {endLoopLabel};");
    writer.WriteLine();

    // Backtracking section. Subsequent failures will jump to here.
    string backtrackingLabel = DefineLabel("Backtrack");
    MarkLabel(backtrackingLabel);

    // Uncapture any captures if the expression has any. It's possible the captures it has
    // are before this node, in which case this is wasted effort, but still functionally correct.
    // crawlPos is non-null exactly when expressionHasCaptures was true above.
    if (crawlPos is not null)
    {
        EmitUncaptureUntil(crawlPos);
    }

    // If there's a max number of iterations, see if we've exceeded the maximum number of characters
    // to match. If we haven't, increment the iteration count.
    if (maxIterations is not null)
    {
        // doneLabel still refers to the label that was current on entry, so exhausting the
        // iteration budget fails the loop as a whole rather than re-entering this section.
        using (EmitBlock(writer, $"if ({iterationCount} >= {maxIterations})"))
        {
            writer.WriteLine($"goto {doneLabel};");
        }
        writer.WriteLine($"{iterationCount}++;");
    }

    // Now match the next character in the lazy loop. We need to reset the runtextpos to the position
    // just after the last character in this loop was matched, and we need to store the resulting position
    // for the next time we backtrack.
    writer.WriteLine($"runtextpos = {nextPos};");
    LoadTextSpanLocal(writer);
    EmitSingleChar(node);
    TransferTextSpanPosToRunTextPos();
    writer.WriteLine($"{nextPos} = runtextpos;");

    // Update the done label for everything that comes after this node. This is done after we emit the single char
    // matching, as that failing indicates the loop itself has failed to match.
    // The previous doneLabel is intentionally not restored: it is left pointing to the backtracking
    // label for everything subsequent in the expression.
    doneLabel = backtrackingLabel;

    writer.WriteLine();
    MarkLabel(endLoopLabel);
}

// Emits the code to handle a loop (repeater) with a fixed number of iterations.
// RegexNode.M is used for the number of iterations; RegexNode.N is ignored.
void EmitSingleCharFixedRepeater(RegexNode node, bool emitLengthCheck = true)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2122,7 +2122,7 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
case RegexNode.Onelazy:
case RegexNode.Notonelazy:
case RegexNode.Setlazy:
EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired);
EmitSingleCharLazy(node, subsequent, emitLengthChecksIfRequired);
break;

case RegexNode.Concatenate:
Expand Down Expand Up @@ -2558,6 +2558,121 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL
MarkLabel(endLoop);
}

// Emits the IL to handle a lazy single-character loop (Onelazy, Notonelazy, Setlazy).
// The first node.M iterations are emitted as a fixed repeater. After that, up to node.N - node.M
// additional characters are matched one at a time, and only when the code emitted after this node
// fails and branches to the backtracking label marked here.
// On return (unless node.M == node.N), doneLabel is left pointing at that backtracking label.
// The "subsequent" parameter mirrors EmitSingleCharLoop's signature and is not used by this method.
void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitLengthChecksIfRequired = true)
{
    // Emit the min iterations as a repeater. Any failures here don't necessitate backtracking,
    // as the lazy itself failed to match.
    if (node.M > 0)
    {
        EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired);
    }

    // If the whole thing was actually that repeater, we're done.
    if (node.M == node.N)
    {
        return;
    }

    Debug.Assert(node.M < node.N);

    // We now need to match one character at a time, each time allowing the remainder of the expression
    // to try to match, and only matching another character if the subsequent expression fails to match.

    // We're about to enter a loop, so ensure our text position is 0.
    TransferTextSpanPosToRunTextPos();

    // If the loop isn't unbounded, track the number of iterations and the max number to allow.
    // Both locals stay null for an unbounded loop (node.N == int.MaxValue).
    LocalBuilder? iterationCount = null;
    LocalBuilder? maxIterations = null;
    if (node.N != int.MaxValue)
    {
        // int iterationCount = 0;
        // int maxIterations = node.N - node.M;
        iterationCount = DeclareInt32();
        maxIterations = DeclareInt32();
        Ldc(0);
        Stloc(iterationCount);
        Ldc(node.N - node.M);
        Stloc(maxIterations);
    }

    // Track the current crawl position. Upon backtracking, we'll unwind any captures beyond this point.
    // The local is only declared when the expression has captures; otherwise crawlPos stays null.
    LocalBuilder? crawlPos = null;
    if (expressionHasCaptures)
    {
        // int crawlPos = base.Crawlpos();
        crawlPos = DeclareInt32();
        Ldthis();
        Call(s_crawlposMethod);
        Stloc(crawlPos);
    }

    // Track the current runtextpos. Each time we backtrack, we'll reset to the stored position, which
    // is also incremented each time we match another character in the loop.
    // int nextPos = runtextpos;
    LocalBuilder nextPos = DeclareInt32();
    Ldloc(runtextposLocal);
    Stloc(nextPos);

    // Skip the backtracking section for the initial subsequent matching. We've already matched the
    // minimum number of iterations, which means we can successfully match with zero additional iterations.
    // goto endLoopLabel;
    Label endLoopLabel = DefineLabel();
    BrFar(endLoopLabel);

    // Backtracking section. Subsequent failures will jump to here.
    Label backtrackingLabel = DefineLabel();
    MarkLabel(backtrackingLabel);

    // Uncapture any captures if the expression has any. It's possible the captures it has
    // are before this node, in which case this is wasted effort, but still functionally correct.
    // crawlPos is non-null exactly when expressionHasCaptures was true above.
    if (crawlPos is not null)
    {
        EmitUncaptureUntil(crawlPos);
    }

    // If there's a max number of iterations, see if we've exceeded the maximum number of characters
    // to match. If we haven't, increment the iteration count.
    // The two locals are always assigned together, so testing both is equivalent to testing one.
    if (iterationCount is not null && maxIterations is not null)
    {
        // if (iterationCount >= maxIterations) goto doneLabel;
        // doneLabel still refers to the label that was current on entry, so exhausting the
        // iteration budget fails the loop as a whole rather than re-entering this section.
        Ldloc(iterationCount);
        Ldloc(maxIterations);
        BgeFar(doneLabel);

        // iterationCount++;
        Ldloc(iterationCount);
        Ldc(1);
        Add();
        Stloc(iterationCount);
    }

    // Now match the next character in the lazy loop. We need to reset the runtextpos to the position
    // just after the last character in this loop was matched, and we need to store the resulting position
    // for the next time we backtrack.

    // runtextpos = nextPos;
    // MatchSingleChar();
    // nextPos = runtextpos;
    Ldloc(nextPos);
    Stloc(runtextposLocal);
    LoadTextSpanLocal();
    EmitSingleChar(node);
    TransferTextSpanPosToRunTextPos();
    Ldloc(runtextposLocal);
    Stloc(nextPos);

    // Update the done label for everything that comes after this node. This is done after we emit the single char
    // matching, as that failing indicates the loop itself has failed to match.
    // The previous doneLabel is intentionally not restored: it is left pointing to the backtracking
    // label for everything subsequent in the expression.
    doneLabel = backtrackingLabel;

    MarkLabel(endLoopLabel);
}

// Emits the code to handle a loop (repeater) with a fixed number of iterations.
// RegexNode.M is used for the number of iterations; RegexNode.N is ignored.
void EmitSingleCharFixedRepeater(RegexNode node, bool emitLengthChecksIfRequired = true)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -216,17 +216,21 @@ private void MakeLoopAtomic()
{
switch (Type)
{
case Oneloop:
Type = Oneloopatomic;
case Oneloop or Notoneloop or Setloop:
// For loops, we simply change the Type to the atomic variant.
// Atomic greedy loops should consume as many values as they can.
Type += Oneloopatomic - Oneloop;
break;
case Notoneloop:
Type = Notoneloopatomic;

case Onelazy or Notonelazy or Setlazy:
// For lazy, we not only change the Type, we also lower the max number of iterations
// to the minimum number of iterations, as they should end up matching as little as possible.
Type += Oneloopatomic - Onelazy;
N = M;
break;

default:
#if DEBUG
Debug.Assert(Type == Setloop, $"Unexpected type: {TypeName}");
#endif
Type = Setloopatomic;
Debug.Fail($"Unexpected type: {Type}");
break;
}
}
Expand Down Expand Up @@ -445,11 +449,15 @@ private void EliminateEndingBacktracking()
{
switch (node.Type)
{
// {One/Notone/Set}loops can be upgraded to {One/Notone/Set}loopatomic nodes,
// e.g. [abc]* => (?>[abc]*)
// {One/Notone/Set}loops can be upgraded to {One/Notone/Set}loopatomic nodes, e.g. [abc]* => (?>[abc]*).
// And {One/Notone/Set}lazys can similarly be upgraded to be atomic, which really makes them into repeaters
// or even empty nodes.
case Oneloop:
case Notoneloop:
case Setloop:
case Onelazy:
case Notonelazy:
case Setlazy:
node.MakeLoopAtomic();
break;

Expand Down Expand Up @@ -642,11 +650,14 @@ private RegexNode ReduceAtomic()
case Setloopatomic:
return child;

// If an atomic subexpression contains only a {one/notone/set}loop,
// If an atomic subexpression contains only a {one/notone/set}{loop/lazy},
// change it to be a {one/notone/set}loopatomic and remove the atomic node.
case Oneloop:
case Notoneloop:
case Setloop:
case Onelazy:
case Notonelazy:
case Setlazy:
child.MakeLoopAtomic();
return child;

Expand Down Expand Up @@ -2229,11 +2240,14 @@ internal bool SupportsSimplifiedCodeGenerationImplementation()
supported = true;
break;

// Single character greedy loops are supported if they're either they're actually a repeater
// Single character greedy/lazy loops are supported if either they're actually a repeater
// or they're not contained in any construct other than simple nesting (e.g. concat, capture).
case Oneloop:
case Notoneloop:
case Setloop:
case Onelazy:
case Notonelazy:
case Setlazy:
Debug.Assert(Next == null || Next.Type != Atomic, "Loop should have been transformed into an atomic type.");
supported = M == N || AncestorsAllowBacktracking(Next);
static bool AncestorsAllowBacktracking(RegexNode? node)
Expand All @@ -2257,12 +2271,6 @@ static bool AncestorsAllowBacktracking(RegexNode? node)
}
break;

case Onelazy:
case Notonelazy:
case Setlazy:
supported = M == N || (Next != null && Next.Type == Atomic);
break;

// {Lazy}Loop repeaters are the same, except their child also needs to be supported.
// We also support such loops being atomic.
case Loop:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -410,8 +410,6 @@ public void PatternsReduceIdentically(string pattern1, string pattern2)
{
throw new Xunit.Sdk.EqualException(result2, result1);
}

Assert.NotEqual(GetRegexCodes(new Regex(pattern1, RegexOptions.RightToLeft)), GetRegexCodes(new Regex(pattern2)));
}

[Theory]
Expand Down Expand Up @@ -476,9 +474,12 @@ public void PatternsReduceIdentically(string pattern1, string pattern2)
[InlineData("(?:ab|cd|ae)f", "(?>ab|cd|ae)f")]
public void PatternsReduceDifferently(string pattern1, string pattern2)
{
var r1 = new Regex(pattern1);
var r2 = new Regex(pattern2);
Assert.NotEqual(GetRegexCodes(r1), GetRegexCodes(r2));
string result1 = GetRegexCodes(new Regex(pattern1));
string result2 = GetRegexCodes(new Regex(pattern2));
if (result1 == result2)
{
throw new Xunit.Sdk.EqualException(result2, result1);
}
}

[Theory]
Expand Down

0 comments on commit 9e986b5

Please sign in to comment.