Skip to content

Commit fd1a62c

Browse files
authored
Finish new code gen approach for RegexCompiler / source generator (#62268)
* Add an ifdef to compile one source generated test at a time
* Change the ATT test theory to factor out the engine
* Add a few tests and delete some unnecessary ones
* Finish the new code gen strategy (all but RightToLeft)
* Fix mono failures (hopefully)
1 parent 3810633 commit fd1a62c

6 files changed

Lines changed: 2347 additions & 1413 deletions

File tree

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

Lines changed: 924 additions & 406 deletions
Large diffs are not rendered by default.

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs

Lines changed: 971 additions & 451 deletions
Large diffs are not rendered by default.

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

Lines changed: 78 additions & 204 deletions
Original file line numberDiff line numberDiff line change
@@ -515,27 +515,52 @@ private void EliminateEndingBacktracking()
515515
}
516516
}
517517

518-
/// <summary>Whether this node is considered to be atomic based on its parent.</summary>
518+
/// <summary>Whether this node may be considered to be atomic based on its parent.</summary>
519519
/// <remarks>
520-
/// This is used to determine whether additional atomic nodes may be valuable to
521-
/// be introduced into the tree. It should not be used to determine for sure whether
522-
/// a node will be backtracked into.
520+
/// This may have false negatives, meaning the node may actually be atomic even if this returns false.
521+
/// But any true result may be relied on to mean the node will actually be considered to be atomic.
523522
/// </remarks>
524523
public bool IsAtomicByParent()
525524
{
526-
RegexNode? next = Next;
527-
if (next is null) return false;
528-
if (next.Type == Atomic) return true;
529-
530-
// We only walk up one group as a balance between optimization and cost.
531-
if ((next.Type != Concatenate && next.Type != Capture) ||
532-
next.Child(next.ChildCount() - 1) != this)
525+
// Walk up the parent hierarchy.
526+
for (RegexNode? parent = Next; parent is not null; parent = parent.Next)
533527
{
534-
return false;
528+
switch (parent.Type)
529+
{
530+
case Atomic:
531+
case Prevent:
532+
case Require:
533+
// If the parent is atomic, so is the child. That's the whole purpose
534+
// of the Atomic node, and lookarounds are also implicitly atomic.
535+
return true;
536+
537+
case Alternate:
538+
case Testref:
539+
// Skip alternations. Each branch is considered independently,
540+
// so any atomicity applied to the alternation also applies to
541+
// each individual branch. This is true as well for conditional
542+
// backreferences, where each of the yes/no branches is independent.
543+
case Testgroup when parent.Child(0) != this:
544+
// As with alternations, the yes/no branches of an expression conditional
545+
// are independent from each other, but the conditional expression itself
546+
// can be backtracked into from each of the branches, so we can't make
547+
// it atomic just because the whole conditional is.
548+
case Capture:
549+
// Skip captures. They don't affect atomicity.
550+
case Concatenate when parent.Child(parent.ChildCount() - 1) == this:
551+
// If the parent is a concatenation and this is the last node,
552+
// any atomicity applying to the concatenation applies to this
553+
// node, too.
554+
continue;
555+
556+
default:
557+
// For any other parent type, give up on trying to prove atomicity.
558+
return false;
559+
}
535560
}
536561

537-
next = next.Next;
538-
return next != null && next.Type == Atomic;
562+
// The parent was null, so nothing can backtrack in.
563+
return true;
539564
}
540565

541566
/// <summary>
@@ -2191,197 +2216,42 @@ internal bool SupportsSimplifiedCodeGenerationImplementation()
21912216
return false;
21922217
}
21932218

2194-
bool supported = false;
2195-
2196-
// We only support the default left-to-right, not right-to-left, which requires more complication in the generated code.
2197-
// (Right-to-left is only employed when explicitly asked for by the developer or by lookbehind assertions.)
2198-
// We also limit the recursion involved to prevent stack dives; this limitation can be removed by switching
2199-
// away from a recursive implementation (done for convenience) to an iterative one that's more complicated
2200-
// but without the same problems.
2201-
if ((Options & RegexOptions.RightToLeft) == 0)
2219+
if ((Options & RegexOptions.RightToLeft) != 0)
22022220
{
2203-
int childCount = ChildCount();
2204-
Debug.Assert((Options & HasCapturesFlag) == 0);
2205-
2206-
switch (Type)
2207-
{
2208-
// One/Notone/Set/Multi don't involve any repetition and are easily supported.
2209-
case One:
2210-
case Notone:
2211-
case Set:
2212-
case Multi:
2213-
// Boundaries are like set checks and don't involve repetition, either.
2214-
case Boundary:
2215-
case NonBoundary:
2216-
case ECMABoundary:
2217-
case NonECMABoundary:
2218-
// Anchors are also trivial.
2219-
case Beginning:
2220-
case Start:
2221-
case Bol:
2222-
case Eol:
2223-
case End:
2224-
case EndZ:
2225-
// {Set/One/Notone}loopatomic are optimized nodes that represent non-backtracking variable-length loops.
2226-
// These consume their {Set/One} inputs as long as they match, and don't give up anything they
2227-
// matched, which means we can support them without backtracking.
2228-
case Oneloopatomic:
2229-
case Notoneloopatomic:
2230-
case Setloopatomic:
2231-
// "Empty" is easy: nothing is emitted for it.
2232-
// "Nothing" is also easy: it doesn't match anything.
2233-
// "UpdateBumpalong" doesn't match anything, it's just an optional directive to the engine.
2234-
case Empty:
2235-
case Nothing:
2236-
case UpdateBumpalong:
2237-
// Backreferences are supported
2238-
case Ref:
2239-
supported = true;
2240-
break;
2241-
2242-
// Conditional backreference tests are also supported, so long as both their yes/no branches are supported.
2243-
case Testref:
2244-
supported =
2245-
Child(0).SupportsSimplifiedCodeGenerationImplementation() &&
2246-
(childCount == 1 || Child(1).SupportsSimplifiedCodeGenerationImplementation());
2247-
break;
2248-
2249-
// Single character greedy/lazy loops are supported if either they're actually a repeater
2250-
// or they're not contained in any construct other than simple nesting (e.g. concat, capture).
2251-
case Oneloop:
2252-
case Notoneloop:
2253-
case Setloop:
2254-
case Onelazy:
2255-
case Notonelazy:
2256-
case Setlazy:
2257-
Debug.Assert(Next == null || Next.Type != Atomic, "Loop should have been transformed into an atomic type.");
2258-
supported = M == N || AncestorsAllowBacktracking(Next);
2259-
break;
2260-
2261-
// For greedy and lazy loops, they're supported if the node they wrap is supported
2262-
// and either the node is actually a repeater, is atomic, or is in the tree in a
2263-
// location where backtracking is allowed.
2264-
case Loop:
2265-
case Lazyloop:
2266-
supported =
2267-
(M == N || (Next != null && Next.Type == Atomic) || AncestorsAllowBacktracking(Next)) &&
2268-
Child(0).SupportsSimplifiedCodeGenerationImplementation();
2269-
break;
2270-
2271-
// We can handle atomic as long as its child is supported.
2272-
// Lookahead assertions also only require that the child node be supported.
2273-
// The RightToLeft check earlier is important to differentiate lookbehind,
2274-
// which is not supported.
2275-
case Atomic:
2276-
case Require:
2277-
case Prevent:
2278-
supported = Child(0).SupportsSimplifiedCodeGenerationImplementation();
2279-
break;
2280-
2281-
// We can handle alternates as long as they're atomic (a root / global alternate is
2282-
// effectively atomic, as nothing will try to backtrack into it as it's the last thing).
2283-
// Its children must all also be supported.
2284-
case Alternate:
2285-
if (Next != null &&
2286-
(IsAtomicByParent() || // atomic alternate
2287-
(Next.Type == Capture && Next.Next is null))) // root alternate
2288-
{
2289-
goto case Concatenate;
2290-
}
2291-
break;
2292-
2293-
// Concatenation doesn't require backtracking as long as its children don't.
2294-
case Concatenate:
2295-
supported = true;
2296-
for (int i = 0; i < childCount; i++)
2297-
{
2298-
if (!Child(i).SupportsSimplifiedCodeGenerationImplementation())
2299-
{
2300-
supported = false;
2301-
break;
2302-
}
2303-
}
2304-
break;
2305-
2306-
case Capture:
2307-
supported = Child(0).SupportsSimplifiedCodeGenerationImplementation();
2308-
if (supported)
2309-
{
2310-
// Captures are currently only supported in certain places in the tree.
2311-
RegexNode? parent = Next;
2312-
while (parent != null)
2313-
{
2314-
switch (parent.Type)
2315-
{
2316-
case Alternate:
2317-
case Atomic:
2318-
case Capture:
2319-
case Concatenate:
2320-
case Require:
2321-
parent = parent.Next;
2322-
break;
2323-
2324-
default:
2325-
parent = null;
2326-
supported = false;
2327-
break;
2328-
}
2329-
}
2330-
2331-
// If we've found a supported capture, mark all of the nodes in its parent
2332-
// hierarchy as containing a capture.
2333-
if (supported)
2334-
{
2335-
parent = this;
2336-
while (parent != null && ((parent.Options & HasCapturesFlag) == 0))
2337-
{
2338-
parent.Options |= HasCapturesFlag;
2339-
parent = parent.Next;
2340-
}
2341-
}
2342-
}
2343-
break;
2344-
2345-
case Testgroup:
2346-
supported =
2347-
Child(0).SupportsSimplifiedCodeGenerationImplementation() &&
2348-
Child(1).SupportsSimplifiedCodeGenerationImplementation() &&
2349-
(childCount == 2 || Child(2).SupportsSimplifiedCodeGenerationImplementation());
2350-
break;
2351-
2352-
default:
2353-
Debug.Fail($"Unknown type: {Type}");
2354-
supported = false;
2355-
break;
2356-
}
2221+
// RightToLeft isn't supported. That applies to both the top-level options as well as when used
2222+
// to specify positive and negative lookbehinds.
2223+
return false;
23572224
}
2358-
#if DEBUG
2359-
if (!supported && (Options & RegexOptions.Debug) != 0)
2225+
2226+
// TODO: This should be moved somewhere else, to a pass somewhere where we explicitly
2227+
// annotate the tree, potentially as part of the final optimization pass. It doesn't
2228+
// belong in this check.
2229+
switch (Type)
23602230
{
2361-
Debug.WriteLine($"Unable to use non-backtracking code gen: node {Description()} isn't supported.");
2231+
case Capture:
2232+
// If we've found a supported capture, mark all of the nodes in its parent
2233+
// hierarchy as containing a capture.
2234+
RegexNode? parent = this;
2235+
while (parent != null && ((parent.Options & HasCapturesFlag) == 0))
2236+
{
2237+
parent.Options |= HasCapturesFlag;
2238+
parent = parent.Next;
2239+
}
2240+
break;
23622241
}
2363-
#endif
2364-
return supported;
23652242

2366-
static bool AncestorsAllowBacktracking(RegexNode? node)
2243+
int childCount = ChildCount();
2244+
for (int i = 0; i < childCount; i++)
23672245
{
2368-
while (node is not null)
2246+
// The node isn't supported if any of its children aren't supported.
2247+
if (!Child(i).SupportsSimplifiedCodeGenerationImplementation())
23692248
{
2370-
switch (node.Type)
2371-
{
2372-
case Concatenate:
2373-
case Capture:
2374-
case Atomic:
2375-
node = node.Next;
2376-
break;
2377-
2378-
default:
2379-
return false;
2380-
}
2249+
return false;
23812250
}
2382-
2383-
return true;
23842251
}
2252+
2253+
// Supported.
2254+
return true;
23852255
}
23862256

23872257
/// <summary>Gets whether the node is a Set/Setloop/Setloopatomic/Setlazy node.</summary>
@@ -2393,15 +2263,19 @@ static bool AncestorsAllowBacktracking(RegexNode? node)
23932263
/// <summary>Gets whether the node is a Notone/Notoneloop/Notoneloopatomic/Notonelazy node.</summary>
23942264
public bool IsNotoneFamily => Type is Notone or Notoneloop or Notoneloopatomic or Notonelazy;
23952265

2396-
/// <summary>Gets whether this node may be a source of backtracking.</summary>
2397-
public bool InstigatesBacktracking =>
2398-
Type switch
2266+
/// <summary>Gets whether this node is contained inside of a loop.</summary>
2267+
public bool IsInLoop()
2268+
{
2269+
for (RegexNode? parent = Next; parent is not null; parent = parent.Next)
23992270
{
2400-
Oneloop or Notoneloop or Setloop or Onelazy or Notonelazy or Setlazy or Loop or Lazyloop when !IsAtomicByParent() && M != N => true,
2401-
Alternate => !IsAtomicByParent(),
2402-
Ref or Testref or Testgroup => true,
2403-
_ => false,
2404-
};
2271+
if (parent.Type is Loop or Lazyloop)
2272+
{
2273+
return true;
2274+
}
2275+
}
2276+
2277+
return false;
2278+
}
24052279

24062280
private string TypeName =>
24072281
Type switch

0 commit comments

Comments
 (0)