@@ -515,27 +515,52 @@ private void EliminateEndingBacktracking()
515515 }
516516 }
517517
/// <summary>Whether this node may be considered to be atomic based on its ancestors.</summary>
/// <remarks>
/// This may have false negatives, meaning the node may actually be atomic even if this returns false.
/// But any true result may be relied on to mean the node will actually be considered to be atomic.
/// </remarks>
/// <returns>
/// <see langword="true"/> if walking up the tree reaches an atomic group, a lookaround, or the root
/// without passing through a construct that could backtrack into this node; otherwise <see langword="false"/>.
/// </returns>
public bool IsAtomicByParent()
{
    // Walk up the parent hierarchy. "child" is always the node on the path from this node
    // whose parent is currently being examined; positional checks must be made against it
    // rather than against "this", which is only a direct child of the very first parent.
    RegexNode child = this;
    for (RegexNode? parent = child.Next; parent is not null; child = parent, parent = child.Next)
    {
        switch (parent.Type)
        {
            case Atomic:
            case Prevent:
            case Require:
                // If the parent is atomic, so is the child. That's the whole purpose
                // of the Atomic node, and lookarounds are also implicitly atomic.
                return true;

            case Alternate:
            case Testref:
                // Skip alternations. Each branch is considered independently,
                // so any atomicity applied to the alternation also applies to
                // each individual branch. This is true as well for conditional
                // backreferences, where each of the yes/no branches is independent.
            case Testgroup when parent.Child(0) != child:
                // As with alternations, the yes/no branches of an expression conditional
                // are independent from each other, but the conditional expression itself
                // can be backtracked into from each of the branches, so we can't make
                // it atomic just because the whole conditional is.
            case Capture:
                // Skip captures. They don't affect atomicity.
            case Concatenate when parent.Child(parent.ChildCount() - 1) == child:
                // If the parent is a concatenation and the path from this node enters it
                // through its last child, any atomicity applying to the concatenation
                // applies to this node, too.
                continue;

            default:
                // For any other parent type, give up on trying to prove atomicity.
                return false;
        }
    }

    // The parent was null, so nothing can backtrack in.
    return true;
}
540565
541566 /// <summary>
@@ -2191,197 +2216,42 @@ internal bool SupportsSimplifiedCodeGenerationImplementation()
21912216 return false ;
21922217 }
21932218
2194- bool supported = false ;
2195-
2196- // We only support the default left-to-right, not right-to-left, which requires more complication in the generated code.
2197- // (Right-to-left is only employed when explicitly asked for by the developer or by lookbehind assertions.)
2198- // We also limit the recursion involved to prevent stack dives; this limitation can be removed by switching
2199- // away from a recursive implementation (done for convenience) to an iterative one that's more complicated
2200- // but within the same problems.
2201- if ( ( Options & RegexOptions . RightToLeft ) == 0 )
2219+ if ( ( Options & RegexOptions . RightToLeft ) != 0 )
22022220 {
2203- int childCount = ChildCount ( ) ;
2204- Debug . Assert ( ( Options & HasCapturesFlag ) == 0 ) ;
2205-
2206- switch ( Type )
2207- {
2208- // One/Notone/Set/Multi don't involve any repetition and are easily supported.
2209- case One :
2210- case Notone :
2211- case Set :
2212- case Multi :
2213- // Boundaries are like set checks and don't involve repetition, either.
2214- case Boundary :
2215- case NonBoundary :
2216- case ECMABoundary :
2217- case NonECMABoundary :
2218- // Anchors are also trivial.
2219- case Beginning :
2220- case Start :
2221- case Bol :
2222- case Eol :
2223- case End :
2224- case EndZ :
2225- // {Set/One/Notone}loopatomic are optimized nodes that represent non-backtracking variable-length loops.
2226- // These consume their {Set/One} inputs as long as they match, and don't give up anything they
2227- // matched, which means we can support them without backtracking.
2228- case Oneloopatomic :
2229- case Notoneloopatomic :
2230- case Setloopatomic :
2231- // "Empty" is easy: nothing is emitted for it.
2232- // "Nothing" is also easy: it doesn't match anything.
2233- // "UpdateBumpalong" doesn't match anything, it's just an optional directive to the engine.
2234- case Empty :
2235- case Nothing :
2236- case UpdateBumpalong :
2237- // Backreferences are supported
2238- case Ref :
2239- supported = true ;
2240- break ;
2241-
2242- // Conditional backreference tests are also supported, so long as both their yes/no branches are supported.
2243- case Testref :
2244- supported =
2245- Child ( 0 ) . SupportsSimplifiedCodeGenerationImplementation ( ) &&
2246- ( childCount == 1 || Child ( 1 ) . SupportsSimplifiedCodeGenerationImplementation ( ) ) ;
2247- break ;
2248-
2249- // Single character greedy/lazy loops are supported if either they're actually a repeater
2250- // or they're not contained in any construct other than simple nesting (e.g. concat, capture).
2251- case Oneloop :
2252- case Notoneloop :
2253- case Setloop :
2254- case Onelazy :
2255- case Notonelazy :
2256- case Setlazy :
2257- Debug . Assert ( Next == null || Next . Type != Atomic , "Loop should have been transformed into an atomic type." ) ;
2258- supported = M == N || AncestorsAllowBacktracking ( Next ) ;
2259- break ;
2260-
2261- // For greedy and lazy loops, they're supported if the node they wrap is supported
2262- // and either the node is actually a repeater, is atomic, or is in the tree in a
2263- // location where backtracking is allowed.
2264- case Loop :
2265- case Lazyloop :
2266- supported =
2267- ( M == N || ( Next != null && Next . Type == Atomic ) || AncestorsAllowBacktracking ( Next ) ) &&
2268- Child ( 0 ) . SupportsSimplifiedCodeGenerationImplementation ( ) ;
2269- break ;
2270-
2271- // We can handle atomic as long as its child is supported.
2272- // Lookahead assertions also only require that the child node be supported.
2273- // The RightToLeft check earlier is important to differentiate lookbehind,
2274- // which is not supported.
2275- case Atomic :
2276- case Require :
2277- case Prevent :
2278- supported = Child ( 0 ) . SupportsSimplifiedCodeGenerationImplementation ( ) ;
2279- break ;
2280-
2281- // We can handle alternates as long as they're atomic (a root / global alternate is
2282- // effectively atomic, as nothing will try to backtrack into it as it's the last thing).
2283- // Its children must all also be supported.
2284- case Alternate :
2285- if ( Next != null &&
2286- ( IsAtomicByParent ( ) || // atomic alternate
2287- ( Next . Type == Capture && Next . Next is null ) ) ) // root alternate
2288- {
2289- goto case Concatenate ;
2290- }
2291- break ;
2292-
2293- // Concatenation doesn't require backtracking as long as its children don't.
2294- case Concatenate :
2295- supported = true ;
2296- for ( int i = 0 ; i < childCount ; i ++ )
2297- {
2298- if ( ! Child ( i ) . SupportsSimplifiedCodeGenerationImplementation ( ) )
2299- {
2300- supported = false ;
2301- break ;
2302- }
2303- }
2304- break ;
2305-
2306- case Capture :
2307- supported = Child ( 0 ) . SupportsSimplifiedCodeGenerationImplementation ( ) ;
2308- if ( supported )
2309- {
2310- // Captures are currently only supported in certain places in the tree.
2311- RegexNode ? parent = Next ;
2312- while ( parent != null )
2313- {
2314- switch ( parent . Type )
2315- {
2316- case Alternate :
2317- case Atomic :
2318- case Capture :
2319- case Concatenate :
2320- case Require :
2321- parent = parent . Next ;
2322- break ;
2323-
2324- default :
2325- parent = null ;
2326- supported = false ;
2327- break ;
2328- }
2329- }
2330-
2331- // If we've found a supported capture, mark all of the nodes in its parent
2332- // hierarchy as containing a capture.
2333- if ( supported )
2334- {
2335- parent = this ;
2336- while ( parent != null && ( ( parent . Options & HasCapturesFlag ) == 0 ) )
2337- {
2338- parent . Options |= HasCapturesFlag ;
2339- parent = parent . Next ;
2340- }
2341- }
2342- }
2343- break ;
2344-
2345- case Testgroup :
2346- supported =
2347- Child ( 0 ) . SupportsSimplifiedCodeGenerationImplementation ( ) &&
2348- Child ( 1 ) . SupportsSimplifiedCodeGenerationImplementation ( ) &&
2349- ( childCount == 2 || Child ( 2 ) . SupportsSimplifiedCodeGenerationImplementation ( ) ) ;
2350- break ;
2351-
2352- default :
2353- Debug . Fail ( $ "Unknown type: { Type } ") ;
2354- supported = false ;
2355- break ;
2356- }
2221+ // RightToLeft isn't supported. That applies to both the top-level options as well as when used
2222+ // to specify positive and negative lookbehinds.
2223+ return false ;
23572224 }
2358- #if DEBUG
2359- if ( ! supported && ( Options & RegexOptions . Debug ) != 0 )
2225+
2226+ // TODO: This should be moved somewhere else, to a pass somewhere where we explicitly
2227+ // annotate the tree, potentially as part of the final optimization pass. It doesn't
2228+ // belong in this check.
2229+ switch ( Type )
23602230 {
2361- Debug . WriteLine ( $ "Unable to use non-backtracking code gen: node { Description ( ) } isn't supported.") ;
2231+ case Capture :
2232+ // If we've found a supported capture, mark all of the nodes in its parent
2233+ // hierarchy as containing a capture.
2234+ RegexNode ? parent = this ;
2235+ while ( parent != null && ( ( parent . Options & HasCapturesFlag ) == 0 ) )
2236+ {
2237+ parent . Options |= HasCapturesFlag ;
2238+ parent = parent . Next ;
2239+ }
2240+ break ;
23622241 }
2363- #endif
2364- return supported ;
23652242
2366- static bool AncestorsAllowBacktracking ( RegexNode ? node )
2243+ int childCount = ChildCount ( ) ;
2244+ for ( int i = 0 ; i < childCount ; i ++ )
23672245 {
2368- while ( node is not null )
2246+ // The node isn't supported if any of its children aren't supported.
2247+ if ( ! Child ( i ) . SupportsSimplifiedCodeGenerationImplementation ( ) )
23692248 {
2370- switch ( node . Type )
2371- {
2372- case Concatenate :
2373- case Capture :
2374- case Atomic :
2375- node = node . Next ;
2376- break ;
2377-
2378- default :
2379- return false ;
2380- }
2249+ return false ;
23812250 }
2382-
2383- return true ;
23842251 }
2252+
2253+ // Supported.
2254+ return true ;
23852255 }
23862256
23872257 /// <summary>Gets whether the node is a Set/Setloop/Setloopatomic/Setlazy node.</summary>
@@ -2393,15 +2263,19 @@ static bool AncestorsAllowBacktracking(RegexNode? node)
/// <summary>
/// Gets whether this node's type is one of Notone, Notoneloop, Notoneloopatomic, or Notonelazy.
/// </summary>
public bool IsNotoneFamily
{
    get
    {
        switch (Type)
        {
            case Notone:
            case Notoneloop:
            case Notoneloopatomic:
            case Notonelazy:
                return true;

            default:
                return false;
        }
    }
}
23952265
/// <summary>Determines whether any ancestor of this node is a Loop or Lazyloop node.</summary>
/// <returns>
/// <see langword="true"/> if a Loop or Lazyloop is found while following the Next links
/// upward from this node; otherwise <see langword="false"/>.
/// </returns>
public bool IsInLoop()
{
    // Start at the immediate parent; the node itself is never counted as its own container.
    RegexNode? ancestor = Next;
    while (ancestor is not null)
    {
        var ancestorType = ancestor.Type;
        if (ancestorType == Loop || ancestorType == Lazyloop)
        {
            return true;
        }

        ancestor = ancestor.Next;
    }

    // Reached the root without encountering a loop.
    return false;
}
24052279
24062280 private string TypeName =>
24072281 Type switch
0 commit comments