Skip to content

Commit 45671cd

Browse files
committed
Rework timeouts and include in all phases
1 parent 239f550 commit 45671cd

3 files changed

Lines changed: 116 additions & 70 deletions

File tree

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Automata.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ internal sealed partial class SymbolicRegexMatcher<TSet>
8282
private int[]?[] _nfaDelta = Array.Empty<int[]>();
8383

8484
/// <summary>
85-
/// The transition function for <see cref="FindSubcaptures(ReadOnlySpan{char}, int, int, PerThreadData)"/>,
85+
/// The transition function for <see cref="FindSubcaptures(ReadOnlySpan{char}, int, long, int, PerThreadData)"/>,
8686
/// which is an NFA mode with additional state to track capture start and end positions.
8787
/// Each entry is an array of pairs of target state and effects to be applied when taking the transition.
8888
/// If the entry is null then the transition has not been computed yet.

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs

Lines changed: 95 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -390,10 +390,10 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
390390
matchStart = matchEnd < startat ?
391391
startat : (_pattern._info.ContainsLineAnchor, _pattern._info.ContainsSomeAnchor) switch
392392
{
393-
(true, true) => FindStartPosition<FullInputReader, FullNullabilityHandler>(input, matchEnd, matchStartLowBoundary, perThreadData),
394-
(true, false) => FindStartPosition<FullInputReader, NoAnchorsNullabilityHandler>(input, matchEnd, matchStartLowBoundary, perThreadData),
395-
(false, true) => FindStartPosition<NoZAnchorInputReader, FullNullabilityHandler>(input, matchEnd, matchStartLowBoundary, perThreadData),
396-
(false, false) => FindStartPosition<NoZAnchorInputReader, NoAnchorsNullabilityHandler>(input, matchEnd, matchStartLowBoundary, perThreadData),
393+
(true, true) => FindStartPosition<FullInputReader, FullNullabilityHandler>(input, matchEnd, timeoutOccursAt, matchStartLowBoundary, perThreadData),
394+
(true, false) => FindStartPosition<FullInputReader, NoAnchorsNullabilityHandler>(input, matchEnd, timeoutOccursAt, matchStartLowBoundary, perThreadData),
395+
(false, true) => FindStartPosition<NoZAnchorInputReader, FullNullabilityHandler>(input, matchEnd, timeoutOccursAt, matchStartLowBoundary, perThreadData),
396+
(false, false) => FindStartPosition<NoZAnchorInputReader, NoAnchorsNullabilityHandler>(input, matchEnd, timeoutOccursAt, matchStartLowBoundary, perThreadData),
397397
};
398398
}
399399

@@ -409,8 +409,8 @@ public SymbolicMatch FindMatch(RegexRunnerMode mode, ReadOnlySpan<char> input, i
409409
else
410410
{
411411
Registers endRegisters = _pattern._info.ContainsLineAnchor ?
412-
FindSubcaptures<FullInputReader>(input, matchStart, matchEnd, perThreadData) :
413-
FindSubcaptures<NoZAnchorInputReader>(input, matchStart, matchEnd, perThreadData);
412+
FindSubcaptures<FullInputReader>(input, matchStart, timeoutOccursAt, matchEnd, perThreadData) :
413+
FindSubcaptures<NoZAnchorInputReader>(input, matchStart, timeoutOccursAt, matchEnd, perThreadData);
414414
return new SymbolicMatch(matchStart, matchEnd - matchStart, endRegisters.CaptureStarts, endRegisters.CaptureEnds);
415415
}
416416
}
@@ -442,14 +442,9 @@ private int FindEndPosition<TInputReader, TFindOptimizationsHandler, TNullabilit
442442
while (true)
443443
{
444444
// Now run the DFA or NFA traversal from the current point using the current state. If timeouts are being checked,
445-
// we need to pop out of the inner loop every now and then to do the timeout check in this outer loop. Note that
446-
// the timeout exists not to provide perfect guarantees around execution time but rather as a mitigation against
447-
// catastrophic backtracking. Catastrophic backtracking is not an issue for the NonBacktracking engine, but we
448-
// still check the timeout now and again to provide some semblance of the behavior a developer experiences with
449-
// the backtracking engines. We can, however, choose a large number here, since it's not actually needed for security.
450-
const int CharsPerTimeoutCheck = 1_000;
451-
int innerLoopLength = _checkTimeout && input.Length - pos > CharsPerTimeoutCheck ?
452-
pos + CharsPerTimeoutCheck :
445+
// we need to pop out of the inner loop every now and then to do the timeout check in this outer loop.
446+
int innerLoopLength = _checkTimeout && input.Length - pos > SymbolicRegexThresholds.CharsPerTimeoutCheckStartEnd ?
447+
pos + SymbolicRegexThresholds.CharsPerTimeoutCheckStartEnd :
453448
input.Length;
454449

455450
bool done = currentState.NfaState is not null ?
@@ -591,10 +586,11 @@ private bool FindEndPositionDeltas<TStateHandler, TInputReader, TFindOptimizatio
591586
/// </remarks>
592587
/// <param name="input">The input text.</param>
593588
/// <param name="i">The ending position to walk backwards from. <paramref name="i"/> points one past the last character of the match.</param>
589+
/// <param name="timeoutOccursAt">The time at which timeout occurs, if timeouts are being checked.</param>
594590
/// <param name="matchStartBoundary">The initial starting location discovered in phase 1, a point we must not walk earlier than.</param>
595591
/// <param name="perThreadData">Per thread data reused between calls.</param>
596592
/// <returns>The found starting position for the match.</returns>
597-
private int FindStartPosition<TInputReader, TNullabilityHandler>(ReadOnlySpan<char> input, int i, int matchStartBoundary, PerThreadData perThreadData)
593+
private int FindStartPosition<TInputReader, TNullabilityHandler>(ReadOnlySpan<char> input, int i, long timeoutOccursAt, int matchStartBoundary, PerThreadData perThreadData)
598594
where TInputReader : struct, IInputReader
599595
where TNullabilityHandler : struct, INullabilityHandler
600596
{
@@ -611,24 +607,40 @@ private int FindStartPosition<TInputReader, TNullabilityHandler>(ReadOnlySpan<ch
611607
// Walk backwards to the furthest accepting state of the reverse pattern but no earlier than matchStartBoundary.
612608
while (true)
613609
{
614-
// Run the DFA or NFA traversal backwards from the current point using the current state.
610+
// Run the DFA or NFA traversal backwards from the current point using the current state. If timeouts are being checked,
611+
// we need to pop out of the inner loop every now and then to do the timeout check in this outer loop.
612+
int innerLoopStartBoundary = _checkTimeout && i - matchStartBoundary > SymbolicRegexThresholds.CharsPerTimeoutCheckStartEnd ?
613+
i - SymbolicRegexThresholds.CharsPerTimeoutCheckStartEnd :
614+
matchStartBoundary;
615+
615616
bool done = currentState.NfaState is not null ?
616-
FindStartPositionDeltas<NfaStateHandler, TInputReader, TNullabilityHandler>(input, ref i, matchStartBoundary, ref currentState, ref lastStart) :
617-
FindStartPositionDeltas<DfaStateHandler, TInputReader, TNullabilityHandler>(input, ref i, matchStartBoundary, ref currentState, ref lastStart);
617+
FindStartPositionDeltas<NfaStateHandler, TInputReader, TNullabilityHandler>(input, ref i, innerLoopStartBoundary, ref currentState, ref lastStart) :
618+
FindStartPositionDeltas<DfaStateHandler, TInputReader, TNullabilityHandler>(input, ref i, innerLoopStartBoundary, ref currentState, ref lastStart);
618619

619-
// If we found the starting position, we're done.
620-
if (done)
620+
// If the inner loop indicates that the search finished (for example due to reaching a deadend state) or
621+
// there is no more input available, then the whole search is done.
622+
if (done || i <= matchStartBoundary)
621623
{
622624
break;
623625
}
624626

625-
// We didn't find the starting position but we did exit out of the backwards traversal. That should only happen
626-
// if we were unable to transition, which should only happen if we were in DFA mode and exceeded our graph size.
627-
// Upgrade to NFA mode and continue.
628-
Debug.Assert(i >= matchStartBoundary);
629-
NfaMatchingState nfaState = perThreadData.NfaState;
630-
nfaState.InitializeFrom(this, GetState(currentState.DfaStateId));
631-
currentState = new CurrentState(nfaState);
627+
// The search did not finish, so we either failed to transition (which should only happen if we were in DFA mode and
628+
// need to switch over to NFA mode) or ran out of input in the inner loop. Check if the inner loop still had more
629+
// input available.
630+
if (i > innerLoopStartBoundary)
631+
{
632+
// Because there was still more input available, a failure to transition in DFA mode must be the cause
633+
// of the early exit. Upgrade to NFA mode.
634+
NfaMatchingState nfaState = perThreadData.NfaState;
635+
nfaState.InitializeFrom(this, GetState(currentState.DfaStateId));
636+
currentState = new CurrentState(nfaState);
637+
}
638+
639+
// Check for a timeout before continuing.
640+
if (_checkTimeout)
641+
{
642+
CheckTimeout(timeoutOccursAt);
643+
}
632644
}
633645

634646
Debug.Assert(lastStart != -1, "We expected to find a starting position but didn't.");
@@ -663,18 +675,17 @@ private bool FindStartPositionDeltas<TStateHandler, TInputReader, TNullabilityHa
663675
lastStart = pos;
664676
}
665677

666-
// If we are past the start threshold or if the state is a dead end, bail; we should have already
667-
// found a valid starting location.
668-
if (pos <= startThreshold || flags.IsDeadend())
678+
// If the state is a dead end, bail; we should have already found a valid starting location.
679+
if (flags.IsDeadend())
669680
{
670681
Debug.Assert(lastStart != -1);
671682
return true;
672683
}
673684

674-
// Try to transition with the next character, the one before the current position.
675-
if (!TStateHandler.TryTakeTransition(this, ref state, positionId))
685+
// If we are not past the start threshold, try to transition with the next character, the one before the current position.
686+
if (pos <= startThreshold || !TStateHandler.TryTakeTransition(this, ref state, positionId))
676687
{
677-
// Return false to indicate the search didn't finish.
688+
// Return false to indicate the search possibly didn't finish.
678689
return false;
679690
}
680691

@@ -693,10 +704,11 @@ private bool FindStartPositionDeltas<TStateHandler, TInputReader, TNullabilityHa
693704
/// <summary>Run the pattern on a match to record the capture starts and ends.</summary>
694705
/// <param name="input">input span</param>
695706
/// <param name="i">inclusive start position</param>
707+
/// <param name="timeoutOccursAt">The time at which timeout occurs, if timeouts are being checked.</param>
696708
/// <param name="iEnd">exclusive end position</param>
697709
/// <param name="perThreadData">Per thread data reused between calls.</param>
698710
/// <returns>the final register values, which indicate capture starts and ends</returns>
699-
private Registers FindSubcaptures<TInputReader>(ReadOnlySpan<char> input, int i, int iEnd, PerThreadData perThreadData)
711+
private Registers FindSubcaptures<TInputReader>(ReadOnlySpan<char> input, int i, long timeoutOccursAt, int iEnd, PerThreadData perThreadData)
700712
where TInputReader : struct, IInputReader
701713
{
702714
// Pick the correct start state based on previous character kind.
@@ -722,54 +734,68 @@ private Registers FindSubcaptures<TInputReader>(ReadOnlySpan<char> input, int i,
722734

723735
while ((uint)i < (uint)iEnd)
724736
{
725-
Debug.Assert(next.Count == 0);
737+
// If timeouts are being checked, we need to pop out of the inner loop every now and then to do the timeout check in this outer loop.
738+
int innerLoopEnd = _checkTimeout && iEnd - i > SymbolicRegexThresholds.CharsPerTimeoutCheckSubcaptures ?
739+
i + SymbolicRegexThresholds.CharsPerTimeoutCheckSubcaptures :
740+
iEnd;
726741

727-
// i is guaranteed to be within bounds, so the position ID is a minterm ID
728-
int mintermId = TInputReader.GetPositionId(this, input, i);
729-
730-
foreach ((int sourceId, Registers sourceRegisters) in current.Values)
742+
while ((uint)i < (uint)innerLoopEnd)
731743
{
732-
// Get or create the transitions
733-
int offset = DeltaOffset(sourceId, mintermId);
734-
(int, DerivativeEffect[])[] transitions = _capturingNfaDelta[offset] ??
735-
CreateNewCapturingTransition(sourceId, mintermId, offset);
744+
Debug.Assert(next.Count == 0);
745+
746+
// i is guaranteed to be within bounds, so the position ID is a minterm ID
747+
int mintermId = TInputReader.GetPositionId(this, input, i);
736748

737-
// Take the transitions in their prioritized order
738-
for (int j = 0; j < transitions.Length; ++j)
749+
foreach ((int sourceId, Registers sourceRegisters) in current.Values)
739750
{
740-
(int targetStateId, DerivativeEffect[] effects) = transitions[j];
751+
// Get or create the transitions
752+
int offset = DeltaOffset(sourceId, mintermId);
753+
(int, DerivativeEffect[])[] transitions = _capturingNfaDelta[offset] ??
754+
CreateNewCapturingTransition(sourceId, mintermId, offset);
741755

742-
// Try to add the state and handle the case where it didn't exist before. If the state already
743-
// exists, then the transition can be safely ignored, as the existing state was generated by a
744-
// higher priority transition.
745-
if (next.Add(targetStateId, out int index))
756+
// Take the transitions in their prioritized order
757+
for (int j = 0; j < transitions.Length; ++j)
746758
{
747-
// Avoid copying the registers on the last transition from this state, reusing the registers instead
748-
Registers newRegisters = j != transitions.Length - 1 ? sourceRegisters.Clone() : sourceRegisters;
749-
newRegisters.ApplyEffects(effects, i);
750-
next.Update(index, targetStateId, newRegisters);
759+
(int targetStateId, DerivativeEffect[] effects) = transitions[j];
751760

752-
int coreStateId = GetCoreStateId(targetStateId);
753-
StateFlags flags = _stateFlagsArray[coreStateId];
754-
Debug.Assert(!flags.IsDeadend());
755-
756-
if (flags.IsNullable() || (flags.CanBeNullable() && GetState(coreStateId).IsNullableFor(GetCharKind<TInputReader>(input, i + 1))))
761+
// Try to add the state and handle the case where it didn't exist before. If the state already
762+
// exists, then the transition can be safely ignored, as the existing state was generated by a
763+
// higher priority transition.
764+
if (next.Add(targetStateId, out int index))
757765
{
758-
// No lower priority transitions from this or other source states are taken because the
759-
// backtracking engines would return the match ending here.
760-
goto BreakNullable;
766+
// Avoid copying the registers on the last transition from this state, reusing the registers instead
767+
Registers newRegisters = j != transitions.Length - 1 ? sourceRegisters.Clone() : sourceRegisters;
768+
newRegisters.ApplyEffects(effects, i);
769+
next.Update(index, targetStateId, newRegisters);
770+
771+
int coreStateId = GetCoreStateId(targetStateId);
772+
StateFlags flags = _stateFlagsArray[coreStateId];
773+
Debug.Assert(!flags.IsDeadend());
774+
775+
if (flags.IsNullable() || (flags.CanBeNullable() && GetState(coreStateId).IsNullableFor(GetCharKind<TInputReader>(input, i + 1))))
776+
{
777+
// No lower priority transitions from this or other source states are taken because the
778+
// backtracking engines would return the match ending here.
779+
goto BreakNullable;
780+
}
761781
}
762782
}
763783
}
784+
785+
BreakNullable:
786+
// Swap the state sets and prepare for the next character
787+
SparseIntMap<Registers> tmp = current;
788+
current = next;
789+
next = tmp;
790+
next.Clear();
791+
i++;
764792
}
765793

766-
BreakNullable:
767-
// Swap the state sets and prepare for the next character
768-
SparseIntMap<Registers> tmp = current;
769-
current = next;
770-
next = tmp;
771-
next.Clear();
772-
i++;
794+
// Check for a timeout before continuing.
795+
if (_checkTimeout)
796+
{
797+
CheckTimeout(timeoutOccursAt);
798+
}
773799
}
774800

775801
Debug.Assert(current.Count > 0);

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexThresholds.cs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,5 +58,25 @@ internal static int GetSymbolicRegexSafeSizeThreshold()
5858
DefaultSymbolicRegexSafeSizeThreshold :
5959
safeSizeThresholdInt;
6060
}
61+
62+
/// <summary>
63+
/// The number of characters the match start and end finding phases loop over between timeout checks.
64+
/// </summary>
65+
/// <remarks>
66+
/// The timeout exists not to provide perfect guarantees around execution time but rather as a mitigation against
67+
/// catastrophic backtracking. Catastrophic backtracking is not an issue for the NonBacktracking engine, but we
68+
/// still check the timeout now and again to provide some semblance of the behavior a developer experiences with
69+
/// the backtracking engines. We can, however, choose a large number here, since it's not actually needed for security.
70+
/// </remarks>
71+
internal const int CharsPerTimeoutCheckStartEnd = 1_000;
72+
73+
/// <summary>
74+
/// The number of characters the subcapture finding phase loops over between timeout checks.
75+
/// </summary>
76+
/// <remarks>
77+
/// Finding subcaptures is slower than finding the start and end of the top level match. To give better accuracy this
78+
/// number is set lower than <see cref="CharsPerTimeoutCheckStartEnd"/>.
79+
/// </remarks>
80+
internal const int CharsPerTimeoutCheckSubcaptures = 100;
6181
}
6282
}

0 commit comments

Comments
 (0)