diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index aa30b754bc1dcf..41f24fc3a24792 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -105,7 +105,7 @@ static uint ComputeStringHash(string s) /// Gets whether a given regular expression method is supported by the code generator. private static bool SupportsCodeGeneration(RegexMethod rm) { - RegexNode root = rm.Code.Tree.Root; + RegexNode root = rm.Tree.Root; if (!root.SupportsCompilation()) { @@ -170,7 +170,7 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri return ImmutableArray.Create(Diagnostic.Create(DiagnosticDescriptors.LimitedSourceGeneration, rm.MethodSyntax.GetLocation())); } - AnalysisResults analysis = RegexTreeAnalyzer.Analyze(rm.Code); + AnalysisResults analysis = RegexTreeAnalyzer.Analyze(rm.Tree); writer.WriteLine($"new {id}();"); writer.WriteLine(); @@ -180,23 +180,23 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri writer.WriteLine($" base.roptions = {optionsExpression};"); writer.WriteLine($" base.internalMatchTimeout = {timeoutExpression};"); writer.WriteLine($" base.factory = new RunnerFactory();"); - if (rm.Code.Caps is not null) + if (rm.Tree.CaptureNumberSparseMapping is not null) { writer.Write(" base.Caps = new global::System.Collections.Hashtable {"); - AppendHashtableContents(writer, rm.Code.Caps); + AppendHashtableContents(writer, rm.Tree.CaptureNumberSparseMapping); writer.WriteLine(" };"); } - if (rm.Code.Tree.CapNames is not null) + if (rm.Tree.CaptureNameToNumberMapping is not null) { writer.Write(" base.CapNames = new global::System.Collections.Hashtable {"); - AppendHashtableContents(writer, rm.Code.Tree.CapNames); + AppendHashtableContents(writer, rm.Tree.CaptureNameToNumberMapping); writer.WriteLine(" 
};"); } - if (rm.Code.Tree.CapsList is not null) + if (rm.Tree.CaptureNames is not null) { writer.Write(" base.capslist = new string[] {"); string separator = ""; - foreach (string s in rm.Code.Tree.CapsList) + foreach (string s in rm.Tree.CaptureNames) { writer.Write(separator); writer.Write(Literal(s)); @@ -204,7 +204,7 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri } writer.WriteLine(" };"); } - writer.WriteLine($" base.capsize = {rm.Code.CapSize};"); + writer.WriteLine($" base.capsize = {rm.Tree.CaptureCount};"); writer.WriteLine($" }}"); writer.WriteLine(" "); writer.WriteLine($" private sealed class RunnerFactory : global::System.Text.RegularExpressions.RegexRunnerFactory"); @@ -216,7 +216,7 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri // Main implementation methods writer.WriteLine(" // Description:"); - DescribeExpression(writer, rm.Code.Tree.Root.Child(0), " // ", analysis); // skip implicit root capture + DescribeExpression(writer, rm.Tree.Root.Child(0), " // ", analysis); // skip implicit root capture writer.WriteLine(); writer.WriteLine($" protected override void Scan(global::System.ReadOnlySpan text)"); @@ -365,7 +365,7 @@ private static void EmitScan(IndentedTextWriter writer, RegexMethod rm, string i private static RequiredHelperFunctions EmitTryFindNextPossibleStartingPosition(IndentedTextWriter writer, RegexMethod rm, string id) { RegexOptions options = (RegexOptions)rm.Options; - RegexCode code = rm.Code; + RegexTree regexTree = rm.Tree; bool hasTextInfo = false; RequiredHelperFunctions requiredHelpers = RequiredHelperFunctions.None; @@ -384,7 +384,7 @@ private static RequiredHelperFunctions EmitTryFindNextPossibleStartingPosition(I // Generate length check. If the input isn't long enough to possibly match, fail quickly. // It's rare for min required length to be 0, so we don't bother special-casing the check, // especially since we want the "return false" code regardless. 
- int minRequiredLength = rm.Code.Tree.MinRequiredLength; + int minRequiredLength = rm.Tree.FindOptimizations.MinRequiredLength; Debug.Assert(minRequiredLength >= 0); string clause = minRequiredLength switch { @@ -405,28 +405,28 @@ private static RequiredHelperFunctions EmitTryFindNextPossibleStartingPosition(I EmitTextInfo(writer, ref hasTextInfo, rm); // Emit the code for whatever find mode has been determined. - switch (code.FindOptimizations.FindMode) + switch (regexTree.FindOptimizations.FindMode) { case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive: - Debug.Assert(!string.IsNullOrEmpty(code.FindOptimizations.LeadingCaseSensitivePrefix)); - EmitIndexOf(code.FindOptimizations.LeadingCaseSensitivePrefix); + Debug.Assert(!string.IsNullOrEmpty(regexTree.FindOptimizations.LeadingCaseSensitivePrefix)); + EmitIndexOf(regexTree.FindOptimizations.LeadingCaseSensitivePrefix); break; case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive: case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive: case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive: case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive: - Debug.Assert(code.FindOptimizations.FixedDistanceSets is { Count: > 0 }); + Debug.Assert(regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 }); EmitFixedSet(); break; case FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight_CaseSensitive: - Debug.Assert(code.FindOptimizations.LiteralAfterLoop is not null); + Debug.Assert(regexTree.FindOptimizations.LiteralAfterLoop is not null); EmitLiteralAfterAtomicLoop(); break; default: - Debug.Fail($"Unexpected mode: {code.FindOptimizations.FindMode}"); + Debug.Fail($"Unexpected mode: {regexTree.FindOptimizations.FindMode}"); goto case FindNextStartingPositionMode.NoSearch; case FindNextStartingPositionMode.NoSearch: @@ -455,7 +455,7 @@ private static RequiredHelperFunctions EmitTryFindNextPossibleStartingPosition(I bool 
EmitAnchors() { // Anchors that fully implement TryFindNextPossibleStartingPosition, with a check that leads to immediate success or failure determination. - switch (code.FindOptimizations.FindMode) + switch (regexTree.FindOptimizations.FindMode) { case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning: writer.WriteLine("// Beginning \\A anchor"); @@ -497,9 +497,9 @@ bool EmitAnchors() case FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ: // Jump to the end, minus the min required length, which in this case is actually the fixed length, minus 1 (for a possible ending \n). writer.WriteLine("// Trailing end \\Z anchor with fixed-length match"); - using (EmitBlock(writer, $"if (pos < end - {code.Tree.MinRequiredLength + 1})")) + using (EmitBlock(writer, $"if (pos < end - {regexTree.FindOptimizations.MinRequiredLength + 1})")) { - writer.WriteLine($"base.runtextpos = end - {code.Tree.MinRequiredLength + 1};"); + writer.WriteLine($"base.runtextpos = end - {regexTree.FindOptimizations.MinRequiredLength + 1};"); } writer.WriteLine("return true;"); return true; @@ -507,9 +507,9 @@ bool EmitAnchors() case FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_End: // Jump to the end, minus the min required length, which in this case is actually the fixed length. writer.WriteLine("// Trailing end \\z anchor with fixed-length match"); - using (EmitBlock(writer, $"if (pos < end - {code.Tree.MinRequiredLength})")) + using (EmitBlock(writer, $"if (pos < end - {regexTree.FindOptimizations.MinRequiredLength})")) { - writer.WriteLine($"base.runtextpos = end - {code.Tree.MinRequiredLength};"); + writer.WriteLine($"base.runtextpos = end - {regexTree.FindOptimizations.MinRequiredLength};"); } writer.WriteLine("return true;"); return true; @@ -517,7 +517,7 @@ bool EmitAnchors() // Now handle anchors that boost the position but may not determine immediate success or failure. 
- switch (code.FindOptimizations.LeadingAnchor) + switch (regexTree.FindOptimizations.LeadingAnchor) { case RegexNodeKind.Bol: // Optimize the handling of a Beginning-Of-Line (BOL) anchor. BOL is special, in that unlike @@ -539,9 +539,9 @@ bool EmitAnchors() break; } - switch (code.FindOptimizations.TrailingAnchor) + switch (regexTree.FindOptimizations.TrailingAnchor) { - case RegexNodeKind.End when code.FindOptimizations.MaxPossibleLength is int maxLength: + case RegexNodeKind.End when regexTree.FindOptimizations.MaxPossibleLength is int maxLength: writer.WriteLine("// End \\z anchor with maximum-length match"); using (EmitBlock(writer, $"if (pos < end - {maxLength})")) { @@ -550,7 +550,7 @@ bool EmitAnchors() writer.WriteLine(); break; - case RegexNodeKind.EndZ when code.FindOptimizations.MaxPossibleLength is int maxLength: + case RegexNodeKind.EndZ when regexTree.FindOptimizations.MaxPossibleLength is int maxLength: writer.WriteLine("// End \\Z anchor with maximum-length match"); using (EmitBlock(writer, $"if (pos < end - {maxLength + 1})")) { @@ -578,7 +578,7 @@ void EmitIndexOf(string prefix) // and potentially other sets at other fixed positions in the pattern. void EmitFixedSet() { - List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? sets = code.FindOptimizations.FixedDistanceSets; + List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? sets = regexTree.FindOptimizations.FixedDistanceSets; (char[]? Chars, string Set, int Distance, bool CaseInsensitive) primarySet = sets![0]; const int MaxSets = 4; int setsToUse = Math.Min(sets.Count, MaxSets); @@ -693,8 +693,8 @@ void EmitFixedSet() // Emits a search for a literal following a leading atomic single-character loop. void EmitLiteralAfterAtomicLoop() { - Debug.Assert(code.FindOptimizations.LiteralAfterLoop is not null); - (RegexNode LoopNode, (char Char, string? String, char[]? 
Chars) Literal) target = code.FindOptimizations.LiteralAfterLoop.Value; + Debug.Assert(regexTree.FindOptimizations.LiteralAfterLoop is not null); + (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal) target = regexTree.FindOptimizations.LiteralAfterLoop.Value; Debug.Assert(target.LoopNode.Kind is RegexNodeKind.Setloop or RegexNodeKind.Setlazy or RegexNodeKind.Setloopatomic); Debug.Assert(target.LoopNode.N == int.MaxValue); @@ -751,13 +751,13 @@ static void EmitTextInfo(IndentedTextWriter writer, ref bool hasTextInfo, RegexM // Emit local to store current culture if needed if ((rm.Options & RegexOptions.CultureInvariant) == 0) { - bool needsCulture = rm.Code.FindOptimizations.FindMode switch + bool needsCulture = rm.Tree.FindOptimizations.FindMode switch { FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseInsensitive or FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive => true, - _ when rm.Code.FindOptimizations.FixedDistanceSets is List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)> sets => sets.Exists(set => set.CaseInsensitive), + _ when rm.Tree.FindOptimizations.FixedDistanceSets is List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)> sets => sets.Exists(set => set.CaseInsensitive), _ => false, }; @@ -799,7 +799,7 @@ private static RequiredHelperFunctions EmitTryMatchAtCurrentPosition(IndentedTex const int MaxUnrollSize = 16; RegexOptions options = (RegexOptions)rm.Options; - RegexCode code = rm.Code; + RegexTree regexTree = rm.Tree; RequiredHelperFunctions requiredHelpers = RequiredHelperFunctions.None; // Helper to define names. Names start unadorned, but as soon as there's repetition, @@ -808,7 +808,7 @@ private static RequiredHelperFunctions EmitTryMatchAtCurrentPosition(IndentedTex // Every RegexTree is rooted in the implicit Capture for the whole expression. // Skip the Capture node. 
We handle the implicit root capture specially. - RegexNode node = code.Tree.Root; + RegexNode node = regexTree.Root; Debug.Assert(node.Kind == RegexNodeKind.Capture, "Every generated tree should begin with a capture node"); Debug.Assert(node.ChildCount() == 1, "Capture nodes should have one child"); node = node.Child(0); @@ -847,7 +847,7 @@ private static RequiredHelperFunctions EmitTryMatchAtCurrentPosition(IndentedTex writer.WriteLine("int pos = base.runtextpos, end = base.runtextend;"); writer.WriteLine($"int original_pos = pos;"); bool hasTimeout = EmitLoopTimeoutCounterIfNeeded(writer, rm); - bool hasTextInfo = EmitInitializeCultureForTryMatchAtCurrentPositionIfNecessary(writer, rm); + bool hasTextInfo = EmitInitializeCultureForTryMatchAtCurrentPositionIfNecessary(writer, rm, analysis); writer.Flush(); int additionalDeclarationsPosition = ((StringWriter)writer.InnerWriter).GetStringBuilder().Length; int additionalDeclarationsIndent = writer.Indent; @@ -1365,7 +1365,7 @@ void EmitBackreference(RegexNode node) { Debug.Assert(node.Kind is RegexNodeKind.Backreference, $"Unexpected type: {node.Kind}"); - int capnum = RegexParser.MapCaptureNumber(node.M, rm.Code.Caps); + int capnum = RegexParser.MapCaptureNumber(node.M, rm.Tree.CaptureNumberSparseMapping); if (sliceStaticPos > 0) { @@ -1447,7 +1447,7 @@ void EmitBackreferenceConditional(RegexNode node) TransferSliceStaticPosToPos(); // Get the capture number to test. - int capnum = RegexParser.MapCaptureNumber(node.M, rm.Code.Caps); + int capnum = RegexParser.MapCaptureNumber(node.M, rm.Tree.CaptureNumberSparseMapping); // Get the "yes" branch and the "no" branch. The "no" branch is optional in syntax and is thus // somewhat likely to be Empty. @@ -1758,8 +1758,8 @@ void EmitCapture(RegexNode node, RegexNode? 
subsequent = null) Debug.Assert(node.Kind is RegexNodeKind.Capture, $"Unexpected type: {node.Kind}"); Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}"); - int capnum = RegexParser.MapCaptureNumber(node.M, rm.Code.Caps); - int uncapnum = RegexParser.MapCaptureNumber(node.N, rm.Code.Caps); + int capnum = RegexParser.MapCaptureNumber(node.M, rm.Tree.CaptureNumberSparseMapping); + int uncapnum = RegexParser.MapCaptureNumber(node.N, rm.Tree.CaptureNumberSparseMapping); bool isAtomic = analysis.IsAtomicByAncestor(node); TransferSliceStaticPosToPos(); @@ -3415,29 +3415,12 @@ private static void EmitTimeoutCheck(IndentedTextWriter writer, bool hasTimeout) } } - private static bool EmitInitializeCultureForTryMatchAtCurrentPositionIfNecessary(IndentedTextWriter writer, RegexMethod rm) + private static bool EmitInitializeCultureForTryMatchAtCurrentPositionIfNecessary(IndentedTextWriter writer, RegexMethod rm, AnalysisResults analysis) { - if (((RegexOptions)rm.Options & RegexOptions.CultureInvariant) == 0) + if (analysis.HasIgnoreCase && ((RegexOptions)rm.Options & RegexOptions.CultureInvariant) == 0) { - bool needsCulture = ((RegexOptions)rm.Options & RegexOptions.IgnoreCase) != 0; - if (!needsCulture) - { - int[] codes = rm.Code.Codes; - for (int codepos = 0; codepos < codes.Length; codepos += RegexCode.OpcodeSize((RegexOpcode)codes[codepos])) - { - if (((RegexOpcode)codes[codepos] & RegexOpcode.CaseInsensitive) == RegexOpcode.CaseInsensitive) - { - needsCulture = true; - break; - } - } - } - - if (needsCulture) - { - writer.WriteLine("global::System.Globalization.TextInfo textInfo = global::System.Globalization.CultureInfo.CurrentCulture.TextInfo;"); - return true; - } + writer.WriteLine("global::System.Globalization.TextInfo textInfo = global::System.Globalization.CultureInfo.CurrentCulture.TextInfo;"); + return true; } return false; @@ -3740,7 +3723,7 @@ private static string DescribeNode(RegexNode node, AnalysisResults analysis) => 
private static string DescribeCapture(int capNum, AnalysisResults analysis) { // If we can get a capture name from the captures collection and it's not just a numerical representation of the group, use it. - string name = RegexParser.GroupNameFromNumber(analysis.Code.Caps, analysis.Code.Tree.CapsList, analysis.Code.CapSize, capNum); + string name = RegexParser.GroupNameFromNumber(analysis.RegexTree.CaptureNumberSparseMapping, analysis.RegexTree.CaptureNames, analysis.RegexTree.CaptureCount, capNum); if (!string.IsNullOrEmpty(name) && (!int.TryParse(name, out int id) || id != capNum)) { diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Parser.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Parser.cs index 5053b535ae64e6..3fb4c923842c00 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Parser.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Parser.cs @@ -178,10 +178,10 @@ private static bool IsSemanticTargetForGeneration(SemanticModel semanticModel, M } // Parse the input pattern - RegexCode code; + RegexTree tree; try { - code = RegexWriter.Write(RegexParser.Parse(pattern, regexOptions, culture), culture); + tree = RegexParser.Parse(pattern, regexOptions, culture); } catch (Exception e) { @@ -199,7 +199,7 @@ private static bool IsSemanticTargetForGeneration(SemanticModel semanticModel, M pattern, regexOptions, matchTimeout ?? Timeout.Infinite, - code); + tree); var regexType = new RegexType( regexMethod, @@ -233,7 +233,7 @@ static bool IsAllowedKind(SyntaxKind kind) => } /// A regex method. 
- internal sealed record RegexMethod(MethodDeclarationSyntax MethodSyntax, string MethodName, string Modifiers, string Pattern, RegexOptions Options, int MatchTimeout, RegexCode Code); + internal sealed record RegexMethod(MethodDeclarationSyntax MethodSyntax, string MethodName, string Modifiers, string Pattern, RegexOptions Options, int MatchTimeout, RegexTree Tree); /// A type holding a regex method. internal sealed record RegexType(RegexMethod? Method, string Keyword, string Namespace, string Name) diff --git a/src/libraries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj b/src/libraries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj index 46fa6f7118da62..409d24373f5c4c 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj +++ b/src/libraries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj @@ -33,7 +33,6 @@ - @@ -45,7 +44,6 @@ - diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj index 676fe456dff0c6..b1888702e45616 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj +++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj @@ -26,11 +26,11 @@ - + diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs index 4bf6af10683fb4..3e0adde46b93ab 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Match.cs @@ -77,7 +77,7 @@ public bool IsMatch(string input) 
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input); } - return Run(quick: true, -1, input, 0, input.Length, UseOptionR() ? input.Length : 0) is null; + return Run(quick: true, -1, input, 0, input.Length, RightToLeft ? input.Length : 0) is null; } /// @@ -87,7 +87,7 @@ public bool IsMatch(string input) /// if the regular expression finds a match; otherwise, . /// A time-out ocurred. public bool IsMatch(ReadOnlySpan input) => - Run(input, UseOptionR() ? input.Length : 0) is null; + Run(input, RightToLeft ? input.Length : 0) is null; /// /// Searches the input string for one or more matches using the previous pattern and options, @@ -132,7 +132,7 @@ public Match Match(string input) ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input); } - return Run(quick: false, -1, input, 0, input.Length, UseOptionR() ? input.Length : 0)!; + return Run(quick: false, -1, input, 0, input.Length, RightToLeft ? input.Length : 0)!; } /// @@ -159,7 +159,7 @@ public Match Match(string input, int beginning, int length) ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input); } - return Run(quick: false, -1, input, beginning, length, UseOptionR() ? beginning + length : beginning)!; + return Run(quick: false, -1, input, beginning, length, RightToLeft ? beginning + length : beginning)!; } /// @@ -187,7 +187,7 @@ public MatchCollection Matches(string input) ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input); } - return new MatchCollection(this, input, UseOptionR() ? input.Length : 0); + return new MatchCollection(this, input, RightToLeft ? 
input.Length : 0); } /// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Replace.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Replace.cs index ba1f9a91e44492..c1c8111cf3dd8e 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Replace.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Replace.cs @@ -42,7 +42,7 @@ public string Replace(string input, string replacement) ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input); } - return Replace(input, replacement, -1, UseOptionR() ? input.Length : 0); + return Replace(input, replacement, -1, RightToLeft ? input.Length : 0); } /// @@ -57,7 +57,7 @@ public string Replace(string input, string replacement, int count) ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input); } - return Replace(input, replacement, count, UseOptionR() ? input.Length : 0); + return Replace(input, replacement, count, RightToLeft ? input.Length : 0); } /// @@ -111,7 +111,7 @@ public string Replace(string input, MatchEvaluator evaluator) ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input); } - return Replace(evaluator, this, input, -1, UseOptionR() ? input.Length : 0); + return Replace(evaluator, this, input, -1, RightToLeft ? input.Length : 0); } /// @@ -125,7 +125,7 @@ public string Replace(string input, MatchEvaluator evaluator, int count) ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input); } - return Replace(evaluator, this, input, count, UseOptionR() ? input.Length : 0); + return Replace(evaluator, this, input, count, RightToLeft ? 
input.Length : 0); } /// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Split.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Split.cs index 454aeacfa6df6c..327099750f6f60 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Split.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Split.cs @@ -35,7 +35,7 @@ public string[] Split(string input) ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input); } - return Split(this, input, 0, UseOptionR() ? input.Length : 0); + return Split(this, input, 0, RightToLeft ? input.Length : 0); } /// @@ -49,7 +49,7 @@ public string[] Split(string input, int count) ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input); } - return Split(this, input, count, UseOptionR() ? input.Length : 0); + return Split(this, input, count, RightToLeft ? input.Length : 0); } /// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs index c07558b20f1c57..57ee7df1c6914f 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs @@ -20,8 +20,6 @@ namespace System.Text.RegularExpressions /// public partial class Regex : ISerializable { - internal const int MaxOptionShift = 11; - [StringSyntax(StringSyntaxAttribute.Regex)] protected internal string? pattern; // The string pattern provided protected internal RegexOptions roptions; // the top-level options from the options string @@ -33,7 +31,6 @@ public partial class Regex : ISerializable private WeakReference? _replref; // cached parsed replacement pattern private volatile RegexRunner? _runner; // cached runner - private RegexCode? 
_code; // if interpreted, this is the code for RegexInterpreter protected Regex() { @@ -63,64 +60,69 @@ public Regex([StringSyntax(StringSyntaxAttribute.Regex, "options")] string patte internal Regex(string pattern, CultureInfo? culture) { - // Call Init directly rather than delegating to a Regex ctor that takes - // options to enable linking / tree shaking to remove the Regex compiler - // and NonBacktracking implementation if it's not used. - Init(pattern, RegexOptions.None, s_defaultMatchTimeout, culture ?? CultureInfo.CurrentCulture); + // Validate arguments. + ValidatePattern(pattern); + + // Parse and store the argument information. + RegexTree tree = Init(pattern, RegexOptions.None, s_defaultMatchTimeout, ref culture); + + // Create the interpreter factory. + factory = new RegexInterpreterFactory(tree, culture); + + // NOTE: This overload _does not_ delegate to the one that takes options, in order + // to avoid unnecessarily rooting the support for RegexOptions.NonBacktracking/Compiler + // if no options are ever used. } internal Regex(string pattern, RegexOptions options, TimeSpan matchTimeout, CultureInfo? culture) { - culture ??= RegexParser.GetTargetCulture(options); - Init(pattern, options, matchTimeout, culture); + // Validate arguments. + ValidatePattern(pattern); + ValidateOptions(options); + ValidateMatchTimeout(matchTimeout); + + // Parse and store the argument information. + RegexTree tree = Init(pattern, options, matchTimeout, ref culture); + // Create the appropriate factory. if ((options & RegexOptions.NonBacktracking) != 0) { // If we're in non-backtracking mode, create the appropriate factory. - factory = new SymbolicRegexRunnerFactory(_code, options, matchTimeout, culture); - _code = null; + factory = new SymbolicRegexRunnerFactory(tree, options, matchTimeout, culture); } - else if (RuntimeFeature.IsDynamicCodeCompiled && UseOptionC()) + else { - // If the compile option is set and compilation is supported, then compile the code. 
- // If the compiler can't compile this regex, it'll return null, and we'll fall back - // to the interpreter. - factory = Compile(pattern, _code, options, matchTimeout != InfiniteMatchTimeout); - if (factory is not null) + if (RuntimeFeature.IsDynamicCodeCompiled && (options & RegexOptions.Compiled) != 0) { - _code = null; + // If the compile option is set and compilation is supported, then compile the code. + // If the compiler can't compile this regex, it'll return null, and we'll fall back + // to the interpreter. + factory = Compile(pattern, tree, options, matchTimeout != InfiniteMatchTimeout); } + + // If no factory was created, fall back to creating one for the interpreter. + factory ??= new RegexInterpreterFactory(tree, culture); } } - /// Initializes the instance. - /// - /// This is separated out of the constructor so that an app only using 'new Regex(pattern)' - /// rather than 'new Regex(pattern, options)' can avoid statically referencing the Regex - /// compiler, such that a tree shaker / linker can trim it away if it's not otherwise used. - /// - [MemberNotNull(nameof(_code))] - private void Init(string pattern, RegexOptions options, TimeSpan matchTimeout, CultureInfo culture) + /// Stores the supplied arguments and capture information, returning the parsed expression. + private RegexTree Init(string pattern, RegexOptions options, TimeSpan matchTimeout, [NotNull] ref CultureInfo? culture) { - ValidatePattern(pattern); - ValidateOptions(options); - ValidateMatchTimeout(matchTimeout); - this.pattern = pattern; - internalMatchTimeout = matchTimeout; roptions = options; + internalMatchTimeout = matchTimeout; + culture ??= RegexParser.GetTargetCulture(options); - // Parse the input - RegexTree tree = RegexParser.Parse(pattern, roptions, culture); + // Parse the pattern. + RegexTree tree = RegexParser.Parse(pattern, options, culture); - // Generate the RegexCode from the node tree. 
This is required for interpreting, - // and is used as input into RegexOptions.Compiled and RegexOptions.NonBacktracking. - _code = RegexWriter.Write(tree, culture); + // Store the relevant information, constructing the appropriate factory. + capnames = tree.CaptureNameToNumberMapping; + capslist = tree.CaptureNames; + caps = tree.CaptureNumberSparseMapping; + capsize = tree.CaptureCount; - capnames = tree.CapNames; - capslist = tree.CapsList; - caps = _code.Caps; - capsize = _code.CapSize; + return tree; } internal static void ValidatePattern(string pattern) @@ -133,9 +135,9 @@ internal static void ValidatePattern(string pattern) internal static void ValidateOptions(RegexOptions options) { + const int MaxOptionShift = 11; if (((((uint)options) >> MaxOptionShift) != 0) || - ((options & RegexOptions.ECMAScript) != 0 && - (options & ~(RegexOptions.ECMAScript | RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Compiled | RegexOptions.NonBacktracking | RegexOptions.CultureInvariant)) != 0)) + ((options & RegexOptions.ECMAScript) != 0 && (options & ~(RegexOptions.ECMAScript | RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Compiled | RegexOptions.NonBacktracking | RegexOptions.CultureInvariant)) != 0)) { ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.options); } @@ -199,8 +201,8 @@ protected IDictionary? CapNames /// instantiating a non-compiled regex. /// [MethodImpl(MethodImplOptions.NoInlining)] - private static RegexRunnerFactory? Compile(string pattern, RegexCode code, RegexOptions options, bool hasTimeout) => - RegexCompiler.Compile(pattern, code, options, hasTimeout); + private static RegexRunnerFactory? 
Compile(string pattern, RegexTree regexTree, RegexOptions options, bool hasTimeout) => + RegexCompiler.Compile(pattern, regexTree, options, hasTimeout); [Obsolete(Obsoletions.RegexCompileToAssemblyMessage, DiagnosticId = Obsoletions.RegexCompileToAssemblyDiagId, UrlFormat = Obsoletions.SharedUrlFormat)] public static void CompileToAssembly(RegexCompilationInfo[] regexinfos, AssemblyName assemblyname) => @@ -254,7 +256,7 @@ public static string Unescape(string str) /// /// Indicates whether the regular expression matches from right to left. /// - public bool RightToLeft => UseOptionR(); + public bool RightToLeft => (roptions & RegexOptions.RightToLeft) != 0; /// /// Returns the regular expression pattern passed into the constructor @@ -554,13 +556,14 @@ internal void Run(string input, int startat, ref TState state, MatchCall /// Creates a new runner instance. private RegexRunner CreateRunner() => - factory?.CreateInstance() ?? - new RegexInterpreter(_code!, RegexParser.GetTargetCulture(roptions)); + // The factory needs to be set by the ctor. `factory` is a protected field, so it's possible a derived + // type nulls out the factory after we've set it, but that's the nature of the design. + factory!.CreateInstance(); /// True if the option was set. protected bool UseOptionC() => (roptions & RegexOptions.Compiled) != 0; /// True if the option was set. 
- protected internal bool UseOptionR() => (roptions & RegexOptions.RightToLeft) != 0; + protected internal bool UseOptionR() => RightToLeft; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index a0be5ef4eb69ac..2161853dada55d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -68,8 +68,8 @@ internal abstract class RegexCompiler protected ILGenerator? _ilg; /// The options for the expression. protected RegexOptions _options; - /// The code written for the expression. - protected RegexCode? _code; + /// The written for the expression. + protected RegexTree? _regexTree; /// Whether this expression has a non-infinite timeout. protected bool _hasTimeout; @@ -93,8 +93,8 @@ internal abstract class RegexCompiler /// Entry point to dynamically compile a regular expression. The expression is compiled to /// an in-memory assembly. /// - internal static RegexRunnerFactory? Compile(string pattern, RegexCode code, RegexOptions options, bool hasTimeout) => - new RegexLWCGCompiler().FactoryInstanceFromCode(pattern, code, options, hasTimeout); + internal static RegexRunnerFactory? Compile(string pattern, RegexTree regexTree, RegexOptions options, bool hasTimeout) => + new RegexLWCGCompiler().FactoryInstanceFromCode(pattern, regexTree, options, hasTimeout); /// A macro for _ilg.DefineLabel private Label DefineLabel() => _ilg!.DefineLabel(); @@ -366,7 +366,7 @@ private void CallToLower() /// Generates the implementation for TryFindNextPossibleStartingPosition. 
protected void EmitTryFindNextPossibleStartingPosition() { - Debug.Assert(_code != null); + Debug.Assert(_regexTree != null); _int32LocalsPool?.Clear(); _readOnlySpanCharLocalsPool?.Clear(); @@ -377,13 +377,13 @@ protected void EmitTryFindNextPossibleStartingPosition() _textInfo = null; if ((_options & RegexOptions.CultureInvariant) == 0) { - bool needsCulture = _code.FindOptimizations.FindMode switch + bool needsCulture = _regexTree.FindOptimizations.FindMode switch { FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseInsensitive or FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive => true, - _ when _code.FindOptimizations.FixedDistanceSets is List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)> sets => sets.Exists(set => set.CaseInsensitive), + _ when _regexTree.FindOptimizations.FixedDistanceSets is List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)> sets => sets.Exists(set => set.CaseInsensitive), _ => false, }; @@ -407,7 +407,7 @@ FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or // Generate length check. If the input isn't long enough to possibly match, fail quickly. // It's rare for min required length to be 0, so we don't bother special-casing the check, // especially since we want the "return false" code regardless. - int minRequiredLength = _code.Tree.MinRequiredLength; + int minRequiredLength = _regexTree.FindOptimizations.MinRequiredLength; Debug.Assert(minRequiredLength >= 0); Label returnFalse = DefineLabel(); Label finishedLengthCheck = DefineLabel(); @@ -442,28 +442,28 @@ FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or } // Either anchors weren't specified, or they don't completely root all matches to a specific location. 
- switch (_code.FindOptimizations.FindMode) + switch (_regexTree.FindOptimizations.FindMode) { case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive: - Debug.Assert(!string.IsNullOrEmpty(_code.FindOptimizations.LeadingCaseSensitivePrefix)); - EmitIndexOf_LeftToRight(_code.FindOptimizations.LeadingCaseSensitivePrefix); + Debug.Assert(!string.IsNullOrEmpty(_regexTree.FindOptimizations.LeadingCaseSensitivePrefix)); + EmitIndexOf_LeftToRight(_regexTree.FindOptimizations.LeadingCaseSensitivePrefix); break; case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive: case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive: case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive: case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive: - Debug.Assert(_code.FindOptimizations.FixedDistanceSets is { Count: > 0 }); + Debug.Assert(_regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 }); EmitFixedSet_LeftToRight(); break; case FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight_CaseSensitive: - Debug.Assert(_code.FindOptimizations.LiteralAfterLoop is not null); + Debug.Assert(_regexTree.FindOptimizations.LiteralAfterLoop is not null); EmitLiteralAfterAtomicLoop(); break; default: - Debug.Fail($"Unexpected mode: {_code.FindOptimizations.FindMode}"); + Debug.Fail($"Unexpected mode: {_regexTree.FindOptimizations.FindMode}"); goto case FindNextStartingPositionMode.NoSearch; case FindNextStartingPositionMode.NoSearch: @@ -480,7 +480,7 @@ bool GenerateAnchors() Label label; // Anchors that fully implement TryFindNextPossibleStartingPosition, with a check that leads to immediate success or failure determination. 
- switch (_code.FindOptimizations.FindMode) + switch (_regexTree.FindOptimizations.FindMode) { case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning: label = DefineLabel(); @@ -538,16 +538,16 @@ bool GenerateAnchors() case FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ: // Jump to the end, minus the min required length, which in this case is actually the fixed length. { - int extraNewlineBump = _code.FindOptimizations.FindMode == FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ ? 1 : 0; + int extraNewlineBump = _regexTree.FindOptimizations.FindMode == FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ ? 1 : 0; label = DefineLabel(); Ldloc(pos); Ldloc(end); - Ldc(_code.Tree.MinRequiredLength + extraNewlineBump); + Ldc(_regexTree.FindOptimizations.MinRequiredLength + extraNewlineBump); Sub(); Bge(label); Ldthis(); Ldloc(end); - Ldc(_code.Tree.MinRequiredLength + extraNewlineBump); + Ldc(_regexTree.FindOptimizations.MinRequiredLength + extraNewlineBump); Sub(); Stfld(s_runtextposField); MarkLabel(label); @@ -559,7 +559,7 @@ bool GenerateAnchors() // Now handle anchors that boost the position but don't determine immediate success or failure. - switch (_code.FindOptimizations.LeadingAnchor) + switch (_regexTree.FindOptimizations.LeadingAnchor) { case RegexNodeKind.Bol: { @@ -625,12 +625,12 @@ bool GenerateAnchors() break; } - switch (_code.FindOptimizations.TrailingAnchor) + switch (_regexTree.FindOptimizations.TrailingAnchor) { - case RegexNodeKind.End or RegexNodeKind.EndZ when _code.FindOptimizations.MaxPossibleLength is int maxLength: + case RegexNodeKind.End or RegexNodeKind.EndZ when _regexTree.FindOptimizations.MaxPossibleLength is int maxLength: // Jump to the end, minus the max allowed length. { - int extraNewlineBump = _code.FindOptimizations.FindMode == FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ ? 
1 : 0; + int extraNewlineBump = _regexTree.FindOptimizations.FindMode == FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ ? 1 : 0; label = DefineLabel(); Ldloc(pos); Ldloc(end); @@ -683,7 +683,7 @@ void EmitIndexOf_LeftToRight(string prefix) void EmitFixedSet_LeftToRight() { - List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? sets = _code.FindOptimizations.FixedDistanceSets; + List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? sets = _regexTree.FindOptimizations.FixedDistanceSets; (char[]? Chars, string Set, int Distance, bool CaseInsensitive) primarySet = sets![0]; const int MaxSets = 4; int setsToUse = Math.Min(sets.Count, MaxSets); @@ -882,8 +882,8 @@ void EmitFixedSet_LeftToRight() // Emits a search for a literal following a leading atomic single-character loop. void EmitLiteralAfterAtomicLoop() { - Debug.Assert(_code.FindOptimizations.LiteralAfterLoop is not null); - (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal) target = _code.FindOptimizations.LiteralAfterLoop.Value; + Debug.Assert(_regexTree.FindOptimizations.LiteralAfterLoop is not null); + (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal) target = _regexTree.FindOptimizations.LiteralAfterLoop.Value; Debug.Assert(target.LoopNode.Kind is RegexNodeKind.Setloop or RegexNodeKind.Setlazy or RegexNodeKind.Setloopatomic); Debug.Assert(target.LoopNode.N == int.MaxValue); @@ -1048,12 +1048,12 @@ protected void EmitTryMatchAtCurrentPosition() // "doneLabel" is simply the final return location from the TryMatchAtCurrentPosition method that will undo any captures and exit, signaling to // the calling scan loop that nothing was matched. - Debug.Assert(_code != null); + Debug.Assert(_regexTree != null); _int32LocalsPool?.Clear(); _readOnlySpanCharLocalsPool?.Clear(); // Get the root Capture node of the tree. 
- RegexNode node = _code.Tree.Root; + RegexNode node = _regexTree.Root; Debug.Assert(node.Kind == RegexNodeKind.Capture, "Every generated tree should begin with a capture node"); Debug.Assert(node.ChildCount() == 1, "Capture nodes should have one child"); @@ -1090,6 +1090,8 @@ protected void EmitTryMatchAtCurrentPosition() // performance. Since that's not applicable to RegexCompiler, that code isn't mirrored here. } + AnalysisResults analysis = RegexTreeAnalyzer.Analyze(_regexTree); + // Initialize the main locals used throughout the implementation. LocalBuilder inputSpan = DeclareReadOnlySpanChar(); LocalBuilder originalPos = DeclareInt32(); @@ -1104,7 +1106,7 @@ protected void EmitTryMatchAtCurrentPosition() } // CultureInfo culture = CultureInfo.CurrentCulture; // only if the whole expression or any subportion is ignoring case, and we're not using invariant - InitializeCultureForTryMatchAtCurrentPositionIfNecessary(); + InitializeCultureForTryMatchAtCurrentPositionIfNecessary(analysis); // ReadOnlySpan inputSpan = input; // int end = base.runtextend; @@ -1133,8 +1135,6 @@ protected void EmitTryMatchAtCurrentPosition() int sliceStaticPos = 0; SliceInputSpan(); - AnalysisResults analysis = RegexTreeAnalyzer.Analyze(_code); - // Check whether there are captures anywhere in the expression. If there isn't, we can skip all // the boilerplate logic around uncapturing, as there won't be anything to uncapture. bool expressionHasCaptures = analysis.MayContainCapture(node); @@ -1470,7 +1470,7 @@ void EmitBackreference(RegexNode node) { Debug.Assert(node.Kind is RegexNodeKind.Backreference, $"Unexpected type: {node.Kind}"); - int capnum = RegexParser.MapCaptureNumber(node.M, _code!.Caps); + int capnum = RegexParser.MapCaptureNumber(node.M, _regexTree!.CaptureNumberSparseMapping); TransferSliceStaticPosToPos(); @@ -1569,7 +1569,7 @@ void EmitBackreferenceConditional(RegexNode node) TransferSliceStaticPosToPos(); // Get the capture number to test. 
- int capnum = RegexParser.MapCaptureNumber(node.M, _code!.Caps); + int capnum = RegexParser.MapCaptureNumber(node.M, _regexTree!.CaptureNumberSparseMapping); // Get the "yes" branch and the "no" branch. The "no" branch is optional in syntax and is thus // somewhat likely to be Empty. @@ -1889,8 +1889,8 @@ void EmitCapture(RegexNode node, RegexNode? subsequent = null) Debug.Assert(node.Kind is RegexNodeKind.Capture, $"Unexpected type: {node.Kind}"); Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}"); - int capnum = RegexParser.MapCaptureNumber(node.M, _code!.Caps); - int uncapnum = RegexParser.MapCaptureNumber(node.N, _code.Caps); + int capnum = RegexParser.MapCaptureNumber(node.M, _regexTree!.CaptureNumberSparseMapping); + int uncapnum = RegexParser.MapCaptureNumber(node.N, _regexTree.CaptureNumberSparseMapping); bool isAtomic = analysis.IsAtomicByAncestor(node); // pos += sliceStaticPos; @@ -4016,31 +4016,14 @@ protected void EmitScan(DynamicMethod tryFindNextStartingPositionMethod, Dynamic Ret(); } - private void InitializeCultureForTryMatchAtCurrentPositionIfNecessary() + private void InitializeCultureForTryMatchAtCurrentPositionIfNecessary(AnalysisResults analysis) { _textInfo = null; - if ((_options & RegexOptions.CultureInvariant) == 0) + if (analysis.HasIgnoreCase && (_options & RegexOptions.CultureInvariant) == 0) { - bool needsCulture = (_options & RegexOptions.IgnoreCase) != 0; - if (!needsCulture) - { - int[] codes = _code!.Codes; - for (int codepos = 0; codepos < codes.Length; codepos += RegexCode.OpcodeSize((RegexOpcode)codes[codepos])) - { - if (((RegexOpcode)codes[codepos] & RegexOpcode.CaseInsensitive) == RegexOpcode.CaseInsensitive) - { - needsCulture = true; - break; - } - } - } - - if (needsCulture) - { - // cache CultureInfo in local variable which saves excessive thread local storage accesses - _textInfo = DeclareTextInfo(); - InitLocalCultureInfo(); - } + // cache CultureInfo in local variable which saves 
excessive thread local storage accesses + _textInfo = DeclareTextInfo(); + InitLocalCultureInfo(); } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs index e8d727582de52d..181a85863c45da 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs @@ -10,9 +10,6 @@ namespace System.Text.RegularExpressions /// Contains state and provides operations related to finding the next location a match could possibly begin. internal sealed class RegexFindOptimizations { - /// The minimum required length an input need be to match the pattern. - /// 0 is a valid minimum length. This value may also be the max (and hence fixed) length of the expression. - private readonly int _minRequiredLength; /// True if the input should be processed right-to-left rather than left-to-right. private readonly bool _rightToLeft; /// Provides the ToLower routine for lowercasing characters. @@ -20,15 +17,16 @@ internal sealed class RegexFindOptimizations /// Lookup table used for optimizing ASCII when doing set queries. private readonly uint[]?[]? _asciiLookups; - public RegexFindOptimizations(RegexTree tree, CultureInfo culture) + public RegexFindOptimizations(RegexNode root, RegexOptions options, CultureInfo culture) { - _rightToLeft = (tree.Options & RegexOptions.RightToLeft) != 0; - _minRequiredLength = tree.MinRequiredLength; + _rightToLeft = (options & RegexOptions.RightToLeft) != 0; _textInfo = culture.TextInfo; + MinRequiredLength = root.ComputeMinLength(); + // Compute any anchor starting the expression. If there is one, we won't need to search for anything, // as we can just match at that single location. 
- LeadingAnchor = RegexPrefixAnalyzer.FindLeadingAnchor(tree.Root); + LeadingAnchor = RegexPrefixAnalyzer.FindLeadingAnchor(root); if (_rightToLeft && LeadingAnchor == RegexNodeKind.Bol) { // Filter out Bol for RightToLeft, as we don't currently optimize for it. @@ -56,15 +54,15 @@ public RegexFindOptimizations(RegexTree tree, CultureInfo culture) { bool triedToComputeMaxLength = false; - TrailingAnchor = RegexPrefixAnalyzer.FindTrailingAnchor(tree.Root); + TrailingAnchor = RegexPrefixAnalyzer.FindTrailingAnchor(root); if (TrailingAnchor is RegexNodeKind.End or RegexNodeKind.EndZ) { triedToComputeMaxLength = true; - if (tree.Root.ComputeMaxLength() is int maxLength) + if (root.ComputeMaxLength() is int maxLength) { - Debug.Assert(maxLength >= _minRequiredLength, $"{maxLength} should have been greater than {_minRequiredLength} minimum"); + Debug.Assert(maxLength >= MinRequiredLength, $"{maxLength} should have been greater than {MinRequiredLength} minimum"); MaxPossibleLength = maxLength; - if (_minRequiredLength == maxLength) + if (MinRequiredLength == maxLength) { FindMode = TrailingAnchor == RegexNodeKind.End ? FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_End : @@ -74,16 +72,16 @@ public RegexFindOptimizations(RegexTree tree, CultureInfo culture) } } - if ((tree.Options & RegexOptions.NonBacktracking) != 0 && !triedToComputeMaxLength) + if ((options & RegexOptions.NonBacktracking) != 0 && !triedToComputeMaxLength) { // NonBacktracking also benefits from knowing whether the pattern is a fixed length, as it can use that // knowledge to avoid multiple match phases in some situations. - MaxPossibleLength = tree.Root.ComputeMaxLength(); + MaxPossibleLength = root.ComputeMaxLength(); } } // If there's a leading case-sensitive substring, just use IndexOf and inherit all of its optimizations. 
- string caseSensitivePrefix = RegexPrefixAnalyzer.FindCaseSensitivePrefix(tree.Root); + string caseSensitivePrefix = RegexPrefixAnalyzer.FindCaseSensitivePrefix(root); if (caseSensitivePrefix.Length > 1) { LeadingCaseSensitivePrefix = caseSensitivePrefix; @@ -98,8 +96,8 @@ public RegexFindOptimizations(RegexTree tree, CultureInfo culture) // If we're compiling, then the compilation process already handles sets that reduce to a single literal, // so we can simplify and just always go for the sets. - bool dfa = (tree.Options & RegexOptions.NonBacktracking) != 0; - bool compiled = (tree.Options & RegexOptions.Compiled) != 0 && !dfa; // for now, we never generate code for NonBacktracking, so treat it as non-compiled + bool dfa = (options & RegexOptions.NonBacktracking) != 0; + bool compiled = (options & RegexOptions.Compiled) != 0 && !dfa; // for now, we never generate code for NonBacktracking, so treat it as non-compiled bool interpreter = !compiled && !dfa; // For interpreter, we want to employ optimizations, but we don't want to make construction significantly @@ -109,7 +107,7 @@ public RegexFindOptimizations(RegexTree tree, CultureInfo culture) if (_rightToLeft) { // Determine a set for anything that can possibly start the expression. - if (RegexPrefixAnalyzer.FindFirstCharClass(tree, culture) is (string CharClass, bool CaseInsensitive) set) + if (RegexPrefixAnalyzer.FindFirstCharClass(root, culture) is (string CharClass, bool CaseInsensitive) set) { // See if the set is limited to holding only a few characters. Span scratch = stackalloc char[5]; // max optimized by IndexOfAny today @@ -148,10 +146,10 @@ public RegexFindOptimizations(RegexTree tree, CultureInfo culture) // As a backup, see if we can find a literal after a leading atomic loop. That might be better than whatever sets we find, so // we want to know whether we have one in our pocket before deciding whether to use a leading set. - (RegexNode LoopNode, (char Char, string? String, char[]? 
Chars) Literal)? literalAfterLoop = RegexPrefixAnalyzer.FindLiteralFollowingLeadingLoop(tree); + (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? literalAfterLoop = RegexPrefixAnalyzer.FindLiteralFollowingLeadingLoop(root); // Build up a list of all of the sets that are a fixed distance from the start of the expression. - List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? fixedDistanceSets = RegexPrefixAnalyzer.FindFixedDistanceSets(tree, culture, thorough: !interpreter); + List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? fixedDistanceSets = RegexPrefixAnalyzer.FindFixedDistanceSets(root, culture, thorough: !interpreter); Debug.Assert(fixedDistanceSets is null || fixedDistanceSets.Count != 0); // If we got such sets, we'll likely use them. However, if the best of them is something that doesn't support a vectorized @@ -214,6 +212,10 @@ public RegexFindOptimizations(RegexTree tree, CultureInfo culture) /// Gets the trailing anchor (e.g. RegexNodeKind.Bol) if one exists and was computed. public RegexNodeKind TrailingAnchor { get; } + /// Gets the minimum required length an input need be to match the pattern. + /// 0 is a valid minimum length. This value may also be the max (and hence fixed) length of the expression. + public int MinRequiredLength { get; } + /// The maximum possible length an input could be to match the pattern. /// /// This is currently only set when is found to be an end anchor. @@ -246,7 +248,7 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos // Return early if we know there's not enough input left to match. 
if (!_rightToLeft) { - if (pos > end - _minRequiredLength) + if (pos > end - MinRequiredLength) { pos = end; return false; @@ -254,7 +256,7 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos } else { - if (pos - _minRequiredLength < beginning) + if (pos - MinRequiredLength < beginning) { pos = beginning; return false; @@ -351,16 +353,16 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos return true; case FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ: - if (pos < end - _minRequiredLength - 1) + if (pos < end - MinRequiredLength - 1) { - pos = end - _minRequiredLength - 1; + pos = end - MinRequiredLength - 1; } return true; case FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_End: - if (pos < end - _minRequiredLength) + if (pos < end - MinRequiredLength) { - pos = end - _minRequiredLength; + pos = end - MinRequiredLength; } return true; @@ -522,7 +524,7 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos case FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseSensitive: { - Debug.Assert(FixedDistanceLiteral.Distance <= _minRequiredLength); + Debug.Assert(FixedDistanceLiteral.Distance <= MinRequiredLength); int i = textSpan.Slice(pos + FixedDistanceLiteral.Distance, end - pos - FixedDistanceLiteral.Distance).IndexOf(FixedDistanceLiteral.Literal); if (i >= 0) @@ -537,7 +539,7 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos case FindNextStartingPositionMode.FixedLiteral_LeftToRight_CaseInsensitive: { - Debug.Assert(FixedDistanceLiteral.Distance <= _minRequiredLength); + Debug.Assert(FixedDistanceLiteral.Distance <= MinRequiredLength); char ch = FixedDistanceLiteral.Literal; TextInfo ti = _textInfo; @@ -562,7 +564,7 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos { List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)> sets = FixedDistanceSets!; (char[]? 
primaryChars, string primarySet, int primaryDistance, _) = sets[0]; - int endMinusRequiredLength = end - Math.Max(1, _minRequiredLength); + int endMinusRequiredLength = end - Math.Max(1, MinRequiredLength); if (primaryChars is not null) { @@ -637,7 +639,7 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)> sets = FixedDistanceSets!; (_, string primarySet, int primaryDistance, _) = sets[0]; - int endMinusRequiredLength = end - Math.Max(1, _minRequiredLength); + int endMinusRequiredLength = end - Math.Max(1, MinRequiredLength); TextInfo ti = _textInfo; ref uint[]? startingAsciiLookup = ref _asciiLookups![0]; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index 1092db83c243fe..ee55c5646cf160 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -8,12 +8,26 @@ namespace System.Text.RegularExpressions { + /// A <see cref="RegexRunnerFactory"/> for creating <see cref="RegexInterpreter"/>s. + internal sealed class RegexInterpreterFactory : RegexRunnerFactory + { + private readonly RegexInterpreterCode _code; + + public RegexInterpreterFactory(RegexTree tree, CultureInfo culture) => + // Generate and store the RegexInterpreterCode for the RegexTree and the specified culture + _code = RegexWriter.Write(tree, culture); + + protected internal override RegexRunner CreateInstance() => + // Create a new interpreter instance. + new RegexInterpreter(_code, RegexParser.GetTargetCulture(_code.Options)); + } + /// Executes a block of regular expression codes while consuming input. 
internal sealed class RegexInterpreter : RegexRunner { private const int LoopTimeoutCheckCount = 2048; // conservative value to provide reasonably-accurate timeout handling. - private readonly RegexCode _code; + private readonly RegexInterpreterCode _code; private readonly TextInfo _textInfo; private RegexOpcode _operator; @@ -21,7 +35,7 @@ internal sealed class RegexInterpreter : RegexRunner private bool _rightToLeft; private bool _caseInsensitive; - public RegexInterpreter(RegexCode code, CultureInfo culture) + public RegexInterpreter(RegexInterpreterCode code, CultureInfo culture) { Debug.Assert(code != null, "code must not be null."); Debug.Assert(culture != null, "culture must not be null."); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreterCode.cs similarity index 86% rename from src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs rename to src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreterCode.cs index f31d9301c47f19..87eb2e8d75e420 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreterCode.cs @@ -1,19 +1,18 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -using System.Collections; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; -using System.Globalization; namespace System.Text.RegularExpressions { - /// Representation of a regular expression, written by and containing the code evaluated by . - /// It currently stores some data used by engines other than the interpreter; that can be refactored out in the future. 
- internal sealed class RegexCode + /// Contains the code, written by <see cref="RegexWriter"/>, for <see cref="RegexInterpreter"/> to evaluate a regular expression. + internal sealed class RegexInterpreterCode { - /// The optimized parse tree. - public readonly RegexTree Tree; + /// Find logic to use to find the next possible location for a match. + public readonly RegexFindOptimizations FindOptimizations; + /// The options associated with the regex. + public readonly RegexOptions Options; /// RegexOpcodes and arguments written by <see cref="RegexWriter"/>. public readonly int[] Codes; /// The string / set table. <see cref="Codes"/> includes offsets into this table, for string and set arguments. @@ -22,26 +21,15 @@ internal sealed class RegexCode public readonly uint[]?[] StringsAsciiLookup; /// How many instructions in <see cref="Codes"/> use backtracking. public readonly int TrackCount; - /// Mapping of user group numbers to impl group slots. - public readonly Hashtable? Caps; - /// Number of impl group slots. - public readonly int CapSize; - /// True if right to left. - public readonly bool RightToLeft; - /// Optimization mode and supporting data to enable quickly finding the next possible match location. - public readonly RegexFindOptimizations FindOptimizations; - public RegexCode(RegexTree tree, CultureInfo culture, int[] codes, string[] strings, int trackcount, Hashtable? caps, int capsize) + public RegexInterpreterCode(RegexFindOptimizations findOptimizations, RegexOptions options, int[] codes, string[] strings, int trackcount) { - Tree = tree; + FindOptimizations = findOptimizations; + Options = options; Codes = codes; Strings = strings; StringsAsciiLookup = new uint[strings.Length][]; TrackCount = trackcount; - Caps = caps; - CapSize = capsize; - RightToLeft = (tree.Options & RegexOptions.RightToLeft) != 0; - FindOptimizations = new RegexFindOptimizations(tree, culture); } /// Gets whether the specified opcode may incur backtracking. @@ -152,8 +140,7 @@ public override string ToString() { var sb = new StringBuilder(); - sb.AppendLine($"Direction: {(RightToLeft ? 
"right-to-left" : "left-to-right")}"); - sb.AppendLine($"Anchor: {FindOptimizations.LeadingAnchor}"); + sb.AppendLine($"Direction: {((Options & RegexOptions.RightToLeft) != 0 ? "right-to-left" : "left-to-right")}"); sb.AppendLine(); for (int i = 0; i < Codes.Length; i += OpcodeSize((RegexOpcode)Codes[i])) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs index fe467efd05a41f..60b1398b0268f6 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs @@ -30,14 +30,14 @@ internal sealed class RegexLWCGCompiler : RegexCompiler private static int s_regexCount; /// The top-level driver. Initializes everything then calls the Generate* methods. - public RegexRunnerFactory? FactoryInstanceFromCode(string pattern, RegexCode code, RegexOptions options, bool hasTimeout) + public RegexRunnerFactory? 
FactoryInstanceFromCode(string pattern, RegexTree regexTree, RegexOptions options, bool hasTimeout) { - if (!code.Tree.Root.SupportsCompilation()) + if (!regexTree.Root.SupportsCompilation()) { return null; } - _code = code; + _regexTree = regexTree; _options = options; _hasTimeout = hasTimeout; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 0807f9cfcc923b..93df3aef238adc 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -497,8 +497,29 @@ private void EliminateEndingBacktracking() /// /// Removes redundant nodes from the subtree, and returns an optimized subtree. /// - internal RegexNode Reduce() => - Kind switch + internal RegexNode Reduce() + { + // TODO: https://github.com/dotnet/runtime/issues/61048 + // As part of overhauling IgnoreCase handling, the parser shouldn't produce any nodes other than Backreference + // that ever have IgnoreCase set on them. For now, though, remove IgnoreCase from any nodes for which it + // has no behavioral effect. 
+ switch (Kind) + { + default: + // No effect + Options &= ~RegexOptions.IgnoreCase; + break; + + case RegexNodeKind.One or RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic: + case RegexNodeKind.Notone or RegexNodeKind.Notonelazy or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic: + case RegexNodeKind.Set or RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic: + case RegexNodeKind.Multi: + case RegexNodeKind.Backreference: + // Still meaningful + break; + } + + return Kind switch { RegexNodeKind.Alternate => ReduceAlternation(), RegexNodeKind.Atomic => ReduceAtomic(), @@ -512,6 +533,7 @@ internal RegexNode Reduce() => RegexNodeKind.BackreferenceConditional => ReduceTestref(), _ => this, }; + } /// Remove an unnecessary Concatenation or Alternation node /// diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOpcode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOpcode.cs index eaaa355aa97005..4a7a4ea6d2741d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOpcode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOpcode.cs @@ -5,7 +5,7 @@ namespace System.Text.RegularExpressions { /// Opcodes written by <see cref="RegexWriter"/> and used by <see cref="RegexInterpreter"/> to process a regex. /// - /// <see cref="RegexCode"/> stores an int[] containing all of the codes that make up the instructions for + /// <see cref="RegexInterpreterCode"/> stores an int[] containing all of the codes that make up the instructions for /// the interpreter to process the regular expression. The array contains a packed sequence of operations, /// each of which is an <see cref="RegexOpcode"/> stored as an int, followed immediately by all of the operands /// required for that operation. 
For example, the subexpression `a{2,7}[^b]` would be represented as the sequence diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs index 43a62f3ca1621b..bd89292697a28a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs @@ -76,28 +76,41 @@ private RegexParser(string pattern, RegexOptions options, CultureInfo culture, H _ignoreNextParen = false; } - private RegexParser(string pattern, RegexOptions options, CultureInfo culture, Span optionSpan) - : this(pattern, options, culture, new Hashtable(), default, null, optionSpan) - { - } - /// Gets the culture to use based on the specified options. internal static CultureInfo GetTargetCulture(RegexOptions options) => (options & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture; public static RegexTree Parse(string pattern, RegexOptions options, CultureInfo culture) { - var parser = new RegexParser(pattern, options, culture, stackalloc int[OptionStackDefaultSize]); + using var parser = new RegexParser(pattern, options, culture, new Hashtable(), 0, null, stackalloc int[OptionStackDefaultSize]); parser.CountCaptures(); parser.Reset(options); RegexNode root = parser.ScanRegex(); - int minRequiredLength = root.ComputeMinLength(); - string[]? capnamelist = parser._capnamelist?.ToArray(); - var tree = new RegexTree(root, parser._caps, parser._capnumlist!, parser._captop, parser._capnames!, capnamelist!, options, minRequiredLength); - parser.Dispose(); - return tree; + int[]? captureNumberList = parser._capnumlist; + Hashtable? 
sparseMapping = parser._caps; + int captop = parser._captop; + + int captureCount; + if (captureNumberList == null || captop == captureNumberList.Length) + { + // The capture list isn't sparse. Null out the capture mapping as it's not necessary, + // and store the number of captures. + captureCount = captop; + sparseMapping = null; + } + else + { + // The capture list is sparse. Store the number of captures, and populate the number-to-names-list. + captureCount = captureNumberList.Length; + for (int i = 0; i < captureNumberList.Length; i++) + { + sparseMapping[captureNumberList[i]] = i; + } + } + + return new RegexTree(root, captureCount, parser._capnamelist?.ToArray(), parser._capnames!, sparseMapping, options, culture); } /// @@ -106,11 +119,10 @@ public static RegexTree Parse(string pattern, RegexOptions options, CultureInfo public static RegexReplacement ParseReplacement(string pattern, RegexOptions options, Hashtable caps, int capsize, Hashtable capnames) { CultureInfo culture = (options & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture; - var parser = new RegexParser(pattern, options, culture, caps, capsize, capnames, stackalloc int[OptionStackDefaultSize]); + using var parser = new RegexParser(pattern, options, culture, caps, capsize, capnames, stackalloc int[OptionStackDefaultSize]); RegexNode root = parser.ScanReplacement(); var regexReplacement = new RegexReplacement(pattern, root, caps); - parser.Dispose(); return regexReplacement; } @@ -198,7 +210,7 @@ public static string Unescape(string input) private static string UnescapeImpl(string input, int i) { - var parser = new RegexParser(input, RegexOptions.None, CultureInfo.InvariantCulture, stackalloc int[OptionStackDefaultSize]); + using var parser = new RegexParser(input, RegexOptions.None, CultureInfo.InvariantCulture, new Hashtable(), 0, null, stackalloc int[OptionStackDefaultSize]); // In the worst case the escaped string has the same length. 
// For small inputs we use stack allocation. @@ -226,8 +238,6 @@ private static string UnescapeImpl(string input, int i) vsb.Append(input.AsSpan(lastpos, i - lastpos)); } while (i < input.Length); - parser.Dispose(); - return vsb.ToString(); } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs index c3e80fc9913718..09b19675e5f92e 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs @@ -191,12 +191,11 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb) } /// Finds sets at fixed-offsets from the beginning of the pattern/ - /// The RegexNode tree. + /// The RegexNode tree root. /// The culture to use for any case conversions. /// true to spend more time finding sets (e.g. through alternations); false to do a faster analysis that's potentially more incomplete. /// The array of found sets, or null if there aren't any. - public static List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? FindFixedDistanceSets( - RegexTree tree, CultureInfo culture, bool thorough) + public static List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? FindFixedDistanceSets(RegexNode root, CultureInfo culture, bool thorough) { const int MaxLoopExpansion = 20; // arbitrary cut-off to avoid loops adding significant overhead to processing const int MaxFixedResults = 50; // arbitrary cut-off to avoid generating lots of sets unnecessarily @@ -204,13 +203,7 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb) // Find all fixed-distance sets. var results = new List<(char[]? 
Chars, string Set, int Distance, bool CaseInsensitive)>(); int distance = 0; - TryFindFixedSets(tree.Root, results, ref distance, culture, thorough); -#if DEBUG - foreach ((char[]? Chars, string Set, int Distance, bool CaseInsensitive) result in results) - { - Debug.Assert(result.Distance <= tree.MinRequiredLength, $"Min: {tree.MinRequiredLength}, Distance: {result.Distance}, Tree: {tree}"); - } -#endif + TryFindFixedSets(root, results, ref distance, culture, thorough); // Remove any sets that match everything; they're not helpful. (This check exists primarily to weed // out use of . in Singleline mode.) @@ -233,7 +226,7 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb) // doesn't. if (results.Count == 0) { - (string CharClass, bool CaseInsensitive)? first = FindFirstCharClass(tree, culture); + (string CharClass, bool CaseInsensitive)? first = FindFirstCharClass(root, culture); if (first is not null) { results.Add((null, first.Value.CharClass, 0, first.Value.CaseInsensitive)); @@ -540,10 +533,10 @@ static bool TryFindFixedSets(RegexNode node, List<(char[]? Chars, string Set, in /// variable position, but this will find [ab] as it's instead looking for anything that under any /// circumstance could possibly start a match. /// - public static (string CharClass, bool CaseInsensitive)? FindFirstCharClass(RegexTree tree, CultureInfo culture) + public static (string CharClass, bool CaseInsensitive)? FindFirstCharClass(RegexNode root, CultureInfo culture) { var s = new RegexPrefixAnalyzer(stackalloc int[StackBufferSize]); - RegexFC? fc = s.RegexFCFromRegexTree(tree); + RegexFC? fc = s.RegexFCFromRegexTree(root); s.Dispose(); if (fc == null || fc._nullable) @@ -563,9 +556,8 @@ public static (string CharClass, bool CaseInsensitive)? FindFirstCharClass(Regex /// Analyzes the pattern for a leading set loop followed by a non-overlapping literal. 
If such a pattern is found, an implementation /// can search for the literal and then walk backward through all matches for the loop until the beginning is found. /// - public static (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? FindLiteralFollowingLeadingLoop(RegexTree tree) + public static (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? FindLiteralFollowingLeadingLoop(RegexNode node) { - RegexNode node = tree.Root; if ((node.Options & RegexOptions.RightToLeft) != 0) { // As a simplification, ignore RightToLeft. @@ -788,9 +780,9 @@ private RegexFC PopFC() /// through the tree and calls CalculateFC to emits code before /// and after each child of an interior node, and at each leaf. /// - private RegexFC? RegexFCFromRegexTree(RegexTree tree) + private RegexFC? RegexFCFromRegexTree(RegexNode root) { - RegexNode? curNode = tree.Root; + RegexNode? curNode = root; int curChild = 0; while (true) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTree.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTree.cs index 2394cddeeecf7a..8e7ada11b4926a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTree.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTree.cs @@ -2,31 +2,79 @@ // The .NET Foundation licenses this file to you under the MIT license. using System.Collections; +using System.Diagnostics; +using System.Globalization; namespace System.Text.RegularExpressions { - /// Wrapper for a node tree with additional information attached. + /// + /// Provides the core data describing a parsed tree, along with necessary + /// information about captures in the tree and computed optimizations about its structure. 
+ /// internal sealed class RegexTree { - public readonly RegexNode Root; - public readonly Hashtable Caps; - public readonly int[] CapNumList; - public readonly int CapTop; - public readonly Hashtable CapNames; - public readonly string[] CapsList; + /// The options associated with the regular expression. public readonly RegexOptions Options; - public readonly int MinRequiredLength; + /// The root node of the parsed tree. + public readonly RegexNode Root; + /// The "find" optimizations computed for the regular expression to quickly find the next viable location to start looking for a match. + public readonly RegexFindOptimizations FindOptimizations; + /// The number of captures in the regex. + public readonly int CaptureCount; + /// A list of all the captures' names. + /// + /// For numbered (implicitly or explicitly) captures, these are string representations of the numbers. This may be null if all captures were numbered + /// and dense, e.g. for `(a)(bc)(def)` and `(?<1>a)(?<2>bc)(?<3>def)` this will be null, but it will be non-null for + /// `(?<1>a)(?<2>bc)(?<4>def)` as well as for `(?<2>a)(?<3>bc)(?<4>def)`, as the groups now have a gap in the numbering. + /// + public readonly string[]? CaptureNames; + /// A mapping of capture group name to capture group number. + /// This is null iff CaptureNames is null. + public readonly Hashtable? CaptureNameToNumberMapping; + /// A mapping of capture group number to the associated name slot in CaptureNames. + /// + /// This is non-null if the mapping is sparse. If non-null, each key/value pair entry represents one capture group, where the key is the + /// capture group number and the value is the index into CaptureNames for that capture group. + /// + public readonly Hashtable? CaptureNumberSparseMapping; - internal RegexTree(RegexNode root, Hashtable caps, int[] capNumList, int capTop, Hashtable capNames, string[] capsList, RegexOptions options, int minRequiredLength) + internal RegexTree(RegexNode root, int captureCount, string[]? 
captureNames, Hashtable? captureNameToNumberMapping, Hashtable? captureNumberSparseMapping, RegexOptions options, CultureInfo culture) { +#if DEBUG + // Asserts to both demonstrate and validate the relationships between the various capture data structures. + Debug.Assert(captureNumberSparseMapping is null || captureNames is not null); + Debug.Assert((captureNames is null) == (captureNameToNumberMapping is null)); + Debug.Assert(captureNames is null || captureCount == captureNames.Length); + Debug.Assert(captureNumberSparseMapping is null || captureCount == captureNumberSparseMapping.Count); + Debug.Assert(captureNameToNumberMapping is null || captureCount == captureNameToNumberMapping.Count); + if (captureNames is not null) + { + Debug.Assert(captureNameToNumberMapping is not null); + for (int i = 0; i < captureNames.Length; i++) + { + string captureName = captureNames[i]; + + int? captureNumber = captureNameToNumberMapping[captureName] as int?; + Debug.Assert(captureNumber is not null); + + if (captureNumberSparseMapping is not null) + { + captureNumber = captureNumberSparseMapping[captureNumber] as int?; + Debug.Assert(captureNumber is not null); + } + + Debug.Assert(captureNumber == i); + } + } +#endif + Root = root; - Caps = caps; - CapNumList = capNumList; - CapTop = capTop; - CapNames = capNames; - CapsList = capsList; + CaptureNumberSparseMapping = captureNumberSparseMapping; + CaptureCount = captureCount; + CaptureNameToNumberMapping = captureNameToNumberMapping; + CaptureNames = captureNames; Options = options; - MinRequiredLength = minRequiredLength; + FindOptimizations = new RegexFindOptimizations(root, options, culture); } } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTreeAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTreeAnalyzer.cs index 990456b3aff89c..4a7db7591490fc 100644 --- 
a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTreeAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTreeAnalyzer.cs @@ -9,11 +9,11 @@ namespace System.Text.RegularExpressions /// Analyzes a of s to produce data on the tree structure, in particular in support of code generation. internal static class RegexTreeAnalyzer { - /// Analyzes a to learn about the structure of the tree. - public static AnalysisResults Analyze(RegexCode code) + /// Analyzes a RegexTree to learn about the structure of the tree. + public static AnalysisResults Analyze(RegexTree regexTree) { - var results = new AnalysisResults(code); - results._complete = TryAnalyze(code.Tree.Root, results, isAtomicByAncestor: true); + var results = new AnalysisResults(regexTree); + results._complete = TryAnalyze(regexTree.Root, results, isAtomicByAncestor: true); return results; static bool TryAnalyze(RegexNode node, AnalysisResults results, bool isAtomicByAncestor) @@ -23,6 +23,9 @@ static bool TryAnalyze(RegexNode node, AnalysisResults results, bool isAtomicByA return false; } + // Track whether we've seen any node with IgnoreCase set. + results._hasIgnoreCase |= (node.Options & RegexOptions.IgnoreCase) != 0; + if (isAtomicByAncestor) { // We've been told by our parent that we should be considered atomic, so add ourselves @@ -144,13 +147,15 @@ internal sealed class AnalysisResults internal readonly HashSet _containsCapture = new(); // the root is a capture, so this will always contain at least the root node /// Set of nodes that directly or indirectly contain backtracking constructs that aren't hidden internaly by atomic constructs. internal HashSet? _mayBacktrack; + /// Whether any node has RegexOptions.IgnoreCase set. + internal bool _hasIgnoreCase; /// Initializes the instance. - /// The code being analyzed. - internal AnalysisResults(RegexCode code) => Code = code; + /// The tree being analyzed. 
+ internal AnalysisResults(RegexTree regexTree) => RegexTree = regexTree; /// Gets the code that was analyzed. - public RegexCode Code { get; } + public RegexTree RegexTree { get; } /// Gets whether a node is considered atomic based on its ancestry. public bool IsAtomicByAncestor(RegexNode node) => _isAtomicByAncestor.Contains(node); @@ -168,5 +173,8 @@ internal sealed class AnalysisResults /// true for any node that requires backtracking. /// public bool MayBacktrack(RegexNode node) => !_complete || (_mayBacktrack?.Contains(node) ?? false); + + /// Gets whether any analyzed node has RegexOptions.IgnoreCase set; this is false when the analysis did not complete. + public bool HasIgnoreCase => _complete && _hasIgnoreCase; } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs index 9fc977fce9587d..4dcbcbc33b9407 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs @@ -1,7 +1,6 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -using System.Collections; using System.Collections.Generic; using System.Diagnostics; using System.Globalization; @@ -21,10 +20,11 @@ internal ref struct RegexWriter private const int EmittedSize = 64; private const int IntStackSize = 32; + private readonly RegexTree _tree; + private readonly CultureInfo _culture; private readonly Dictionary _stringTable; private ValueListBuilder _emitted; private ValueListBuilder _intStack; - private Hashtable? 
_caps; private int _trackCount; #if DEBUG @@ -35,66 +35,50 @@ static RegexWriter() } #endif - private RegexWriter(Span emittedSpan, Span intStackSpan) + private RegexWriter(RegexTree tree, CultureInfo culture, Span emittedSpan, Span intStackSpan) { + _tree = tree; + _culture = culture; _emitted = new ValueListBuilder(emittedSpan); _intStack = new ValueListBuilder(intStackSpan); _stringTable = new Dictionary(); - _caps = null; _trackCount = 0; } /// - /// This is the only function that should be called from outside. - /// It takes a and creates a corresponding . + /// Return rented buffers. /// - public static RegexCode Write(RegexTree tree, CultureInfo culture) + public void Dispose() { - using var writer = new RegexWriter(stackalloc int[EmittedSize], stackalloc int[IntStackSize]); - return writer.RegexCodeFromRegexTree(tree, culture); + _emitted.Dispose(); + _intStack.Dispose(); } /// - /// Return rented buffers. + /// This is the only function that should be called from outside. + /// It takes a RegexTree and creates a corresponding RegexInterpreterCode. /// - public void Dispose() + public static RegexInterpreterCode Write(RegexTree tree, CultureInfo culture) { - _emitted.Dispose(); - _intStack.Dispose(); + using var writer = new RegexWriter(tree, culture, stackalloc int[EmittedSize], stackalloc int[IntStackSize]); + return writer.EmitCode(); } /// - /// The top level RegexCode generator. It does a depth-first walk + /// The top level RegexInterpreterCode generator. It does a depth-first walk /// through the tree and calls EmitFragment to emit code before /// and after each child of an interior node and at each leaf. /// It also computes various information about the tree, such as /// prefix data to help with optimizations. /// - public RegexCode RegexCodeFromRegexTree(RegexTree tree, CultureInfo culture) + private RegexInterpreterCode EmitCode() { - // Construct sparse capnum mapping if some numbers are unused. 
- int capsize; - if (tree.CapNumList == null || tree.CapTop == tree.CapNumList.Length) - { - capsize = tree.CapTop; - _caps = null; - } - else - { - capsize = tree.CapNumList.Length; - _caps = tree.Caps; - for (int i = 0; i < tree.CapNumList.Length; i++) - { - _caps[tree.CapNumList[i]] = i; - } - } - // Every written code begins with a lazy branch. This will be back-patched // to point to the ending Stop after the whole expression has been written. Emit(RegexOpcode.Lazybranch, 0); // Emit every node. - RegexNode curNode = tree.Root; + RegexNode curNode = _tree.Root; int curChild = 0; while (true) { @@ -138,7 +122,7 @@ public RegexCode RegexCodeFromRegexTree(RegexTree tree, CultureInfo culture) } // Return all that in a RegexCode object. - return new RegexCode(tree, culture, emitted, strings, _trackCount, _caps, capsize); + return new RegexInterpreterCode(_tree.FindOptimizations, _tree.Options, emitted, strings, _trackCount); } /// @@ -157,7 +141,7 @@ private void PatchJump(int offset, int jumpDest) /// private void Emit(RegexOpcode op) { - if (RegexCode.OpcodeBacktracks(op)) + if (RegexInterpreterCode.OpcodeBacktracks(op)) { _trackCount++; } @@ -168,7 +152,7 @@ private void Emit(RegexOpcode op) /// Emits a one-argument operation. private void Emit(RegexOpcode op, int opd1) { - if (RegexCode.OpcodeBacktracks(op)) + if (RegexInterpreterCode.OpcodeBacktracks(op)) { _trackCount++; } @@ -180,7 +164,7 @@ private void Emit(RegexOpcode op, int opd1) /// Emits a two-argument operation. 
private void Emit(RegexOpcode op, int opd1, int opd2) { - if (RegexCode.OpcodeBacktracks(op)) + if (RegexInterpreterCode.OpcodeBacktracks(op)) { _trackCount++; } @@ -270,7 +254,7 @@ private void EmitFragment(RegexNodeKind nodeType, RegexNode node, int curIndex) Emit(RegexOpcode.Setjump); _intStack.Append(_emitted.Length); Emit(RegexOpcode.Lazybranch, 0); - Emit(RegexOpcode.TestBackreference, RegexParser.MapCaptureNumber(node.M, _caps)); + Emit(RegexOpcode.TestBackreference, RegexParser.MapCaptureNumber(node.M, _tree.CaptureNumberSparseMapping)); Emit(RegexOpcode.Forejump); break; } @@ -368,7 +352,7 @@ private void EmitFragment(RegexNodeKind nodeType, RegexNode node, int curIndex) break; case RegexNodeKind.Capture | AfterChild: - Emit(RegexOpcode.Capturemark, RegexParser.MapCaptureNumber(node.M, _caps), RegexParser.MapCaptureNumber(node.N, _caps)); + Emit(RegexOpcode.Capturemark, RegexParser.MapCaptureNumber(node.M, _tree.CaptureNumberSparseMapping), RegexParser.MapCaptureNumber(node.N, _tree.CaptureNumberSparseMapping)); break; case RegexNodeKind.PositiveLookaround | BeforeChild: @@ -448,7 +432,7 @@ private void EmitFragment(RegexNodeKind nodeType, RegexNode node, int curIndex) break; case RegexNodeKind.Backreference: - Emit((RegexOpcode)node.Kind | bits, RegexParser.MapCaptureNumber(node.M, _caps)); + Emit((RegexOpcode)node.Kind | bits, RegexParser.MapCaptureNumber(node.M, _tree.CaptureNumberSparseMapping)); break; case RegexNodeKind.Nothing: diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs index 6d2cd1755601d1..5a067ba7ff5356 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeConverter.cs @@ -17,7 +17,7 
@@ internal sealed class RegexNodeConverter /// The culture to use for IgnoreCase comparisons. private readonly CultureInfo _culture; /// Capture information. - private readonly Hashtable? _caps; + private readonly Hashtable? _captureSparseMapping; /// The builder to use to create the nodes. internal readonly SymbolicRegexBuilder _builder; @@ -26,10 +26,10 @@ internal sealed class RegexNodeConverter private Dictionary<(bool IgnoreCase, string Set), BDD>? _setBddCache; /// Constructs a regex to symbolic finite automata converter - public RegexNodeConverter(CultureInfo culture, Hashtable? caps) + public RegexNodeConverter(CultureInfo culture, Hashtable? captureSparseMapping) { _culture = culture; - _caps = caps; + _captureSparseMapping = captureSparseMapping; _builder = new SymbolicRegexBuilder(CharSetSolver.Instance); } @@ -133,11 +133,7 @@ public SymbolicRegexNode ConvertToSymbolicRegexNode(RegexNode node, bool tr // Other constructs case RegexNodeKind.Capture when node.N == -1: // N == -1 because balancing groups aren't supported - int captureNum; - if (_caps == null || !_caps.TryGetValue(node.M, out captureNum)) - { - captureNum = node.M; - } + int captureNum = RegexParser.MapCaptureNumber(node.M, _captureSparseMapping); return _builder.CreateCapture(ConvertToSymbolicRegexNode(node.Child(0), tryCreateFixedLengthMarker), captureNum); case RegexNodeKind.Empty: diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 905efcdc3eaab3..eb1edb977d80b8 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -147,7 +147,7 @@ private TSetType GetMinterm(int c) } /// Constructs matcher for given 
symbolic regex. - internal SymbolicRegexMatcher(SymbolicRegexNode sr, RegexCode code, BDD[] minterms, TimeSpan matchTimeout) + internal SymbolicRegexMatcher(SymbolicRegexNode sr, RegexTree regexTree, BDD[] minterms, TimeSpan matchTimeout) { Debug.Assert(sr._builder._solver is BV64Algebra or BVAlgebra or CharSetSolver, $"Unsupported algebra: {sr._builder._solver}"); @@ -161,17 +161,17 @@ internal SymbolicRegexMatcher(SymbolicRegexNode sr, RegexCode code, BD BVAlgebra bv => bv._classifier, _ => new MintermClassifier((CharSetSolver)(object)_builder._solver, minterms), }; - _capsize = code.CapSize; + _capsize = regexTree.CaptureCount; - if (code.Tree.MinRequiredLength == code.FindOptimizations.MaxPossibleLength) + if (regexTree.FindOptimizations.MinRequiredLength == regexTree.FindOptimizations.MaxPossibleLength) { - _fixedMatchLength = code.Tree.MinRequiredLength; + _fixedMatchLength = regexTree.FindOptimizations.MinRequiredLength; } - if (code.FindOptimizations.FindMode != FindNextStartingPositionMode.NoSearch && - code.FindOptimizations.LeadingAnchor == 0) // If there are any anchors, we're better off letting the DFA quickly do its job of determining whether there's a match. + if (regexTree.FindOptimizations.FindMode != FindNextStartingPositionMode.NoSearch && + regexTree.FindOptimizations.LeadingAnchor == 0) // If there are any anchors, we're better off letting the DFA quickly do its job of determining whether there's a match. { - _findOpts = code.FindOptimizations; + _findOpts = regexTree.FindOptimizations; } // Determine the number of initial states. 
If there's no anchor, only the default previous diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs index c18b9433110220..8d152f92275940 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexRunnerFactory.cs @@ -13,7 +13,7 @@ internal sealed class SymbolicRegexRunnerFactory : RegexRunnerFactory internal readonly SymbolicRegexMatcher _matcher; /// Initializes the factory. - public SymbolicRegexRunnerFactory(RegexCode code, RegexOptions options, TimeSpan matchTimeout, CultureInfo culture) + public SymbolicRegexRunnerFactory(RegexTree regexTree, RegexOptions options, TimeSpan matchTimeout, CultureInfo culture) { // RightToLeft and ECMAScript are currently not supported in conjunction with NonBacktracking. if ((options & (RegexOptions.RightToLeft | RegexOptions.ECMAScript)) != 0) @@ -23,9 +23,9 @@ public SymbolicRegexRunnerFactory(RegexCode code, RegexOptions options, TimeSpan (options & RegexOptions.RightToLeft) != 0 ? 
nameof(RegexOptions.RightToLeft) : nameof(RegexOptions.ECMAScript))); } - var converter = new RegexNodeConverter(culture, code.Caps); + var converter = new RegexNodeConverter(culture, regexTree.CaptureNumberSparseMapping); CharSetSolver solver = CharSetSolver.Instance; - SymbolicRegexNode root = converter.ConvertToSymbolicRegexNode(code.Tree.Root, tryCreateFixedLengthMarker: true); + SymbolicRegexNode root = converter.ConvertToSymbolicRegexNode(regexTree.Root, tryCreateFixedLengthMarker: true); BDD[] minterms = root.ComputeMinterms(); if (minterms.Length > 64) @@ -42,7 +42,7 @@ public SymbolicRegexRunnerFactory(RegexCode code, RegexOptions options, TimeSpan // Convert the BDD-based AST to BV-based AST SymbolicRegexNode rootBV = converter._builder.Transform(root, builderBV, bdd => builderBV._solver.ConvertFromCharSet(solver, bdd)); - _matcher = new SymbolicRegexMatcher(rootBV, code, minterms, matchTimeout); + _matcher = new SymbolicRegexMatcher(rootBV, regexTree, minterms, matchTimeout); } else { @@ -58,7 +58,7 @@ public SymbolicRegexRunnerFactory(RegexCode code, RegexOptions options, TimeSpan // Convert the BDD-based AST to ulong-based AST SymbolicRegexNode root64 = converter._builder.Transform(root, builder64, bdd => builder64._solver.ConvertFromCharSet(solver, bdd)); - _matcher = new SymbolicRegexMatcher(root64, code, minterms, matchTimeout); + _matcher = new SymbolicRegexMatcher(root64, regexTree, minterms, matchTimeout); } } diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj index 38b247a35c2633..35707651e3b74e 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj @@ -32,7 +32,6 @@ - diff --git 
a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs index f36bac05061f76..eb4a4ffa74d967 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs @@ -130,8 +130,8 @@ public void LiteralAfterLoop(string pattern, RegexOptions options, int expectedM private static RegexFindOptimizations ComputeOptimizations(string pattern, RegexOptions options) { - RegexCode code = RegexWriter.Write(RegexParser.Parse(pattern, options, CultureInfo.InvariantCulture), CultureInfo.InvariantCulture); - return new RegexFindOptimizations(code.Tree, CultureInfo.InvariantCulture); + RegexTree tree = RegexParser.Parse(pattern, options, CultureInfo.InvariantCulture); + return new RegexFindOptimizations(tree.Root, options, CultureInfo.InvariantCulture); } } } diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexReductionTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs similarity index 83% rename from src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexReductionTests.cs rename to src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs index 69a2fc5c33accb..08930bdfd2bcc4 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexReductionTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs @@ -1,100 +1,13 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
-using System.Reflection; +using System.Globalization; using Xunit; namespace System.Text.RegularExpressions.Tests { - [SkipOnTargetFramework(TargetFrameworkMonikers.NetFramework, "Many of these optimizations don't exist in .NET Framework.")] - [ConditionalClass(typeof(PlatformDetection), nameof(PlatformDetection.IsNotBuiltWithAggressiveTrimming))] public class RegexReductionTests { - // These tests depend on using reflection to access internals of Regex in order to validate - // if, when, and how various optimizations are being employed. As implementation details - // change, these tests will need to be updated as well. Note, too, that Compiled Regexes - // null out the _code field being accessed here, so this mechanism won't work to validate - // Compiled, which also means it won't work to validate optimizations only enabled - // when using Compiled, such as auto-atomicity for the last node in a regex. - - private static readonly FieldInfo s_regexCode; - private static readonly FieldInfo s_regexCodeCodes; - private static readonly FieldInfo s_regexCodeTree; - private static readonly FieldInfo s_regexCodeFindOptimizations; - private static readonly PropertyInfo s_regexCodeFindOptimizationsMaxPossibleLength; - private static readonly FieldInfo s_regexCodeTreeMinRequiredLength; - - static RegexReductionTests() - { - if (PlatformDetection.IsNetFramework || PlatformDetection.IsBuiltWithAggressiveTrimming) - { - // These members may not exist or may have been trimmed away, and the tests won't run. 
- return; - } - - s_regexCode = typeof(Regex).GetField("_code", BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Instance); - Assert.NotNull(s_regexCode); - - s_regexCodeFindOptimizations = s_regexCode.FieldType.GetField("FindOptimizations", BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Instance); - Assert.NotNull(s_regexCodeFindOptimizations); - - s_regexCodeFindOptimizationsMaxPossibleLength = s_regexCodeFindOptimizations.FieldType.GetProperty("MaxPossibleLength", BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Instance); - Assert.NotNull(s_regexCodeFindOptimizationsMaxPossibleLength); - - s_regexCodeCodes = s_regexCode.FieldType.GetField("Codes", BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Instance); - Assert.NotNull(s_regexCodeCodes); - - s_regexCodeTree = s_regexCode.FieldType.GetField("Tree", BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Instance); - Assert.NotNull(s_regexCodeTree); - - s_regexCodeTreeMinRequiredLength = s_regexCodeTree.FieldType.GetField("MinRequiredLength", BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Instance); - Assert.NotNull(s_regexCodeTreeMinRequiredLength); - } - - private static string GetRegexCodes(Regex r) - { - object code = s_regexCode.GetValue(r); - Assert.NotNull(code); - string result = code.ToString(); - - // In release builds, the above ToString won't be informative. - // Also include the numerical codes, which are not as comprehensive - // but which exist in release builds as well. 
- int[] codes = s_regexCodeCodes.GetValue(code) as int[]; - Assert.NotNull(codes); - result += Environment.NewLine + string.Join(", ", codes); - - return result; - } - - private static int GetMinRequiredLength(Regex r) - { - object code = s_regexCode.GetValue(r); - Assert.NotNull(code); - - object tree = s_regexCodeTree.GetValue(code); - Assert.NotNull(tree); - - object minRequiredLength = s_regexCodeTreeMinRequiredLength.GetValue(tree); - Assert.IsType(minRequiredLength); - - return (int)minRequiredLength; - } - - private static int? GetMaxPossibleLength(Regex r) - { - object code = s_regexCode.GetValue(r); - Assert.NotNull(code); - - object findOpts = s_regexCodeFindOptimizations.GetValue(code); - Assert.NotNull(findOpts); - - object maxPossibleLength = s_regexCodeFindOptimizationsMaxPossibleLength.GetValue(findOpts); - Assert.True(maxPossibleLength is null || maxPossibleLength is int); - - return (int?)maxPossibleLength; - } - [Theory] // Two greedy one loops [InlineData("a*a*", "a*")] @@ -390,7 +303,7 @@ private static int GetMinRequiredLength(Regex r) [InlineData("abcd|abef", "ab(?>cd|ef)")] [InlineData("abcd|aefg", "a(?>bcd|efg)")] [InlineData("abcd|abc|ab|a", "a(?>bcd|bc|b|)")] - [InlineData("abcde|abcdef", "abcde(?>|f)")] + // [InlineData("abcde|abcdef", "abcde(?>|f)")] // TODO https://github.com/dotnet/runtime/issues/66031: Need to reorganize optimizations to avoid an extra Empty being left at the end of the tree [InlineData("abcdef|abcde", "abcde(?>f|)")] [InlineData("abcdef|abcdeg|abcdeh|abcdei|abcdej|abcdek|abcdel", "abcde[f-l]")] [InlineData("(ab|ab*)bc", "(a(?:b|b*))bc")] @@ -441,7 +354,7 @@ private static int GetMinRequiredLength(Regex r) [InlineData("[ab]*[^a]*", "[ab]*(?>[^a]*)")] [InlineData("[aa]*[^a]*", "(?>a*)(?>[^a]*)")] [InlineData("a??", "")] - [InlineData("(abc*?)", "(ab)")] + //[InlineData("(abc*?)", "(ab)")] // TODO https://github.com/dotnet/runtime/issues/66031: Need to reorganize optimizations to avoid an extra Empty being left at the 
end of the tree [InlineData("a{1,3}?", "a{1,4}?")] [InlineData("a{2,3}?", "a{2}")] [InlineData("bc(a){1,3}?", "bc(a){1,2}?")] @@ -474,13 +387,15 @@ private static int GetMinRequiredLength(Regex r) [InlineData("(?i)\\d", "\\d")] [InlineData("(?i).", ".")] [InlineData("(?i)\\$", "\\$")] - public void PatternsReduceIdentically(string pattern1, string pattern2) + public void PatternsReduceIdentically(string actual, string expected) { - string result1 = GetRegexCodes(new Regex(pattern1)); - string result2 = GetRegexCodes(new Regex(pattern2)); - if (result1 != result2) + // NOTE: RegexNode.ToString is only compiled into debug builds, so DEBUG is currently set on the unit tests project. + + string actualStr = RegexParser.Parse(actual, RegexOptions.None, CultureInfo.InvariantCulture).Root.ToString(); + string expectedStr = RegexParser.Parse(expected, RegexOptions.None, CultureInfo.InvariantCulture).Root.ToString(); + if (actualStr != expectedStr) { - throw new Xunit.Sdk.EqualException(result2, result1); + throw new Xunit.Sdk.EqualException(actualStr, expectedStr); } } @@ -554,13 +469,15 @@ public void PatternsReduceIdentically(string pattern1, string pattern2) [InlineData("a*(?(xyz)acd|efg)", "(?>a*)(?(xyz)acd|efg)")] [InlineData("a*(?(xyz)bcd|afg)", "(?>a*)(?(xyz)bcd|afg)")] [InlineData("a*(?(xyz)bcd)", "(?>a*)(?(xyz)bcd)")] - public void PatternsReduceDifferently(string pattern1, string pattern2) + public void PatternsReduceDifferently(string actual, string expected) { - string result1 = GetRegexCodes(new Regex(pattern1)); - string result2 = GetRegexCodes(new Regex(pattern2)); - if (result1 == result2) + // NOTE: RegexNode.ToString is only compiled into debug builds, so DEBUG is currently set on the unit tests project. 
+ + string actualStr = RegexParser.Parse(actual, RegexOptions.None, CultureInfo.InvariantCulture).Root.ToString(); + string expectedStr = RegexParser.Parse(expected, RegexOptions.None, CultureInfo.InvariantCulture).Root.ToString(); + if (actualStr == expectedStr) { - throw new Xunit.Sdk.EqualException(result2, result1); + throw new Xunit.Sdk.NotEqualException(actualStr, expectedStr); } } @@ -632,29 +549,33 @@ public void PatternsReduceDifferently(string pattern1, string pattern2) [InlineData(@"abcdef", RegexOptions.RightToLeft, 6, null)] public void MinMaxLengthIsCorrect(string pattern, RegexOptions options, int expectedMin, int? expectedMax) { - var r = new Regex(pattern, options); - Assert.Equal(expectedMin, GetMinRequiredLength(r)); + RegexTree tree = RegexParser.Parse(pattern, options, CultureInfo.InvariantCulture); + + Assert.Equal(expectedMin, tree.FindOptimizations.MinRequiredLength); + if (!pattern.EndsWith("$", StringComparison.Ordinal) && !pattern.EndsWith(@"\Z", StringComparison.OrdinalIgnoreCase)) { // MaxPossibleLength is currently only computed/stored if there's a trailing End{Z} anchor as the max length is otherwise unused - r = new Regex($"(?:{pattern})$", options); + tree = RegexParser.Parse($"(?:{pattern})$", options, CultureInfo.InvariantCulture); } - Assert.Equal(expectedMax, GetMaxPossibleLength(r)); + + Assert.Equal(expectedMax, tree.FindOptimizations.MaxPossibleLength); } [Fact] public void MinMaxLengthIsCorrect_HugeDepth() { const int Depth = 10_000; - var r = new Regex($"{new string('(', Depth)}a{new string(')', Depth)}$"); // too deep for analysis on some platform default stack sizes + RegexTree tree = RegexParser.Parse($"{new string('(', Depth)}a{new string(')', Depth)}$", RegexOptions.None, CultureInfo.InvariantCulture); // too deep for analysis on some platform default stack sizes + + int minRequiredLength = tree.FindOptimizations.MinRequiredLength; - int minRequiredLength = GetMinRequiredLength(r); Assert.True( minRequiredLength == 1 
/* successfully analyzed */ || minRequiredLength == 0 /* ran out of stack space to complete analysis */, $"Expected 1 or 0, got {minRequiredLength}"); - int? maxPossibleLength = GetMaxPossibleLength(r); + int? maxPossibleLength = tree.FindOptimizations.MaxPossibleLength; Assert.True( maxPossibleLength == 1 /* successfully analyzed */ || maxPossibleLength is null /* ran out of stack space to complete analysis */, $"Expected 1 or null, got {maxPossibleLength}"); diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexTreeAnalyzerTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexTreeAnalyzerTests.cs index ce6f323d0c1fb2..8a1f35118345b2 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexTreeAnalyzerTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexTreeAnalyzerTests.cs @@ -12,18 +12,18 @@ public class RegexTreeAnalyzerTests [Fact] public void SimpleString() { - (RegexCode code, AnalysisResults analysis) = Analyze("abc"); + (RegexTree tree, AnalysisResults analysis) = Analyze("abc"); - RegexNode rootCapture = AssertNode(analysis, code.Tree.Root, RegexNodeKind.Capture, atomicByAncestor: true, backtracks: false, captures: true); + RegexNode rootCapture = AssertNode(analysis, tree.Root, RegexNodeKind.Capture, atomicByAncestor: true, backtracks: false, captures: true); RegexNode abc = AssertNode(analysis, rootCapture.Child(0), RegexNodeKind.Multi, atomicByAncestor: true, backtracks: false, captures: false); } [Fact] public void AlternationWithCaptures() { - (RegexCode code, AnalysisResults analysis) = Analyze("abc|d(e)f|(ghi)"); + (RegexTree tree, AnalysisResults analysis) = Analyze("abc|d(e)f|(ghi)"); - RegexNode rootCapture = AssertNode(analysis, code.Tree.Root, RegexNodeKind.Capture, atomicByAncestor: true, backtracks: false, captures: true); + RegexNode rootCapture = AssertNode(analysis, tree.Root, RegexNodeKind.Capture, atomicByAncestor: true, backtracks: false, 
captures: true); RegexNode implicitAtomic = AssertNode(analysis, rootCapture.Child(0), RegexNodeKind.Atomic, atomicByAncestor: true, backtracks: false, captures: true); RegexNode alternation = AssertNode(analysis, implicitAtomic.Child(0), RegexNodeKind.Alternate, atomicByAncestor: true, backtracks: false, captures: true); @@ -43,9 +43,9 @@ public void AlternationWithCaptures() [Fact] public void LoopsReducedWithAutoAtomic() { - (RegexCode code, AnalysisResults analysis) = Analyze("a*(b*)c*"); + (RegexTree tree, AnalysisResults analysis) = Analyze("a*(b*)c*"); - RegexNode rootCapture = AssertNode(analysis, code.Tree.Root, RegexNodeKind.Capture, atomicByAncestor: true, backtracks: false, captures: true); + RegexNode rootCapture = AssertNode(analysis, tree.Root, RegexNodeKind.Capture, atomicByAncestor: true, backtracks: false, captures: true); RegexNode concat = AssertNode(analysis, rootCapture.Child(0), RegexNodeKind.Concatenate, atomicByAncestor: true, backtracks: false, captures: true); RegexNode aStar = AssertNode(analysis, concat.Child(0), RegexNodeKind.Oneloopatomic, atomicByAncestor: false, backtracks: false, captures: false); @@ -59,9 +59,9 @@ public void LoopsReducedWithAutoAtomic() [Fact] public void AtomicGroupAroundBacktracking() { - (RegexCode code, AnalysisResults analysis) = Analyze("[ab]*(?>[bc]*[cd])[ef]"); + (RegexTree tree, AnalysisResults analysis) = Analyze("[ab]*(?>[bc]*[cd])[ef]"); - RegexNode rootCapture = AssertNode(analysis, code.Tree.Root, RegexNodeKind.Capture, atomicByAncestor: true, backtracks: true, captures: true); + RegexNode rootCapture = AssertNode(analysis, tree.Root, RegexNodeKind.Capture, atomicByAncestor: true, backtracks: true, captures: true); RegexNode rootConcat = AssertNode(analysis, rootCapture.Child(0), RegexNodeKind.Concatenate, atomicByAncestor: true, backtracks: true, captures: false); RegexNode abStar = AssertNode(analysis, rootConcat.Child(0), RegexNodeKind.Setloop, atomicByAncestor: false, backtracks: true, captures: 
false); @@ -76,10 +76,10 @@ public void AtomicGroupAroundBacktracking() RegexNode cd = AssertNode(analysis, atomicConcat.Child(1), RegexNodeKind.Set, atomicByAncestor: true, backtracks: false, captures: false); } - private static (RegexCode Code, AnalysisResults Analysis) Analyze(string pattern) + private static (RegexTree Tree, AnalysisResults Analysis) Analyze(string pattern) { - RegexCode code = RegexWriter.Write(RegexParser.Parse(pattern, RegexOptions.None, CultureInfo.InvariantCulture), CultureInfo.InvariantCulture); - return (code, RegexTreeAnalyzer.Analyze(code)); + RegexTree tree = RegexParser.Parse(pattern, RegexOptions.None, CultureInfo.InvariantCulture); + return (tree, RegexTreeAnalyzer.Analyze(tree)); } private static RegexNode AssertNode(AnalysisResults analysis, RegexNode node, RegexNodeKind kind, bool atomicByAncestor, bool backtracks, bool captures) diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/System.Text.RegularExpressions.Unit.Tests.csproj b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/System.Text.RegularExpressions.Unit.Tests.csproj index a9a79b20404d55..a8b4ea21ee1e0a 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/System.Text.RegularExpressions.Unit.Tests.csproj +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/System.Text.RegularExpressions.Unit.Tests.csproj @@ -8,11 +8,14 @@ true true true + $(DefineConstants);DEBUG + + @@ -23,7 +26,7 @@ - +