From 5c982efd41aa7c01735277553ae87e8f0aad77dd Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Sun, 28 Aug 2022 13:52:51 -0400 Subject: [PATCH 1/3] Stop coalescing some adjacent Regex atomic loops We walk concatenations in order to combine adjacent loops, e.g. `a+a+a+` becomes `a{3,}`. We also combine loops with individual items that are compatible, e.g. `a+ab` becomes `a{2,}b`. However, we're doing these operations on atomic loops as well, which is sometimes wrong. Since an atomic loop consumes as much as possible and never gives anything back, combining it with a subsequent loop will end up essentially ignoring any minimum specified in the latter loop. We thus can't combine atomic loops if the second loop has a minimum; this includes the case where the second "loop" is just an individual item. --- .../Text/RegularExpressions/RegexNode.cs | 37 ++++++--- .../FunctionalTests/Regex.Match.Tests.cs | 9 +++ .../tests/UnitTests/RegexReductionTests.cs | 81 ++++++++++--------- 3 files changed, 75 insertions(+), 52 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index ee8e487dd96ad9..89ac47b5705154 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -1606,22 +1606,33 @@ static bool CanCombineCounts(int nodeMin, int nodeMax, int nextMin, int nextMax) // Coalescing a loop with its same type case RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy when nextNode.Kind == currentNode.Kind && currentNode.Ch == nextNode.Ch: case RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy when nextNode.Kind == currentNode.Kind && currentNode.Str == nextNode.Str: - if (CanCombineCounts(currentNode.M, currentNode.N, nextNode.M, nextNode.N)) + if (nextNode.M > 0 && + currentNode.Kind is RegexNodeKind.Oneloopatomic or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Setloopatomic) { - currentNode.M += nextNode.M; - if (currentNode.N != int.MaxValue) - { - currentNode.N = nextNode.N == int.MaxValue ? int.MaxValue : currentNode.N + nextNode.N; - } - next++; - continue; + // Atomic loops can only be combined if the second loop has no lower bound, as if it has a lower bound, + // combining them changes behavior. Uncombined, the first loop can consume all matching items; + // the second loop might then not be able to meet its minimum and fail. But if they're combined, the combined + // minimum of the sole loop could now be met, introducing matches where there shouldn't have been any. + break; } - break; + + if (!CanCombineCounts(currentNode.M, currentNode.N, nextNode.M, nextNode.N)) + { + break; + } + + currentNode.M += nextNode.M; + if (currentNode.N != int.MaxValue) + { + currentNode.N = nextNode.N == int.MaxValue ? int.MaxValue : currentNode.N + nextNode.N; + } + next++; + continue; // Coalescing a loop with an additional item of the same type - case RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy when nextNode.Kind == RegexNodeKind.One && currentNode.Ch == nextNode.Ch: - case RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy when nextNode.Kind == RegexNodeKind.Notone && currentNode.Ch == nextNode.Ch: - case RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy when nextNode.Kind == RegexNodeKind.Set && currentNode.Str == nextNode.Str: + case RegexNodeKind.Oneloop or RegexNodeKind.Onelazy when nextNode.Kind == RegexNodeKind.One && currentNode.Ch == nextNode.Ch: + case RegexNodeKind.Notoneloop or RegexNodeKind.Notonelazy when nextNode.Kind == RegexNodeKind.Notone && currentNode.Ch == nextNode.Ch: + case RegexNodeKind.Setloop or RegexNodeKind.Setlazy when nextNode.Kind == RegexNodeKind.Set && currentNode.Str == nextNode.Str: if (CanCombineCounts(currentNode.M, currentNode.N, 1, 1)) { currentNode.M++; @@ -1635,7 +1646,7 @@ static bool CanCombineCounts(int nodeMin, int nodeMax, int nextMin, int nextMax) break; // Coalescing a loop with a subsequent string - case RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy when nextNode.Kind == RegexNodeKind.Multi && currentNode.Ch == nextNode.Str![0]: + case RegexNodeKind.Oneloop or RegexNodeKind.Onelazy when nextNode.Kind == RegexNodeKind.Multi && currentNode.Ch == nextNode.Str![0]: { // Determine how many of the multi's characters can be combined. // We already checked for the first, so we know it's at least one. diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index 02182bd87ede6e..9f73f9f1c30e88 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -149,6 +149,15 @@ public static IEnumerable Match_MemberData() yield return (Case("(?>[^z]+)z"), "zzzzxyxyxyz123", options, 4, 9, true, "xyxyxyz"); yield return (Case("(?>(?>[^z]+))z"), "zzzzxyxyxyz123", options, 4, 9, true, "xyxyxyz"); yield return (Case("(?>[^z]*)z123"), "zzzzxyxyxyz123", options, 4, 10, true, "xyxyxyz123"); + yield return (Case("(?>a*)a"), "aaa", options, 0, 3, false, ""); + yield return (Case("(?>a*)a+"), "aaa", options, 0, 3, false, ""); + yield return (Case("(?>a+)a+"), "aaa", options, 0, 3, false, ""); + yield return (Case("(?>.*)."), "aaa", options, 0, 3, false, ""); + yield return (Case("(?>.*).+"), "aaa", options, 0, 3, false, ""); + yield return (Case("(?>.+).+"), "aaa", options, 0, 3, false, ""); + yield return (Case("(?>\\w*)\\w"), "aaa", options, 0, 3, false, ""); + yield return (Case("(?>\\w*)\\w+"), "aaa", options, 0, 3, false, ""); + yield return (Case("(?>\\w+)\\w+"), "aaa", options, 0, 3, false, ""); yield return (Case("(?>[^12]+)1"), "121231", options, 0, 6, true, "31"); yield return (Case("(?>[^123]+)1"), "12312341", options, 0, 8, true, "41"); diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs index b21b26071bc2e6..e6d9117745ca41 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs @@ -50,26 +50,14 @@ public class RegexReductionTests // Two atomic one loops [InlineData("(?>a*)(?>a*)", "(?>a*)")] [InlineData("(?>a*)(?>(?:a*))", "(?>a*)")] - [InlineData("(?>a*)(?>a+)", "(?>a+)")] [InlineData("(?>a*)(?>a?)", "(?>a*)")] - [InlineData("(?>a*)(?>a{1,3})", "(?>a+)")] [InlineData("(?>a+)(?>a*)", "(?>a+)")] - [InlineData("(?>a+)(?>a+)", "(?>a{2,})")] [InlineData("(?>a+)(?>a?)", "(?>a+)")] - [InlineData("(?>a+)(?>a{1,3})", "(?>a{2,})")] [InlineData("(?>a?)(?>a*)", "(?>a*)")] - [InlineData("(?>a?)(?>a+)", "(?>a+)")] [InlineData("(?>a?)(?>a?)", "(?>a{0,2})")] - [InlineData("(?>a?)(?>a{1,3})", "(?>a{1,4})")] [InlineData("(?>a{1,3})(?>a*)", "(?>a+)")] - [InlineData("(?>a{1,3})(?>a+)", "(?>a{2,})")] [InlineData("(?>a{1,3})(?>a?)", "(?>a{1,4})")] - [InlineData("(?>a{1,3})(?>a{1,3})", "(?>a{2,6})")] - // Atomic one loop and one - [InlineData("(?>a*)a", "(?>a+)")] - [InlineData("(?>a+)a", "(?>a{2,})")] - [InlineData("(?>a?)a", "(?>a{1,2})")] - [InlineData("(?>a{1,3})a", "(?>a{2,4})")] + // One and atomic one loop [InlineData("a(?>a*)", "(?>a+)")] [InlineData("a(?>a+)", "(?>a{2,})")] [InlineData("a(?>a?)", "(?>a{1,2})")] @@ -136,21 +124,13 @@ public class RegexReductionTests [InlineData("[^a]{1,3}?[^a]{1,3}?", "[^a]{2,6}?")] // Two atomic notone loops [InlineData("(?>[^a]*)(?>[^a]*)", "(?>[^a]*)")] - [InlineData("(?>[^a]*)(?>[^a]+)", "(?>[^a]+)")] [InlineData("(?>[^a]*)(?>[^a]?)", "(?>[^a]*)")] - [InlineData("(?>[^a]*)(?>[^a]{1,3})", "(?>[^a]+)")] [InlineData("(?>[^a]+)(?>[^a]*)", "(?>[^a]+)")] - [InlineData("(?>[^a]+)(?>[^a]+)", "(?>[^a]{2,})")] [InlineData("(?>[^a]+)(?>[^a]?)", "(?>[^a]+)")] - [InlineData("(?>[^a]+)(?>[^a]{1,3})", "(?>[^a]{2,})")] [InlineData("(?>[^a]?)(?>[^a]*)", "(?>[^a]*)")] - [InlineData("(?>[^a]?)(?>[^a]+)", "(?>[^a]+)")] [InlineData("(?>[^a]?)(?>[^a]?)", "(?>[^a]{0,2})")] - [InlineData("(?>[^a]?)(?>[^a]{1,3})", "(?>[^a]{1,4})")] [InlineData("(?>[^a]{1,3})(?>[^a]*)", "(?>[^a]+)")] - [InlineData("(?>[^a]{1,3})(?>[^a]+)", "(?>[^a]{2,})")] [InlineData("(?>[^a]{1,3})(?>[^a]?)", "(?>[^a]{1,4})")] - [InlineData("(?>[^a]{1,3})(?>[^a]{1,3})", "(?>[^a]{2,6})")] // Greedy notone loop and notone [InlineData("[^a]*[^a]", "[^a]+")] [InlineData("[^a]+[^a]", "[^a]{2,}")] @@ -169,11 +149,7 @@ public class RegexReductionTests [InlineData("[^a][^a]+?", "[^a]{2,}?")] [InlineData("[^a][^a]??", "[^a]{1,2}?")] [InlineData("[^a][^a]{1,3}?", "[^a]{2,4}?")] - // Atomic notone loop and notone - [InlineData("(?>[^a]*)[^a]", "(?>[^a]+)")] - [InlineData("(?>[^a]+)[^a]", "(?>[^a]{2,})")] - [InlineData("(?>[^a]?)[^a]", "(?>[^a]{1,2})")] - [InlineData("(?>[^a]{1,3})[^a]", "(?>[^a]{2,4})")] + // Notone and atomic notone loop [InlineData("[^a](?>[^a]*)", "(?>[^a]+)")] [InlineData("[^a](?>[^a]+)", "(?>[^a]{2,})")] [InlineData("[^a](?>[^a]?)", "(?>[^a]{1,2})")] @@ -206,11 +182,7 @@ public class RegexReductionTests [InlineData("[0-9][0-9]+", "[0-9]{2,}")] [InlineData("[0-9][0-9]?", "[0-9]{1,2}")] [InlineData("[0-9][0-9]{1,3}", "[0-9]{2,4}")] - // Atomic set loop and set - [InlineData("(?>[0-9]*)[0-9]", "(?>[0-9]+)")] - [InlineData("(?>[0-9]+)[0-9]", "(?>[0-9]{2,})")] - [InlineData("(?>[0-9]?)[0-9]", "(?>[0-9]{1,2})")] - [InlineData("(?>[0-9]{1,3})[0-9]", "(?>[0-9]{2,4})")] + // Set and atomic set loop [InlineData("[0-9](?>[0-9]*)", "(?>[0-9]+)")] [InlineData("[0-9](?>[0-9]+)", "(?>[0-9]{2,})")] [InlineData("[0-9](?>[0-9]?)", "(?>[0-9]{1,2})")] @@ -234,21 +206,13 @@ public class RegexReductionTests [InlineData("[0-9]{1,3}?[0-9]{1,3}?", "[0-9]{2,6}?")] // Two atomic set loops [InlineData("(?>[0-9]*)(?>[0-9]*)", "(?>[0-9]*)")] - [InlineData("(?>[0-9]*)(?>[0-9]+)", "(?>[0-9]+)")] [InlineData("(?>[0-9]*)(?>[0-9]?)", "(?>[0-9]*)")] - [InlineData("(?>[0-9]*)(?>[0-9]{1,3})", "(?>[0-9]+)")] [InlineData("(?>[0-9]+)(?>[0-9]*)", "(?>[0-9]+)")] - [InlineData("(?>[0-9]+)(?>[0-9]+)", "(?>[0-9]{2,})")] [InlineData("(?>[0-9]+)(?>[0-9]?)", "(?>[0-9]+)")] - [InlineData("(?>[0-9]+)(?>[0-9]{1,3})", "(?>[0-9]{2,})")] [InlineData("(?>[0-9]?)(?>[0-9]*)", "(?>[0-9]*)")] - [InlineData("(?>[0-9]?)(?>[0-9]+)", "(?>[0-9]+)")] [InlineData("(?>[0-9]?)(?>[0-9]?)", "(?>[0-9]{0,2})")] - [InlineData("(?>[0-9]?)(?>[0-9]{1,3})", "(?>[0-9]{1,4})")] [InlineData("(?>[0-9]{1,3})(?>[0-9]*)", "(?>[0-9]+)")] - [InlineData("(?>[0-9]{1,3})(?>[0-9]+)", "(?>[0-9]{2,})")] [InlineData("(?>[0-9]{1,3})(?>[0-9]?)", "(?>[0-9]{1,4})")] - [InlineData("(?>[0-9]{1,3})(?>[0-9]{1,3})", "(?>[0-9]{2,6})")] // Lazy set loop and set [InlineData("[0-9]*?[0-9]", "[0-9]+?")] [InlineData("[0-9]+?[0-9]", "[0-9]{2,}?")] @@ -423,6 +387,42 @@ public void PatternsReduceIdentically(string actual, string expected) [InlineData("a*?a*", "a*")] [InlineData("a*[^a]*", "a*")] [InlineData("[^a]*a*", "a*")] + [InlineData("(?>a*)(?>a+)", "(?>a+)")] + [InlineData("(?>a*)(?>a{1,3})", "(?>a+)")] + [InlineData("(?>a+)(?>a+)", "(?>a{2,})")] + [InlineData("(?>a+)(?>a{1,3})", "(?>a{2,})")] + [InlineData("(?>a?)(?>a+)", "(?>a+)")] + [InlineData("(?>a?)(?>a{1,3})", "(?>a{1,4})")] + [InlineData("(?>a{1,3})(?>a+)", "(?>a{2,})")] + [InlineData("(?>a{1,3})(?>a{1,3})", "(?>a{2,6})")] + [InlineData("(?>[^a]*)(?>[^a]+)", "(?>[^a]+)")] + [InlineData("(?>[^a]*)(?>[^a]{1,3})", "(?>[^a]+)")] + [InlineData("(?>[^a]+)(?>[^a]+)", "(?>[^a]{2,})")] + [InlineData("(?>[^a]+)(?>[^a]{1,3})", "(?>[^a]{2,})")] + [InlineData("(?>[^a]?)(?>[^a]+)", "(?>[^a]+)")] + [InlineData("(?>[^a]?)(?>[^a]{1,3})", "(?>[^a]{1,4})")] + [InlineData("(?>[^a]{1,3})(?>[^a]+)", "(?>[^a]{2,})")] + [InlineData("(?>[^a]{1,3})(?>[^a]{1,3})", "(?>[^a]{2,6})")] + [InlineData("(?>[0-9]*)(?>[0-9]+)", "(?>[0-9]+)")] + [InlineData("(?>[0-9]*)(?>[0-9]{1,3})", "(?>[0-9]+)")] + [InlineData("(?>[0-9]+)(?>[0-9]+)", "(?>[0-9]{2,})")] + [InlineData("(?>[0-9]+)(?>[0-9]{1,3})", "(?>[0-9]{2,})")] + [InlineData("(?>[0-9]?)(?>[0-9]+)", "(?>[0-9]+)")] + [InlineData("(?>[0-9]?)(?>[0-9]{1,3})", "(?>[0-9]{1,4})")] + [InlineData("(?>[0-9]{1,3})(?>[0-9]+)", "(?>[0-9]{2,})")] + [InlineData("(?>[0-9]{1,3})(?>[0-9]{1,3})", "(?>[0-9]{2,6})")] + [InlineData("(?>a*)a", "(?>a+)")] + [InlineData("(?>a+)a", "(?>a{2,})")] + [InlineData("(?>a?)a", "(?>a{1,2})")] + [InlineData("(?>a{1,3})a", "(?>a{2,4})")] + [InlineData("(?>[^a]*)[^a]", "(?>[^a]+)")] + [InlineData("(?>[^a]+)[^a]", "(?>[^a]{2,})")] + [InlineData("(?>[^a]?)[^a]", "(?>[^a]{1,2})")] + [InlineData("(?>[^a]{1,3})[^a]", "(?>[^a]{2,4})")] + [InlineData("(?>[0-9]*)[0-9]", "(?>[0-9]+)")] + [InlineData("(?>[0-9]+)[0-9]", "(?>[0-9]{2,})")] + [InlineData("(?>[0-9]?)[0-9]", "(?>[0-9]{1,2})")] + [InlineData("(?>[0-9]{1,3})[0-9]", "(?>[0-9]{2,4})")] [InlineData("a{2147483646}a", "a{2147483647}")] [InlineData("a{2147483647}a", "a{2147483647}")] [InlineData("a{0,2147483646}a", "a{0,2147483647}")] @@ -505,6 +505,9 @@ public void PatternsReduceDifferently(string actual, string expected) [InlineData(@"a??", RegexOptions.None, 0, 1)] [InlineData(@"a+", RegexOptions.None, 1, null)] [InlineData(@"a+?", RegexOptions.None, 1, null)] + [InlineData(@"(?>a*)a", RegexOptions.None, 1, null)] + [InlineData(@"(?>a*)a+", RegexOptions.None, 1, null)] + [InlineData(@"(?>a*)a*", RegexOptions.None, 0, null)] [InlineData(@"a{2}", RegexOptions.None, 2, 2)] [InlineData(@"a{2}?", RegexOptions.None, 2, 2)] [InlineData(@"a{3,17}", RegexOptions.None, 3, 17)] From 3b7f47df53227dbff33ffa1114ed9e710e4353ea Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Sun, 28 Aug 2022 14:24:03 -0400 Subject: [PATCH 2/3] Fix auto-atomicity handling of \w and \b We currently consider \w and \b non-overlapping, which allows a \w loop followed by a \b to be made atomic. The problem with this is that \b is zero-width, and it could be followed by something that does overlap with the \w. When matching at a location that is a word boundary, it is possible the first loop could give up something that matches the subsequent construct, and thus it can't be made atomic. (We could probably restrict this further to still allow atomicity when the first loop has a non-0 lower bound, but it doesn't appear to be worth the complication.) --- .../Text/RegularExpressions/RegexNode.cs | 18 +++++------ .../Regex.MultipleMatches.Tests.cs | 30 +++++++++++++++++++ .../tests/UnitTests/RegexReductionTests.cs | 4 +++ 3 files changed, 42 insertions(+), 10 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 89ac47b5705154..3ed8102ed49d1f 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -2067,15 +2067,15 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i case RegexNodeKind.Multi when node.Ch != subsequent.Str![0]: case RegexNodeKind.End: case RegexNodeKind.EndZ or RegexNodeKind.Eol when node.Ch != '\n': - case RegexNodeKind.Boundary when RegexCharClass.IsBoundaryWordChar(node.Ch): - case RegexNodeKind.NonBoundary when !RegexCharClass.IsBoundaryWordChar(node.Ch): - case RegexNodeKind.ECMABoundary when RegexCharClass.IsECMAWordChar(node.Ch): - case RegexNodeKind.NonECMABoundary when !RegexCharClass.IsECMAWordChar(node.Ch): return true; case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && node.Ch != subsequent.Ch: case RegexNodeKind.Notonelazy or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic when subsequent.M == 0 && node.Ch == subsequent.Ch: case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !RegexCharClass.CharInClass(node.Ch, subsequent.Str!): + case RegexNodeKind.Boundary when RegexCharClass.IsBoundaryWordChar(node.Ch): + case RegexNodeKind.NonBoundary when !RegexCharClass.IsBoundaryWordChar(node.Ch): + case RegexNodeKind.ECMABoundary when RegexCharClass.IsECMAWordChar(node.Ch): + case RegexNodeKind.NonECMABoundary when !RegexCharClass.IsECMAWordChar(node.Ch): // The loop can be made atomic based on this subsequent node, but we'll need to evaluate the next one as well. break; @@ -2114,14 +2114,14 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i case RegexNodeKind.Multi when !RegexCharClass.CharInClass(subsequent.Str![0], node.Str!): case RegexNodeKind.End: case RegexNodeKind.EndZ or RegexNodeKind.Eol when !RegexCharClass.CharInClass('\n', node.Str!): - case RegexNodeKind.Boundary when node.Str is RegexCharClass.WordClass or RegexCharClass.DigitClass: - case RegexNodeKind.NonBoundary when node.Str is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass: - case RegexNodeKind.ECMABoundary when node.Str is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass: - case RegexNodeKind.NonECMABoundary when node.Str is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass: return true; case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && !RegexCharClass.CharInClass(subsequent.Ch, node.Str!): case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !RegexCharClass.MayOverlap(node.Str!, subsequent.Str!): + case RegexNodeKind.Boundary when node.Str is RegexCharClass.WordClass or RegexCharClass.DigitClass: + case RegexNodeKind.NonBoundary when node.Str is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass: + case RegexNodeKind.ECMABoundary when node.Str is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass: + case RegexNodeKind.NonECMABoundary when node.Str is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass: // The loop can be made atomic based on this subsequent node, but we'll need to evaluate the next one as well. break; @@ -2136,8 +2136,6 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i // We only get here if the node could be made atomic based on subsequent but subsequent has a lower bound of zero // and thus we need to move subsequent to be the next node in sequence and loop around to try again. - Debug.Assert(subsequent.Kind is RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy); - Debug.Assert(subsequent.M == 0); if (!iterateNullableSubsequent) { return false; diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.MultipleMatches.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.MultipleMatches.Tests.cs index 744fd174351171..bd0b1eab45ce37 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.MultipleMatches.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.MultipleMatches.Tests.cs @@ -276,6 +276,22 @@ public static IEnumerable Matches_TestData() } }; + yield return new object[] + { + engine, + @"\w*\b\w+", "abc def ghij kl m nop qrstuv", RegexOptions.None, + new[] + { + new CaptureData("abc", 0, 3), + new CaptureData("def", 4, 3), + new CaptureData("ghij", 8, 4), + new CaptureData("kl", 13, 2), + new CaptureData("m", 16, 1), + new CaptureData("nop", 18, 3), + new CaptureData("qrstuv", 22, 6), + } + }; + if (!PlatformDetection.IsNetFramework) { // .NET Framework missing fix in https://github.com/dotnet/runtime/pull/1075 @@ -294,6 +310,20 @@ public static IEnumerable Matches_TestData() if (!RegexHelpers.IsNonBacktracking(engine)) { + yield return new object[] + { + engine, + @"(\b(?!ab|nop)\w*\b)\w+", "abc def ghij kl m nop qrstuv", RegexOptions.None, + new[] + { + new CaptureData("def", 4, 3), + new CaptureData("ghij", 8, 4), + new CaptureData("kl", 13, 2), + new CaptureData("m", 16, 1), + new CaptureData("qrstuv", 22, 6), + } + }; + yield return new object[] { engine, diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs index e6d9117745ca41..75aa24b667d4ef 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs @@ -470,6 +470,10 @@ public void PatternsReduceIdentically(string actual, string expected) [InlineData("(?:ab??){2}", "(?:a(?>b??)){2}")] [InlineData("(?:ab??){2, 3}", "(?:a(?>b??)){2, 3}")] [InlineData("ab??(b)", "a(?>b??)(b)")] + [InlineData(@"\w+\b\w+", @"(?>\w+)\b\w")] + [InlineData(@"\w*\b\w+", @"(?>\w*)\b\w+")] + [InlineData(@"\W+\B\W+", @"(?>\W+)\B\W")] + [InlineData(@"\W*\B\W+", @"(?>\W*)\B\W")] // Loops inside alternation constructs [InlineData("(abc*|def)chi", "(ab(?>c*)|def)chi")] [InlineData("(abc|def*)fhi", "(abc|de(?>f*))fhi")] From 33ea2a1c655ff7068883bec5b60e3010c788ded3 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Mon, 29 Aug 2022 12:15:50 -0400 Subject: [PATCH 3/3] Add a few more tests --- .../tests/UnitTests/RegexReductionTests.cs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs index 75aa24b667d4ef..dbb22cf5026485 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs @@ -345,6 +345,8 @@ public class RegexReductionTests [InlineData("(?:w*)+\\.", "(?>w*)+\\.")] [InlineData("(a[bcd]e*)*fg", "(a[bcd](?>e*))*fg")] [InlineData("(\\w[bcd]\\s*)*fg", "(\\w[bcd](?>\\s*))*fg")] + [InlineData(@"\b(\w+)\b", @"\b((?>\w+))\b")] + [InlineData(@"\b(?:\w+)\b ", @"\b(?>\w+)\b ")] // Nothing handling [InlineData(@"\wabc(?!)def", "(?!)")] [InlineData(@"\wabc(?!)def|ghi(?!)", "(?!)")]