Skip to content

Commit b67f978

Browse files
veanesdanmoseleystephentoub
authored
improved BDD Unicode table representation in NonBacktracking engine (#61142)
* improved BDD Unicode table representation in NonBacktracking engine
* remove line (Co-authored-by: Dan Moseley <[email protected]>)
* improved bounds-check elimination (Co-authored-by: Stephen Toub <[email protected]>)
* clearer notation of numbers (Co-authored-by: Dan Moseley <[email protected]>)
* fixed typo (Co-authored-by: Dan Moseley <[email protected]>)
Co-authored-by: Stephen Toub <[email protected]>
1 parent c02a37c commit b67f978

File tree

9 files changed

+222
-107
lines changed

9 files changed

+222
-107
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/BDD.cs

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,16 @@ internal sealed class BDD : IComparable
6969
/// </summary>
7070
private static readonly long[] s_trueRepresentation = new long[] { 1 };
7171

72+
/// <summary>
73+
/// Representation of False for compact serialization of BDDs.
74+
/// </summary>
75+
private static readonly byte[] s_falseRepresentationCompact = new byte[] { 0 };
76+
77+
/// <summary>
78+
/// Representation of True for compact serialization of BDDs.
79+
/// </summary>
80+
private static readonly byte[] s_trueRepresentationCompact = new byte[] { 1 };
81+
7282
internal BDD(int ordinal, BDD? one, BDD? zero)
7383
{
7484
One = one;
@@ -300,6 +310,49 @@ public long[] Serialize()
300310
return res;
301311
}
302312

313+
/// <summary>
/// Produces a compact byte-array encoding of this BDD.
/// The empty and the full BDD map to the shared one-byte representations. In all other
/// cases the first byte of the result holds the number of bytes used per element, followed
/// by every element of <see cref="Serialize"/> written least-significant byte first.
/// This method is not valid for MTBDDs where some elements may be negative.
/// </summary>
/// <returns>The compact serialization, readable by the byte[] overload of Deserialize.</returns>
public byte[] SerializeToBytes()
{
    if (IsEmpty)
    {
        return s_falseRepresentationCompact;
    }

    if (IsFull)
    {
        return s_trueRepresentationCompact;
    }

    // every other BDD goes through the general long[] serializer first
    long[] arcs = Serialize();

    // the largest element decides how many bytes each element occupies
    long largest = 0;
    foreach (long arc in arcs)
    {
        // guards against applying this serialization to MTBDDs
        Debug.Assert(arc > 0);
        if (arc > largest)
        {
            largest = arc;
        }
    }

    // smallest width in [2, 8] bytes that can hold the largest element
    int width = 2;
    while (width < 8 && (largest >> (8 * width)) != 0)
    {
        width++;
    }

    // layout: one header byte holding the width, then width bytes per element
    byte[] result = new byte[(width * arcs.Length) + 1];
    result[0] = (byte)width;

    int position = 1;
    foreach (long arc in arcs)
    {
        // emit the element as a little-endian sequence of width bytes
        long remaining = arc;
        for (int b = 0; b < width; b++)
        {
            result[position++] = (byte)remaining;
            remaining >>= 8;
        }
    }

    return result;
}
355+
303356
/// <summary>
304357
/// Recreates a BDD from a ulong array that has been created using Serialize.
305358
/// Is executed using a lock on algebra (if algebra != null) in a single thread mode.
@@ -353,6 +406,70 @@ public static BDD Deserialize(long[] arcs, BDDAlgebra algebra)
353406
return nodes[k - 1];
354407
}
355408

409+
/// <summary>
/// Recreates a BDD from a byte array that has been created using SerializeToBytes.
/// </summary>
/// <param name="bytes">
/// Either a single byte (0 for False, anything else for True), or a header byte holding the
/// number of bytes per element followed by the elements, each stored least-significant byte first.
/// </param>
/// <param name="algebra">The algebra whose GetOrCreateBDD is used to create the non-terminal nodes.</param>
/// <returns>The BDD described by the last element of the serialization.</returns>
public static BDD Deserialize(byte[] bytes, BDDAlgebra algebra)
{
    if (bytes.Length == 1)
    {
        return bytes[0] == 0 ? False : True;
    }

    // here bytes represents an array of longs with k = the number of bytes used per long
    int k = bytes[0];

    // a long has at most 8 bytes, and a width of 0 would make the division below fail
    Debug.Assert(k >= 1 && k <= 8);

    // gets the i'th element from the underlying array of longs represented by bytes
    long Get(int i)
    {
        long l = 0;
        // bytes are stored least-significant first, so fold them in from the top down
        for (int j = k; j > 0; j--)
        {
            l = (l << 8) | bytes[(k * i) + j];
        }
        return l;
    }

    // n is the total number of longs that corresponds also to the total number of BDD nodes needed
    int n = (bytes.Length - 1) / k;

    // make sure the represented number of longs divides precisely without remainder
    Debug.Assert((bytes.Length - 1) % k == 0);

    // two header elements plus at least one arc are required for a non-trivial BDD
    Debug.Assert(n > 2);

    // the number of bits used for ordinals and node identifiers are stored in the first two longs
    int ordinal_bits = (int)Get(0);
    int node_bits = (int)Get(1);

    // create bit masks for the sizes of ordinals and node identifiers;
    // shift a long so that a width of 32 bits or more is not truncated by a 32-bit shift
    long ordinal_mask = (1L << ordinal_bits) - 1;
    long node_mask = (1L << node_bits) - 1;
    BitLayout(ordinal_bits, node_bits, out int zero_node_shift, out int one_node_shift, out int ordinal_shift);

    // store BDD nodes by their id when they are created
    BDD[] nodes = new BDD[n];
    nodes[0] = False;
    nodes[1] = True;

    for (int i = 2; i < n; i++)
    {
        // represents the triple (ordinal, one, zero)
        long arc = Get(i);

        // reconstruct the ordinal and child identifiers for a non-terminal
        int ord = (int)((arc >> ordinal_shift) & ordinal_mask);
        int oneId = (int)((arc >> one_node_shift) & node_mask);
        int zeroId = (int)((arc >> zero_node_shift) & node_mask);

        // the BDD nodes for the children are guaranteed to exist already
        // because of the topological order they were serialized by
        Debug.Assert(oneId < i && zeroId < i);
        nodes[i] = algebra.GetOrCreateBDD(ord, nodes[oneId], nodes[zeroId]);
    }

    // the result is the last BDD in the nodes array
    return nodes[n - 1];
}
472+
356473
/// <summary>
357474
/// Use this bit layout in the serialization
358475
/// </summary>

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Algebras/CharSetSolver.cs

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ namespace System.Text.RegularExpressions.Symbolic
1212
/// </summary>
1313
internal sealed class CharSetSolver : BDDAlgebra, ICharAlgebra<BDD>
1414
{
15-
/// <summary>BDDs for all characters for fast lookup.</summary>
16-
private readonly BDD[] _charPredTable = new BDD[char.MaxValue + 1];
15+
/// <summary>BDDs for all ASCII characters for fast lookup.</summary>
16+
private readonly BDD[] _charPredTable = new BDD[128];
1717
private readonly Unicode.IgnoreCaseTransformer _ignoreCase;
1818
internal readonly BDD _nonAscii;
1919

@@ -40,10 +40,23 @@ public BDD CharConstraint(char c, bool ignoreCase = false, string? culture = nul
4040
else
4141
{
4242
//individual character BDDs are always fixed
43-
return _charPredTable[c] ??= CreateSetFrom(c, 15);
43+
BDD[] charPredTable = _charPredTable;
44+
return c < charPredTable.Length ?
45+
charPredTable[c] ??= CreateBDDFromChar(c) :
46+
CreateBDDFromChar(c);
4447
}
4548
}
4649

50+
/// <summary>
/// Builds a BDD for the 16-bit value <paramref name="c"/> as a chain of 16 nodes, one per bit.
/// Starting from <see cref="BDD.True"/>, the node for bit k uses k as its ordinal; the branch
/// matching the value of that bit in <paramref name="c"/> continues to the chain built so far,
/// and the other branch is <see cref="BDD.False"/>.
/// </summary>
private BDD CreateBDDFromChar(ushort c)
{
    BDD chain = BDD.True;
    int bit = 0;
    while (bit < 16)
    {
        bool bitIsSet = ((c >> bit) & 1) != 0;
        if (bitIsSet)
        {
            // bit is 1: the one-branch continues, the zero-branch rejects
            chain = GetOrCreateBDD(bit, chain, BDD.False);
        }
        else
        {
            // bit is 0: the zero-branch continues, the one-branch rejects
            chain = GetOrCreateBDD(bit, BDD.False, chain);
        }
        bit++;
    }
    return chain;
}
59+
4760
/// <summary>
4861
/// Make a CharSet from all the characters in the range from m to n.
4962
/// Returns the empty set if n is less than m

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/GeneratorHelper.cs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,16 @@ public static void WriteInt64ArrayInitSyntax(StreamWriter sw, long[] values)
1717
}
1818
sw.Write("}");
1919
}
20+
21+
/// <summary>
/// Writes <paramref name="values"/> to <paramref name="sw"/> as a C# array creation expression,
/// "new byte[] {" followed by one " 0xNN, " entry per byte (uppercase hex, no padding) and a closing "}".
/// </summary>
/// <param name="sw">The writer that receives the generated text.</param>
/// <param name="values">The bytes to emit, in order.</param>
public static void WriteByteArrayInitSyntax(StreamWriter sw, byte[] values)
{
    sw.Write("new byte[] {");
    foreach (byte value in values)
    {
        // each entry keeps its leading space and trailing ", " so the output matches exactly
        sw.Write(" 0x");
        sw.Write(value.ToString("X"));
        sw.Write(", ");
    }
    sw.Write("}");
}
2030
}
2131
#endif
2232
}

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/IgnoreCaseRelation.cs

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/IgnoreCaseRelationGenerator.cs

Lines changed: 33 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -41,70 +41,54 @@ private static void WriteIgnoreCaseBDD(StreamWriter sw)
4141
sw.WriteLine(" /// <summary>Serialized BDD for mapping characters to their case-ignoring equivalence classes in the default (en-US) culture.</summary>");
4242

4343
var solver = new CharSetSolver();
44-
Dictionary<char, BDD> ignoreCase = ComputeIgnoreCaseDictionary(solver, new CultureInfo(DefaultCultureName));
44+
List<EquivalenceClass> ignoreCaseEquivalenceClasses = ComputeIgnoreCaseEquivalenceClasses(solver, new CultureInfo(DefaultCultureName));
4545
BDD ignorecase = solver.False;
46-
foreach (KeyValuePair<char, BDD> kv in ignoreCase)
46+
foreach (EquivalenceClass ec in ignoreCaseEquivalenceClasses)
4747
{
48-
BDD a = solver.CreateCharSetFromRange(kv.Key, kv.Key);
49-
BDD b = kv.Value;
50-
ignorecase = solver.Or(ignorecase, solver.And(solver.ShiftLeft(a, 16), b));
48+
// Create the Cartesian product of ec._set with itself
49+
BDD crossproduct = solver.And(solver.ShiftLeft(ec._set, 16), ec._set);
50+
// Add the product into the overall lookup table
51+
ignorecase = solver.Or(ignorecase, crossproduct);
5152
}
5253

53-
sw.Write(" public static readonly long[] IgnoreCaseEnUsSerializedBDD = ");
54-
GeneratorHelper.WriteInt64ArrayInitSyntax(sw, ignorecase.Serialize());
54+
sw.Write(" public static readonly byte[] IgnoreCaseEnUsSerializedBDD = ");
55+
GeneratorHelper.WriteByteArrayInitSyntax(sw, ignorecase.SerializeToBytes());
5556
sw.WriteLine(";");
5657
}
5758

58-
private static Dictionary<char, BDD> ComputeIgnoreCaseDictionary(CharSetSolver solver, CultureInfo culture)
59+
private static List<EquivalenceClass> ComputeIgnoreCaseEquivalenceClasses(CharSetSolver solver, CultureInfo culture)
5960
{
60-
CultureInfo originalCulture = CultureInfo.CurrentCulture;
61-
try
62-
{
63-
CultureInfo.CurrentCulture = culture;
61+
var ignoreCase = new Dictionary<char, EquivalenceClass>();
62+
var sets = new List<EquivalenceClass>();
6463

65-
var ignoreCase = new Dictionary<char, BDD>();
64+
for (uint i = 65; i <= 0xFFFF; i++)
65+
{
66+
char C = (char)i;
67+
char c = char.ToLower(C, culture);
6668

67-
for (uint i = 0; i <= 0xFFFF; i++)
69+
if (c == C)
6870
{
69-
char c = (char)i;
70-
char cUpper = char.ToUpper(c);
71-
char cLower = char.ToLower(c);
72-
73-
if (cUpper == cLower)
74-
{
75-
continue;
76-
}
77-
78-
// c may be different from both cUpper as well as cLower.
79-
// Make sure that the regex engine considers c as being equivalent to cUpper and cLower, else ignore c.
80-
// In some cases c != cU but the regex engine does not consider the chacarters equivalent wrt the ignore-case option.
81-
if (Regex.IsMatch($"{cUpper}{cLower}", $"^(?i:\\u{i:X4}\\u{i:X4})$"))
82-
{
83-
BDD equiv = solver.False;
84-
85-
if (ignoreCase.ContainsKey(c))
86-
equiv = solver.Or(equiv, ignoreCase[c]);
87-
88-
if (ignoreCase.ContainsKey(cUpper))
89-
equiv = solver.Or(equiv, ignoreCase[cUpper]);
90-
91-
if (ignoreCase.ContainsKey(cLower))
92-
equiv = solver.Or(equiv, ignoreCase[cLower]);
93-
94-
// Make sure all characters are included initially or when some is still missing
95-
equiv = solver.Or(equiv, solver.Or(solver.CreateCharSetFromRange(c, c), solver.Or(solver.CreateCharSetFromRange(cUpper, cUpper), solver.CreateCharSetFromRange(cLower, cLower))));
96-
97-
// Update all the members with their case-invariance equivalence classes
98-
foreach (char d in solver.GenerateAllCharacters(equiv))
99-
ignoreCase[d] = equiv;
100-
}
71+
continue;
10172
}
10273

103-
return ignoreCase;
74+
EquivalenceClass? ec;
75+
if (!ignoreCase.TryGetValue(c, out ec))
76+
{
77+
ec = new EquivalenceClass(solver.CharConstraint(c));
78+
ignoreCase[c] = ec;
79+
sets.Add(ec);
80+
}
81+
ec._set = solver.Or(ec._set, solver.CharConstraint(C));
10482
}
105-
finally
83+
return sets;
84+
}
85+
86+
private class EquivalenceClass
87+
{
88+
public BDD _set;
89+
public EquivalenceClass(BDD set)
10690
{
107-
CultureInfo.CurrentCulture = originalCulture;
91+
_set = set;
10892
}
10993
}
11094
};

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/Unicode/IgnoreCaseTransformer.cs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,10 @@ private IgnoreCaseRelation EnsureDefault()
166166
if (_relationDefault is null)
167167
{
168168
BDD instance = BDD.Deserialize(Unicode.IgnoreCaseRelation.IgnoreCaseEnUsSerializedBDD, _solver);
169+
byte[] tmp = instance.SerializeToBytes();
170+
BDD instance2 = BDD.Deserialize(tmp, _solver);
171+
if (instance != instance2)
172+
throw new Exception();
169173
BDD instanceDomain = _solver.ShiftRight(instance, 16); // represents the set of all case-sensitive characters in the default culture.
170174
_relationDefault = new IgnoreCaseRelation(instance, instanceDomain);
171175
}

0 commit comments

Comments
 (0)