Skip to content

Commit d6d28e4

Browse files
authored
Implement Vector128 version of System.Buffers.Text.Base64 DecodeFromUtf8 and EncodeToUtf8 (#70654)
* Implement Vector128 version of System.Buffers.Text.Base64.DecodeFromUtf8: rework the SSSE3 version into a Vector128 version, and add Arm64 support. * SSSE3 improvements * Remove superfluous bitwise And * Add comment to SimdShuffle * Inline SimdShuffle * Implement Vector128 version of System.Buffers.Text.Base64.EncodeToUtf8 * Ensure masking on SSSE3 Change-Id: I319f94cfc51d0542ae4eb11a8d48b3eb8180553f CustomizedGitHooks: yes * Restore asserts and move zero inside the loop * Neater C# code Change-Id: I2cbe14f4228f8035e7d213b5b58815c4eee35563 CustomizedGitHooks: yes * Make SimdShuffle consistent across X64 and Arm64 * Better looking multiply
1 parent 18ec279 commit d6d28e4

File tree

2 files changed

+107
-78
lines changed

2 files changed

+107
-78
lines changed

src/libraries/System.Memory/src/System/Buffers/Text/Base64Decoder.cs

Lines changed: 64 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,13 @@
55
using System.Runtime.CompilerServices;
66
using System.Runtime.InteropServices;
77
using System.Runtime.Intrinsics;
8+
using System.Runtime.Intrinsics.Arm;
89
using System.Runtime.Intrinsics.X86;
910

1011
namespace System.Buffers.Text
1112
{
1213
// AVX2 version based on https://github.com/aklomp/base64/tree/e516d769a2a432c08404f1981e73b431566057be/lib/arch/avx2
13-
// SSSE3 version based on https://github.com/aklomp/base64/tree/e516d769a2a432c08404f1981e73b431566057be/lib/arch/ssse3
14+
// Vector128 version based on https://github.com/aklomp/base64/tree/e516d769a2a432c08404f1981e73b431566057be/lib/arch/ssse3
1415

1516
public static partial class Base64
1617
{
@@ -74,9 +75,9 @@ public static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan<byte> utf8, Spa
7475
}
7576

7677
end = srcMax - 24;
77-
if (Ssse3.IsSupported && (end >= src))
78+
if ((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported) && BitConverter.IsLittleEndian && (end >= src))
7879
{
79-
Ssse3Decode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);
80+
Vector128Decode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);
8081

8182
if (src == srcEnd)
8283
goto DoneExit;
@@ -476,10 +477,28 @@ private static unsafe void Avx2Decode(ref byte* srcBytes, ref byte* destBytes, b
476477
destBytes = dest;
477478
}
478479

480+
// This can be replaced once https://github.com/dotnet/runtime/issues/63331 is implemented.
479481
[MethodImpl(MethodImplOptions.AggressiveInlining)]
480-
private static unsafe void Ssse3Decode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)
482+
private static Vector128<byte> SimdShuffle(Vector128<byte> left, Vector128<byte> right, Vector128<byte> mask8F)
481483
{
482-
// If we have SSSE3 support, pick off 16 bytes at a time for as long as we can,
484+
Debug.Assert((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported) && BitConverter.IsLittleEndian);
485+
486+
if (Ssse3.IsSupported)
487+
{
488+
return Ssse3.Shuffle(left, right);
489+
}
490+
else
491+
{
492+
return AdvSimd.Arm64.VectorTableLookup(left, Vector128.BitwiseAnd(right, mask8F));
493+
}
494+
}
495+
496+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
497+
private static unsafe void Vector128Decode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)
498+
{
499+
Debug.Assert((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported) && BitConverter.IsLittleEndian);
500+
501+
// If we have Vector128 support, pick off 16 bytes at a time for as long as we can,
483502
// but make sure that we quit before seeing any == markers at the end of the
484503
// string. Also, because we write four zeroes at the end of the output, ensure
485504
// that there are at least 6 valid bytes of input data remaining to close the
@@ -552,34 +571,15 @@ private static unsafe void Ssse3Decode(ref byte* srcBytes, ref byte* destBytes,
552571
// 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
553572

554573
// The JIT won't hoist these "constants", so help it
555-
Vector128<sbyte> lutHi = Vector128.Create(
556-
0x10, 0x10, 0x01, 0x02,
557-
0x04, 0x08, 0x04, 0x08,
558-
0x10, 0x10, 0x10, 0x10,
559-
0x10, 0x10, 0x10, 0x10);
560-
561-
Vector128<sbyte> lutLo = Vector128.Create(
562-
0x15, 0x11, 0x11, 0x11,
563-
0x11, 0x11, 0x11, 0x11,
564-
0x11, 0x11, 0x13, 0x1A,
565-
0x1B, 0x1B, 0x1B, 0x1A);
566-
567-
Vector128<sbyte> lutShift = Vector128.Create(
568-
0, 16, 19, 4,
569-
-65, -65, -71, -71,
570-
0, 0, 0, 0,
571-
0, 0, 0, 0);
572-
573-
Vector128<sbyte> packBytesMask = Vector128.Create(
574-
2, 1, 0, 6,
575-
5, 4, 10, 9,
576-
8, 14, 13, 12,
577-
-1, -1, -1, -1);
578-
579-
Vector128<sbyte> mask2F = Vector128.Create((sbyte)'/');
580-
Vector128<sbyte> mergeConstant0 = Vector128.Create(0x01400140).AsSByte();
574+
Vector128<byte> lutHi = Vector128.Create(0x02011010, 0x08040804, 0x10101010, 0x10101010).AsByte();
575+
Vector128<byte> lutLo = Vector128.Create(0x11111115, 0x11111111, 0x1A131111, 0x1A1B1B1B).AsByte();
576+
Vector128<sbyte> lutShift = Vector128.Create(0x04131000, 0xb9b9bfbf, 0x00000000, 0x00000000).AsSByte();
577+
Vector128<sbyte> packBytesMask = Vector128.Create(0x06000102, 0x090A0405, 0x0C0D0E08, 0xffffffff).AsSByte();
578+
Vector128<byte> mergeConstant0 = Vector128.Create(0x01400140).AsByte();
581579
Vector128<short> mergeConstant1 = Vector128.Create(0x00011000).AsInt16();
582-
Vector128<sbyte> zero = Vector128<sbyte>.Zero;
580+
Vector128<byte> one = Vector128.Create((byte)1);
581+
Vector128<byte> mask2F = Vector128.Create((byte)'/');
582+
Vector128<byte> mask8F = Vector128.Create((byte)0x8F);
583583

584584
byte* src = srcBytes;
585585
byte* dest = destBytes;
@@ -588,52 +588,71 @@ private static unsafe void Ssse3Decode(ref byte* srcBytes, ref byte* destBytes,
588588
do
589589
{
590590
AssertRead<Vector128<sbyte>>(src, srcStart, sourceLength);
591-
Vector128<sbyte> str = Sse2.LoadVector128(src).AsSByte();
591+
Vector128<byte> str = Vector128.LoadUnsafe(ref *src);
592592

593593
// lookup
594-
Vector128<sbyte> hiNibbles = Sse2.And(Sse2.ShiftRightLogical(str.AsInt32(), 4).AsSByte(), mask2F);
595-
Vector128<sbyte> loNibbles = Sse2.And(str, mask2F);
596-
Vector128<sbyte> hi = Ssse3.Shuffle(lutHi, hiNibbles);
597-
Vector128<sbyte> lo = Ssse3.Shuffle(lutLo, loNibbles);
594+
Vector128<byte> hiNibbles = Vector128.ShiftRightLogical(str.AsInt32(), 4).AsByte() & mask2F;
595+
Vector128<byte> hi = SimdShuffle(lutHi, hiNibbles, mask8F);
596+
Vector128<byte> lo = SimdShuffle(lutLo, str, mask8F);
598597

599598
// Check for invalid input: if any "and" values from lo and hi are not zero,
600599
// fall back on bytewise code to do error checking and reporting:
601-
if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.And(lo, hi), zero)) != 0)
600+
if ((lo & hi) != Vector128<byte>.Zero)
602601
break;
603602

604-
Vector128<sbyte> eq2F = Sse2.CompareEqual(str, mask2F);
605-
Vector128<sbyte> shift = Ssse3.Shuffle(lutShift, Sse2.Add(eq2F, hiNibbles));
603+
Vector128<byte> eq2F = Vector128.Equals(str, mask2F);
604+
Vector128<byte> shift = SimdShuffle(lutShift.AsByte(), (eq2F + hiNibbles), mask8F);
606605

607606
// Now simply add the delta values to the input:
608-
str = Sse2.Add(str, shift);
607+
str += shift;
609608

610609
// in, bits, upper case are most significant bits, lower case are least significant bits
611610
// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
612611
// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
613612
// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
614613
// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
615614

616-
Vector128<short> merge_ab_and_bc = Ssse3.MultiplyAddAdjacent(str.AsByte(), mergeConstant0);
615+
Vector128<short> merge_ab_and_bc;
616+
if (Ssse3.IsSupported)
617+
{
618+
merge_ab_and_bc = Ssse3.MultiplyAddAdjacent(str.AsByte(), mergeConstant0.AsSByte());
619+
}
620+
else
621+
{
622+
Vector128<ushort> evens = AdvSimd.ShiftLeftLogicalWideningLower(AdvSimd.Arm64.UnzipEven(str, one).GetLower(), 6);
623+
Vector128<ushort> odds = AdvSimd.Arm64.TransposeOdd(str, Vector128<byte>.Zero).AsUInt16();
624+
merge_ab_and_bc = Vector128.Add(evens, odds).AsInt16();
625+
}
617626
// 0000kkkk LLllllll 0000JJJJ JJjjKKKK
618627
// 0000hhhh IIiiiiii 0000GGGG GGggHHHH
619628
// 0000eeee FFffffff 0000DDDD DDddEEEE
620629
// 0000bbbb CCcccccc 0000AAAA AAaaBBBB
621630

622-
Vector128<int> output = Sse2.MultiplyAddAdjacent(merge_ab_and_bc, mergeConstant1);
631+
Vector128<int> output;
632+
if (Ssse3.IsSupported)
633+
{
634+
output = Sse2.MultiplyAddAdjacent(merge_ab_and_bc, mergeConstant1);
635+
}
636+
else
637+
{
638+
Vector128<int> ievens = AdvSimd.ShiftLeftLogicalWideningLower(AdvSimd.Arm64.UnzipEven(merge_ab_and_bc, one.AsInt16()).GetLower(), 12);
639+
Vector128<int> iodds = AdvSimd.Arm64.TransposeOdd(merge_ab_and_bc, Vector128<short>.Zero).AsInt32();
640+
output = Vector128.Add(ievens, iodds).AsInt32();
641+
}
623642
// 00000000 JJJJJJjj KKKKkkkk LLllllll
624643
// 00000000 GGGGGGgg HHHHhhhh IIiiiiii
625644
// 00000000 DDDDDDdd EEEEeeee FFffffff
626645
// 00000000 AAAAAAaa BBBBbbbb CCcccccc
627646

628647
// Pack bytes together:
629-
str = Ssse3.Shuffle(output.AsSByte(), packBytesMask);
648+
str = SimdShuffle(output.AsByte(), packBytesMask.AsByte(), mask8F);
630649
// 00000000 00000000 00000000 00000000
631650
// LLllllll KKKKkkkk JJJJJJjj IIiiiiii
632651
// HHHHhhhh GGGGGGgg FFffffff EEEEeeee
633652
// DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
634653

635654
AssertWrite<Vector128<sbyte>>(dest, destStart, destLength);
636-
Sse2.Store(dest, str.AsByte());
655+
str.Store(dest);
637656

638657
src += 16;
639658
dest += 12;

src/libraries/System.Memory/src/System/Buffers/Text/Base64Encoder.cs

Lines changed: 43 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,13 @@
44
using System.Runtime.CompilerServices;
55
using System.Runtime.InteropServices;
66
using System.Runtime.Intrinsics;
7+
using System.Runtime.Intrinsics.Arm;
78
using System.Runtime.Intrinsics.X86;
89

910
namespace System.Buffers.Text
1011
{
1112
// AVX2 version based on https://github.com/aklomp/base64/tree/e516d769a2a432c08404f1981e73b431566057be/lib/arch/avx2
12-
// SSSE3 version based on https://github.com/aklomp/base64/tree/e516d769a2a432c08404f1981e73b431566057be/lib/arch/ssse3
13+
// Vector128 version based on https://github.com/aklomp/base64/tree/e516d769a2a432c08404f1981e73b431566057be/lib/arch/ssse3
1314

1415
/// <summary>
1516
/// Convert between binary data and UTF-8 encoded text that is represented in base 64.
@@ -75,9 +76,9 @@ public static unsafe OperationStatus EncodeToUtf8(ReadOnlySpan<byte> bytes, Span
7576
}
7677

7778
end = srcMax - 16;
78-
if (Ssse3.IsSupported && (end >= src))
79+
if ((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported) && BitConverter.IsLittleEndian && (end >= src))
7980
{
80-
Ssse3Encode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);
81+
Vector128Encode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes);
8182

8283
if (src == srcEnd)
8384
goto DoneExit;
@@ -395,7 +396,7 @@ private static unsafe void Avx2Encode(ref byte* srcBytes, ref byte* destBytes, b
395396
}
396397

397398
[MethodImpl(MethodImplOptions.AggressiveInlining)]
398-
private static unsafe void Ssse3Encode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)
399+
private static unsafe void Vector128Encode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart)
399400
{
400401
// If we have Vector128 support (SSSE3 or AdvSimd.Arm64), pick off 12 bytes at a time for as long as we can.
401402
// But because we read 16 bytes at a time, ensure we have enough room to do a
@@ -405,24 +406,15 @@ private static unsafe void Ssse3Encode(ref byte* srcBytes, ref byte* destBytes,
405406
// 0 0 0 0 l k j i h g f e d c b a
406407

407408
// The JIT won't hoist these "constants", so help it
408-
Vector128<sbyte> shuffleVec = Vector128.Create(
409-
1, 0, 2, 1,
410-
4, 3, 5, 4,
411-
7, 6, 8, 7,
412-
10, 9, 11, 10);
413-
414-
Vector128<sbyte> lut = Vector128.Create(
415-
65, 71, -4, -4,
416-
-4, -4, -4, -4,
417-
-4, -4, -4, -4,
418-
-19, -16, 0, 0);
419-
420-
Vector128<sbyte> maskAC = Vector128.Create(0x0fc0fc00).AsSByte();
421-
Vector128<sbyte> maskBB = Vector128.Create(0x003f03f0).AsSByte();
409+
Vector128<byte> shuffleVec = Vector128.Create(0x01020001, 0x04050304, 0x07080607, 0x0A0B090A).AsByte();
410+
Vector128<byte> lut = Vector128.Create(0xFCFC4741, 0xFCFCFCFC, 0xFCFCFCFC, 0x0000F0ED).AsByte();
411+
Vector128<byte> maskAC = Vector128.Create(0x0fc0fc00).AsByte();
412+
Vector128<byte> maskBB = Vector128.Create(0x003f03f0).AsByte();
422413
Vector128<ushort> shiftAC = Vector128.Create(0x04000040).AsUInt16();
423-
Vector128<short> shiftBB = Vector128.Create(0x01000010).AsInt16();
424-
Vector128<byte> const51 = Vector128.Create((byte)51);
425-
Vector128<sbyte> const25 = Vector128.Create((sbyte)25);
414+
Vector128<short> shiftBB = Vector128.Create(0x01000010).AsInt16();
415+
Vector128<byte> const51 = Vector128.Create((byte)51);
416+
Vector128<sbyte> const25 = Vector128.Create((sbyte)25);
417+
Vector128<byte> mask8F = Vector128.Create((byte)0x8F);
426418

427419
byte* src = srcBytes;
428420
byte* dest = destBytes;
@@ -431,42 +423,52 @@ private static unsafe void Ssse3Encode(ref byte* srcBytes, ref byte* destBytes,
431423
do
432424
{
433425
AssertRead<Vector128<sbyte>>(src, srcStart, sourceLength);
434-
Vector128<sbyte> str = Sse2.LoadVector128(src).AsSByte();
426+
Vector128<byte> str = Vector128.LoadUnsafe(ref *src);
435427

436428
// Reshuffle
437-
str = Ssse3.Shuffle(str, shuffleVec);
429+
str = SimdShuffle(str, shuffleVec, mask8F);
438430
// str, bytes MSB to LSB:
439431
// k l j k
440432
// h i g h
441433
// e f d e
442434
// b c a b
443435

444-
Vector128<sbyte> t0 = Sse2.And(str, maskAC);
436+
Vector128<byte> t0 = str & maskAC;
445437
// bits, upper case are most significant bits, lower case are least significant bits
446438
// 0000kkkk LL000000 JJJJJJ00 00000000
447439
// 0000hhhh II000000 GGGGGG00 00000000
448440
// 0000eeee FF000000 DDDDDD00 00000000
449441
// 0000bbbb CC000000 AAAAAA00 00000000
450442

451-
Vector128<sbyte> t2 = Sse2.And(str, maskBB);
443+
Vector128<byte> t2 = str & maskBB;
452444
// 00000000 00llllll 000000jj KKKK0000
453445
// 00000000 00iiiiii 000000gg HHHH0000
454446
// 00000000 00ffffff 000000dd EEEE0000
455447
// 00000000 00cccccc 000000aa BBBB0000
456448

457-
Vector128<ushort> t1 = Sse2.MultiplyHigh(t0.AsUInt16(), shiftAC);
449+
Vector128<ushort> t1;
450+
if (Ssse3.IsSupported)
451+
{
452+
t1 = Sse2.MultiplyHigh(t0.AsUInt16(), shiftAC);
453+
}
454+
else
455+
{
456+
Vector128<ushort> odd = Vector128.ShiftRightLogical(AdvSimd.Arm64.UnzipOdd(t0.AsUInt16(), t0.AsUInt16()), 6);
457+
Vector128<ushort> even = Vector128.ShiftRightLogical(AdvSimd.Arm64.UnzipEven(t0.AsUInt16(), t0.AsUInt16()), 10);
458+
t1 = AdvSimd.Arm64.ZipLow(even, odd);
459+
}
458460
// 00000000 00kkkkLL 00000000 00JJJJJJ
459461
// 00000000 00hhhhII 00000000 00GGGGGG
460462
// 00000000 00eeeeFF 00000000 00DDDDDD
461463
// 00000000 00bbbbCC 00000000 00AAAAAA
462464

463-
Vector128<short> t3 = Sse2.MultiplyLow(t2.AsInt16(), shiftBB);
465+
Vector128<short> t3 = t2.AsInt16() * shiftBB;
464466
// 00llllll 00000000 00jjKKKK 00000000
465467
// 00iiiiii 00000000 00ggHHHH 00000000
466468
// 00ffffff 00000000 00ddEEEE 00000000
467469
// 00cccccc 00000000 00aaBBBB 00000000
468470

469-
str = Sse2.Or(t1.AsSByte(), t3.AsSByte());
471+
str = t1.AsByte() | t3.AsByte();
470472
// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
471473
// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
472474
// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
@@ -484,19 +486,27 @@ private static unsafe void Ssse3Encode(ref byte* srcBytes, ref byte* destBytes,
484486

485487
// Create LUT indices from input:
486488
// the index for range #0 is right, others are 1 less than expected:
487-
Vector128<byte> indices = Sse2.SubtractSaturate(str.AsByte(), const51);
489+
Vector128<byte> indices;
490+
if (Ssse3.IsSupported)
491+
{
492+
indices = Sse2.SubtractSaturate(str.AsByte(), const51);
493+
}
494+
else
495+
{
496+
indices = AdvSimd.SubtractSaturate(str.AsByte(), const51);
497+
}
488498

489499
// mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
490-
Vector128<sbyte> mask = Sse2.CompareGreaterThan(str, const25);
500+
Vector128<sbyte> mask = Vector128.GreaterThan(str.AsSByte(), const25);
491501

492502
// subtract -1, i.e. add 1 to indices for range #[1..4]. All indices are now correct:
493-
Vector128<sbyte> tmp = Sse2.Subtract(indices.AsSByte(), mask);
503+
Vector128<sbyte> tmp = indices.AsSByte() - mask;
494504

495505
// Add offsets to input values:
496-
str = Sse2.Add(str, Ssse3.Shuffle(lut, tmp));
506+
str += SimdShuffle(lut, tmp.AsByte(), mask8F);
497507

498508
AssertWrite<Vector128<sbyte>>(dest, destStart, destLength);
499-
Sse2.Store(dest, str.AsByte());
509+
str.Store(dest);
500510

501511
src += 12;
502512
dest += 16;

0 commit comments

Comments
 (0)