55using System . Runtime . CompilerServices ;
66using System . Runtime . InteropServices ;
77using System . Runtime . Intrinsics ;
8+ using System . Runtime . Intrinsics . Arm ;
89using System . Runtime . Intrinsics . X86 ;
910
1011namespace System . Buffers . Text
1112{
1213 // AVX2 version based on https://github.com/aklomp/base64/tree/e516d769a2a432c08404f1981e73b431566057be/lib/arch/avx2
13- // SSSE3 version based on https://github.com/aklomp/base64/tree/e516d769a2a432c08404f1981e73b431566057be/lib/arch/ssse3
14+ // Vector128 version based on https://github.com/aklomp/base64/tree/e516d769a2a432c08404f1981e73b431566057be/lib/arch/ssse3
1415
1516 public static partial class Base64
1617 {
@@ -74,9 +75,9 @@ public static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan<byte> utf8, Spa
7475 }
7576
7677 end = srcMax - 24 ;
77- if ( Ssse3 . IsSupported && ( end >= src ) )
78+ if ( ( Ssse3 . IsSupported || AdvSimd . Arm64 . IsSupported ) && BitConverter . IsLittleEndian && ( end >= src ) )
7879 {
79- Ssse3Decode ( ref src , ref dest , end , maxSrcLength , destLength , srcBytes , destBytes ) ;
80+ Vector128Decode ( ref src , ref dest , end , maxSrcLength , destLength , srcBytes , destBytes ) ;
8081
8182 if ( src == srcEnd )
8283 goto DoneExit ;
@@ -476,10 +477,28 @@ private static unsafe void Avx2Decode(ref byte* srcBytes, ref byte* destBytes, b
476477 destBytes = dest ;
477478 }
478479
// Cross-platform byte shuffle: pshufb on x86, TBL on Arm64. A proper
// cross-platform API is tracked by https://github.com/dotnet/runtime/issues/63331;
// until then this helper bridges the two intrinsics.
//
// pshufb zeroes a lane when bit 7 of its index is set; a single-register TBL
// zeroes lanes whose index is >= 16. Masking the indices with 0x8F keeps the
// low nibble (the table index) and bit 7, so any index with bit 7 set becomes
// >= 0x80 and therefore out of range for TBL — reproducing the pshufb
// zeroing behavior exactly.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<byte> SimdShuffle(Vector128<byte> left, Vector128<byte> right, Vector128<byte> mask8F)
{
    Debug.Assert((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported) && BitConverter.IsLittleEndian);

    return Ssse3.IsSupported
        ? Ssse3.Shuffle(left, right)
        : AdvSimd.Arm64.VectorTableLookup(left, right & mask8F);
}
495+
496+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
497+ private static unsafe void Vector128Decode ( ref byte * srcBytes , ref byte * destBytes , byte * srcEnd , int sourceLength , int destLength , byte * srcStart , byte * destStart )
498+ {
499+ Debug . Assert ( ( Ssse3 . IsSupported || AdvSimd . Arm64 . IsSupported ) && BitConverter . IsLittleEndian ) ;
500+
501+ // If we have Vector128 support, pick off 16 bytes at a time for as long as we can,
483502 // but make sure that we quit before seeing any == markers at the end of the
484503 // string. Also, because we write four zeroes at the end of the output, ensure
485504 // that there are at least 6 valid bytes of input data remaining to close the
@@ -552,34 +571,15 @@ private static unsafe void Ssse3Decode(ref byte* srcBytes, ref byte* destBytes,
552571 // 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
553572
554573 // The JIT won't hoist these "constants", so help it
555- Vector128 < sbyte > lutHi = Vector128 . Create (
556- 0x10 , 0x10 , 0x01 , 0x02 ,
557- 0x04 , 0x08 , 0x04 , 0x08 ,
558- 0x10 , 0x10 , 0x10 , 0x10 ,
559- 0x10 , 0x10 , 0x10 , 0x10 ) ;
560-
561- Vector128 < sbyte > lutLo = Vector128 . Create (
562- 0x15 , 0x11 , 0x11 , 0x11 ,
563- 0x11 , 0x11 , 0x11 , 0x11 ,
564- 0x11 , 0x11 , 0x13 , 0x1A ,
565- 0x1B , 0x1B , 0x1B , 0x1A ) ;
566-
567- Vector128 < sbyte > lutShift = Vector128 . Create (
568- 0 , 16 , 19 , 4 ,
569- - 65 , - 65 , - 71 , - 71 ,
570- 0 , 0 , 0 , 0 ,
571- 0 , 0 , 0 , 0 ) ;
572-
573- Vector128 < sbyte > packBytesMask = Vector128 . Create (
574- 2 , 1 , 0 , 6 ,
575- 5 , 4 , 10 , 9 ,
576- 8 , 14 , 13 , 12 ,
577- - 1 , - 1 , - 1 , - 1 ) ;
578-
579- Vector128 < sbyte > mask2F = Vector128 . Create ( ( sbyte ) '/' ) ;
580- Vector128 < sbyte > mergeConstant0 = Vector128 . Create ( 0x01400140 ) . AsSByte ( ) ;
574+ Vector128 < byte > lutHi = Vector128 . Create ( 0x02011010 , 0x08040804 , 0x10101010 , 0x10101010 ) . AsByte ( ) ;
575+ Vector128 < byte > lutLo = Vector128 . Create ( 0x11111115 , 0x11111111 , 0x1A131111 , 0x1A1B1B1B ) . AsByte ( ) ;
576+ Vector128 < sbyte > lutShift = Vector128 . Create ( 0x04131000 , 0xb9b9bfbf , 0x00000000 , 0x00000000 ) . AsSByte ( ) ;
577+ Vector128 < sbyte > packBytesMask = Vector128 . Create ( 0x06000102 , 0x090A0405 , 0x0C0D0E08 , 0xffffffff ) . AsSByte ( ) ;
578+ Vector128 < byte > mergeConstant0 = Vector128 . Create ( 0x01400140 ) . AsByte ( ) ;
581579 Vector128 < short > mergeConstant1 = Vector128 . Create ( 0x00011000 ) . AsInt16 ( ) ;
582- Vector128 < sbyte > zero = Vector128 < sbyte > . Zero ;
580+ Vector128 < byte > one = Vector128 . Create ( ( byte ) 1 ) ;
581+ Vector128 < byte > mask2F = Vector128 . Create ( ( byte ) '/' ) ;
582+ Vector128 < byte > mask8F = Vector128 . Create ( ( byte ) 0x8F ) ;
583583
584584 byte * src = srcBytes ;
585585 byte * dest = destBytes ;
@@ -588,52 +588,71 @@ private static unsafe void Ssse3Decode(ref byte* srcBytes, ref byte* destBytes,
588588 do
589589 {
590590 AssertRead < Vector128 < sbyte > > ( src , srcStart , sourceLength ) ;
591- Vector128 < sbyte > str = Sse2 . LoadVector128 ( src ) . AsSByte ( ) ;
591+ Vector128 < byte > str = Vector128 . LoadUnsafe ( ref * src ) ;
592592
593593 // lookup
594- Vector128 < sbyte > hiNibbles = Sse2 . And ( Sse2 . ShiftRightLogical ( str . AsInt32 ( ) , 4 ) . AsSByte ( ) , mask2F ) ;
595- Vector128 < sbyte > loNibbles = Sse2 . And ( str , mask2F ) ;
596- Vector128 < sbyte > hi = Ssse3 . Shuffle ( lutHi , hiNibbles ) ;
597- Vector128 < sbyte > lo = Ssse3 . Shuffle ( lutLo , loNibbles ) ;
594+ Vector128 < byte > hiNibbles = Vector128 . ShiftRightLogical ( str . AsInt32 ( ) , 4 ) . AsByte ( ) & mask2F ;
595+ Vector128 < byte > hi = SimdShuffle ( lutHi , hiNibbles , mask8F ) ;
596+ Vector128 < byte > lo = SimdShuffle ( lutLo , str , mask8F ) ;
598597
599598 // Check for invalid input: if any "and" values from lo and hi are not zero,
600599 // fall back on bytewise code to do error checking and reporting:
601- if ( Sse2 . MoveMask ( Sse2 . CompareGreaterThan ( Sse2 . And ( lo , hi ) , zero ) ) != 0 )
600+ if ( ( lo & hi ) != Vector128 < byte > . Zero )
602601 break ;
603602
604- Vector128 < sbyte > eq2F = Sse2 . CompareEqual ( str , mask2F ) ;
605- Vector128 < sbyte > shift = Ssse3 . Shuffle ( lutShift , Sse2 . Add ( eq2F , hiNibbles ) ) ;
603+ Vector128 < byte > eq2F = Vector128 . Equals ( str , mask2F ) ;
604+ Vector128 < byte > shift = SimdShuffle ( lutShift . AsByte ( ) , ( eq2F + hiNibbles ) , mask8F ) ;
606605
607606 // Now simply add the delta values to the input:
608- str = Sse2 . Add ( str , shift ) ;
607+ str += shift ;
609608
610609 // in, bits, upper case are most significant bits, lower case are least significant bits
611610 // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
612611 // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
613612 // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
614613 // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
615614
616- Vector128 < short > merge_ab_and_bc = Ssse3 . MultiplyAddAdjacent ( str . AsByte ( ) , mergeConstant0 ) ;
615+ Vector128 < short > merge_ab_and_bc ;
616+ if ( Ssse3 . IsSupported )
617+ {
618+ merge_ab_and_bc = Ssse3 . MultiplyAddAdjacent ( str . AsByte ( ) , mergeConstant0 . AsSByte ( ) ) ;
619+ }
620+ else
621+ {
622+ Vector128 < ushort > evens = AdvSimd . ShiftLeftLogicalWideningLower ( AdvSimd . Arm64 . UnzipEven ( str , one ) . GetLower ( ) , 6 ) ;
623+ Vector128 < ushort > odds = AdvSimd . Arm64 . TransposeOdd ( str , Vector128 < byte > . Zero ) . AsUInt16 ( ) ;
624+ merge_ab_and_bc = Vector128 . Add ( evens , odds ) . AsInt16 ( ) ;
625+ }
617626 // 0000kkkk LLllllll 0000JJJJ JJjjKKKK
618627 // 0000hhhh IIiiiiii 0000GGGG GGggHHHH
619628 // 0000eeee FFffffff 0000DDDD DDddEEEE
620629 // 0000bbbb CCcccccc 0000AAAA AAaaBBBB
621630
622- Vector128 < int > output = Sse2 . MultiplyAddAdjacent ( merge_ab_and_bc , mergeConstant1 ) ;
631+ Vector128 < int > output ;
632+ if ( Ssse3 . IsSupported )
633+ {
634+ output = Sse2 . MultiplyAddAdjacent ( merge_ab_and_bc , mergeConstant1 ) ;
635+ }
636+ else
637+ {
638+ Vector128 < int > ievens = AdvSimd . ShiftLeftLogicalWideningLower ( AdvSimd . Arm64 . UnzipEven ( merge_ab_and_bc , one . AsInt16 ( ) ) . GetLower ( ) , 12 ) ;
639+ Vector128 < int > iodds = AdvSimd . Arm64 . TransposeOdd ( merge_ab_and_bc , Vector128 < short > . Zero ) . AsInt32 ( ) ;
640+ output = Vector128 . Add ( ievens , iodds ) . AsInt32 ( ) ;
641+ }
623642 // 00000000 JJJJJJjj KKKKkkkk LLllllll
624643 // 00000000 GGGGGGgg HHHHhhhh IIiiiiii
625644 // 00000000 DDDDDDdd EEEEeeee FFffffff
626645 // 00000000 AAAAAAaa BBBBbbbb CCcccccc
627646
628647 // Pack bytes together:
629- str = Ssse3 . Shuffle ( output . AsSByte ( ) , packBytesMask ) ;
648+ str = SimdShuffle ( output . AsByte ( ) , packBytesMask . AsByte ( ) , mask8F ) ;
630649 // 00000000 00000000 00000000 00000000
631650 // LLllllll KKKKkkkk JJJJJJjj IIiiiiii
632651 // HHHHhhhh GGGGGGgg FFffffff EEEEeeee
633652 // DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
634653
635654 AssertWrite < Vector128 < sbyte > > ( dest , destStart , destLength ) ;
636- Sse2 . Store ( dest , str . AsByte ( ) ) ;
655+ str . Store ( dest ) ;
637656
638657 src += 16 ;
639658 dest += 12 ;
0 commit comments