@@ -564,6 +564,7 @@ public static void PredictorInverseTransform(
564564 int mask = tileWidth - 1 ;
565565 int tilesPerRow = SubSampleSize ( width , transform . Bits ) ;
566566 int predictorModeIdxBase = ( y >> transform . Bits ) * tilesPerRow ;
567+ Span < short > scratch = stackalloc short [ 8 ] ;
567568 while ( y < yEnd )
568569 {
569570 int predictorModeIdx = predictorModeIdxBase ;
@@ -621,7 +622,7 @@ public static void PredictorInverseTransform(
621622 PredictorAdd10 ( input + x , output + x - width , xEnd - x , output + x ) ;
622623 break ;
623624 case 11 :
624- PredictorAdd11 ( input + x , output + x - width , xEnd - x , output + x ) ;
625+ PredictorAdd11 ( input + x , output + x - width , xEnd - x , output + x , scratch ) ;
625626 break ;
626627 case 12 :
627628 PredictorAdd12 ( input + x , output + x - width , xEnd - x , output + x ) ;
@@ -987,11 +988,11 @@ private static void PredictorAdd10(uint* input, uint* upper, int numberOfPixels,
987988 }
988989
// PredictorAdd11: for each x in [0, numberOfPixels), computes a prediction with
// Predictor11 from the previously written pixel output[x - 1] and the row pointer
// upper + x, then stores AddPixels(input[x], pred) into output[x].
// Note: at x == 0 this reads output[-1], so the caller must pass an output pointer
// that has a valid element immediately before it.
// The added scratch parameter is not used here directly; it is only forwarded to
// Predictor11 on every iteration.
989990 [ MethodImpl ( InliningOptions . ShortMethod ) ]
990- private static void PredictorAdd11 ( uint * input , uint * upper , int numberOfPixels , uint * output )
991+ private static void PredictorAdd11 ( uint * input , uint * upper , int numberOfPixels , uint * output , Span < short > scratch )
991992 {
992993 for ( int x = 0 ; x < numberOfPixels ; x ++ )
993994 {
994- uint pred = Predictor11 ( output [ x - 1 ] , upper + x ) ;
995+ uint pred = Predictor11 ( output [ x - 1 ] , upper + x , scratch ) ;
995996 output [ x ] = AddPixels ( input [ x ] , pred ) ;
996997 }
997998 }
@@ -1044,7 +1045,7 @@ private static void PredictorAdd13(uint* input, uint* upper, int numberOfPixels,
10441045 public static uint Predictor10 ( uint left , uint * top ) => Average4 ( left , top [ - 1 ] , top [ 0 ] , top [ 1 ] ) ;
10451046
// Predictor11: forwards to Select with a = top[0], b = left, c = top[-1], so the
// result is always either top[0] or left.
// Reads top[-1]; the caller must pass a pointer with a valid element before it.
// scratch is passed through unchanged to Select.
10461047 [ MethodImpl ( InliningOptions . ShortMethod ) ]
1047- public static uint Predictor11 ( uint left , uint * top ) => Select ( top [ 0 ] , left , top [ - 1 ] ) ;
1048+ public static uint Predictor11 ( uint left , uint * top , Span < short > scratch ) => Select ( top [ 0 ] , left , top [ - 1 ] , scratch ) ;
10481049
// Predictor12: forwards to ClampedAddSubtractFull with (left, top[0], top[-1]).
// Reads top[-1]; the caller must pass a pointer with a valid element before it.
10491050 [ MethodImpl ( InliningOptions . ShortMethod ) ]
10501051 public static uint Predictor12 ( uint left , uint * top ) => ClampedAddSubtractFull ( left , top [ 0 ] , top [ - 1 ] ) ;
@@ -1161,11 +1162,11 @@ public static void PredictorSub10(uint* input, uint* upper, int numPixels, uint*
11611162 }
11621163
// PredictorSub11: for each x in [0, numPixels), computes a prediction with
// Predictor11 from the preceding source pixel input[x - 1] and the row pointer
// upper + x, then stores SubPixels(input[x], pred) into output[x].
// Unlike PredictorAdd11, the left neighbour comes from input, not from output.
// Note: at x == 0 this reads input[-1], so the caller must pass an input pointer
// that has a valid element immediately before it.
// scratch is only forwarded to Predictor11.
11631164 [ MethodImpl ( InliningOptions . ShortMethod ) ]
1164- public static void PredictorSub11 ( uint * input , uint * upper , int numPixels , uint * output )
1165+ public static void PredictorSub11 ( uint * input , uint * upper , int numPixels , uint * output , Span < short > scratch )
11651166 {
11661167 for ( int x = 0 ; x < numPixels ; x ++ )
11671168 {
1168- uint pred = Predictor11 ( input [ x - 1 ] , upper + x ) ;
1169+ uint pred = Predictor11 ( input [ x - 1 ] , upper + x , scratch ) ;
11691170 output [ x ] = SubPixels ( input [ x ] , pred ) ;
11701171 }
11711172 }
@@ -1253,14 +1254,43 @@ private static uint ClampedAddSubtractHalf(uint c0, uint c1, uint c2)
12531254 private static Vector128 < int > MkCst16 ( int hi , int lo ) => Vector128 . Create ( ( hi << 16 ) | ( lo & 0xffff ) ) ;
12541255#endif
12551256
// Select: returns either a or b. A signed score is accumulated over the four
// 8-bit channels of the packed pixels; a is returned when the score is <= 0,
// otherwise b.
// In the SSE2 path the per-channel term is |b - c| - |a - c| (see the Subtract of
// pb and pa below). Despite its name, paMinusPb therefore holds sum(pb) - sum(pa).
// The scalar path sums Sub3 over the same four channels; Sub3 is not visible in
// this hunk - presumably it yields the same per-channel quantity (TODO confirm).
// scratch: temporary storage for the SSE2 path. Sse2.Store writes a whole 128-bit
// vector (8 x 16-bit lanes), so scratch needs at least 8 elements; the visible
// call site allocates stackalloc short[8]. The scalar path does not touch scratch.
1256- private static uint Select ( uint a , uint b , uint c )
1257+ private static uint Select ( uint a , uint b , uint c , Span < short > scratch )
12571258 {
1258- int paMinusPb =
1259- Sub3 ( ( int ) ( a >> 24 ) , ( int ) ( b >> 24 ) , ( int ) ( c >> 24 ) ) +
1260- Sub3 ( ( int ) ( ( a >> 16 ) & 0xff ) , ( int ) ( ( b >> 16 ) & 0xff ) , ( int ) ( ( c >> 16 ) & 0xff ) ) +
1261- Sub3 ( ( int ) ( ( a >> 8 ) & 0xff ) , ( int ) ( ( b >> 8 ) & 0xff ) , ( int ) ( ( c >> 8 ) & 0xff ) ) +
1262- Sub3 ( ( int ) ( a & 0xff ) , ( int ) ( b & 0xff ) , ( int ) ( c & 0xff ) ) ;
1263- return paMinusPb <= 0 ? a : b ;
1259+ #if SUPPORTS_RUNTIME_INTRINSICS
1260+ if ( Sse2 . IsSupported )
1261+ {
1262+ Span < short > output = scratch ;
1263+ fixed ( short * p = output )
1264+ {
// Each pixel is placed in the low 32 bits of a vector and viewed as bytes.
1265+ Vector128 < byte > a0 = Sse2 . ConvertScalarToVector128UInt32 ( a ) . AsByte ( ) ;
1266+ Vector128 < byte > b0 = Sse2 . ConvertScalarToVector128UInt32 ( b ) . AsByte ( ) ;
1267+ Vector128 < byte > c0 = Sse2 . ConvertScalarToVector128UInt32 ( c ) . AsByte ( ) ;
// Saturating subtraction in both directions leaves one side zero per byte,
// so OR-ing the two results gives the per-byte absolute difference.
1268+ Vector128 < byte > ac0 = Sse2 . SubtractSaturate ( a0 , c0 ) ;
1269+ Vector128 < byte > ca0 = Sse2 . SubtractSaturate ( c0 , a0 ) ;
1270+ Vector128 < byte > bc0 = Sse2 . SubtractSaturate ( b0 , c0 ) ;
1271+ Vector128 < byte > cb0 = Sse2 . SubtractSaturate ( c0 , b0 ) ;
1272+ Vector128 < byte > ac = Sse2 . Or ( ac0 , ca0 ) ;
1273+ Vector128 < byte > bc = Sse2 . Or ( bc0 , cb0 ) ;
// Interleaving with zero widens the low bytes to 16-bit lanes so the
// following subtraction can go negative without losing information.
1274+ Vector128 < byte > pa = Sse2 . UnpackLow ( ac , Vector128 < byte > . Zero ) ; // |a - c|
1275+ Vector128 < byte > pb = Sse2 . UnpackLow ( bc , Vector128 < byte > . Zero ) ; // |b - c|
1276+ Vector128 < ushort > diff = Sse2 . Subtract ( pb . AsUInt16 ( ) , pa . AsUInt16 ( ) ) ;
// Writes all 8 lanes (16 bytes) through the pinned scratch pointer.
1277+ Sse2 . Store ( ( ushort * ) p , diff ) ;
1278+ }
1279+
// Only lanes 0..3 carry the four channels of the pixel. They are read back as
// signed shorts, so wrapped (negative) differences contribute negatively.
1280+ int paMinusPb = output [ 0 ] + output [ 1 ] + output [ 2 ] + output [ 3 ] ;
1281+
1282+ return ( paMinusPb <= 0 ) ? a : b ;
1283+ }
1284+ else
1285+ #endif
1286+ {
// Scalar fallback: the pre-change implementation, one Sub3 call per channel.
1287+ int paMinusPb =
1288+ Sub3 ( ( int ) ( a >> 24 ) , ( int ) ( b >> 24 ) , ( int ) ( c >> 24 ) ) +
1289+ Sub3 ( ( int ) ( ( a >> 16 ) & 0xff ) , ( int ) ( ( b >> 16 ) & 0xff ) , ( int ) ( ( c >> 16 ) & 0xff ) ) +
1290+ Sub3 ( ( int ) ( ( a >> 8 ) & 0xff ) , ( int ) ( ( b >> 8 ) & 0xff ) , ( int ) ( ( c >> 8 ) & 0xff ) ) +
1291+ Sub3 ( ( int ) ( a & 0xff ) , ( int ) ( b & 0xff ) , ( int ) ( c & 0xff ) ) ;
1292+ return paMinusPb <= 0 ? a : b ;
1293+ }
12641294 }
12651295
12661296 [ MethodImpl ( InliningOptions . ShortMethod ) ]
0 commit comments