-
-
Notifications
You must be signed in to change notification settings - Fork 887
Add SSE2 version of Mean16x4 #1814
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
765f5a2
8b8871b
984971e
0c96e37
e8c0d2c
3c9c1bb
0ca9d43
9ab9e75
1418e53
9e143ef
3cfa040
84732bf
50013d7
f0cb89e
1452ba0
7d8225b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,6 +15,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy | |
| { | ||
| internal static unsafe class LossyUtils | ||
| { | ||
| #if SUPPORTS_RUNTIME_INTRINSICS | ||
| private static readonly Vector128<byte> Mean16x4Mask = Vector128.Create((short)0x00ff).AsByte(); | ||
| #endif | ||
|
|
||
| [MethodImpl(InliningOptions.ShortMethod)] | ||
| public static int Vp8Sse16X16(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 16); /* Forwards a and b to GetSse with both size arguments fixed at 16; presumably the sum of squared errors over a 16x16 block - confirm against GetSse. */ | ||
|
|
||
|
|
@@ -938,26 +942,58 @@ public static void HFilter8i(Span<byte> u, Span<byte> v, int offset, int stride, | |
| FilterLoop24(v, offsetPlus4, 1, stride, 8, thresh, ithresh, hevThresh); | ||
| } | ||
|
|
||
| [MethodImpl(InliningOptions.ShortMethod)] | ||
| public static uint LoadUv(byte u, byte v) => | ||
| (uint)(u | (v << 16)); // Packs u into the low 16 bits and v into the high 16 bits of a single 32-bit value so both can be processed together. | ||
|
|
||
| [MethodImpl(InliningOptions.ShortMethod)] | ||
| public static void YuvToBgr(int y, int u, int v, Span<byte> bgr) | ||
| public static void Mean16x4(Span<byte> input, Span<uint> dc, Span<ushort> tmp) /* Sums the pixels of four side-by-side 4x4 blocks (rows are WebpConstants.Bps bytes apart) and stores one undivided sum per block in dc[0..3]; tmp is scratch space for the SSE2 path. */ | ||
| { | ||
| bgr[0] = (byte)YuvToB(y, u); | ||
| bgr[1] = (byte)YuvToG(y, u, v); | ||
| bgr[2] = (byte)YuvToR(y, v); | ||
| } | ||
|
|
||
| [MethodImpl(InliningOptions.ShortMethod)] | ||
| public static int YuvToB(int y, int u) => Clip8(MultHi(y, 19077) + MultHi(u, 33050) - 17685); | ||
|
|
||
| [MethodImpl(InliningOptions.ShortMethod)] | ||
| public static int YuvToG(int y, int u, int v) => Clip8(MultHi(y, 19077) - MultHi(u, 6419) - MultHi(v, 13320) + 8708); | ||
| #if SUPPORTS_RUNTIME_INTRINSICS | ||
| if (Sse2.IsSupported) | ||
| { | ||
| tmp.Clear(); /* NOTE(review): the vector store below rewrites tmp[0..7], the only elements read here, so this clear looks redundant unless callers rely on the rest of tmp being zeroed - confirm. */ | ||
| Vector128<byte> a0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(input)); /* 16 bytes of row 0; the reinterpreting read is not bounds-checked. */ | ||
| Vector128<byte> a1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps, 16))); /* row 1 */ | ||
| Vector128<byte> a2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps * 2, 16))); /* row 2 */ | ||
| Vector128<byte> a3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps * 3, 16))); /* row 3 */ | ||
| Vector128<short> b0 = Sse2.ShiftRightLogical(a0.AsInt16(), 8); // hi byte | ||
| Vector128<short> b1 = Sse2.ShiftRightLogical(a1.AsInt16(), 8); | ||
| Vector128<short> b2 = Sse2.ShiftRightLogical(a2.AsInt16(), 8); | ||
| Vector128<short> b3 = Sse2.ShiftRightLogical(a3.AsInt16(), 8); | ||
| Vector128<byte> c0 = Sse2.And(a0, Mean16x4Mask); // lo byte | ||
| Vector128<byte> c1 = Sse2.And(a1, Mean16x4Mask); | ||
| Vector128<byte> c2 = Sse2.And(a2, Mean16x4Mask); | ||
| Vector128<byte> c3 = Sse2.And(a3, Mean16x4Mask); | ||
| Vector128<int> d0 = Sse2.Add(b0.AsInt32(), c0.AsInt32()); /* hi + lo: each 16-bit lane now holds the sum of two adjacent pixels; a lane sum is at most 510, so adding as 32-bit lanes never carries across a 16-bit boundary. */ | ||
| Vector128<int> d1 = Sse2.Add(b1.AsInt32(), c1.AsInt32()); | ||
| Vector128<int> d2 = Sse2.Add(b2.AsInt32(), c2.AsInt32()); | ||
| Vector128<int> d3 = Sse2.Add(b3.AsInt32(), c3.AsInt32()); | ||
| Vector128<int> e0 = Sse2.Add(d0, d1); /* rows 0 + 1 */ | ||
| Vector128<int> e1 = Sse2.Add(d2, d3); /* rows 2 + 3 */ | ||
| Vector128<int> f0 = Sse2.Add(e0, e1); /* all four rows: eight 16-bit column-pair sums */ | ||
| ref ushort outputRef = ref MemoryMarshal.GetReference(tmp); | ||
| Unsafe.As<ushort, Vector128<ushort>>(ref outputRef) = f0.AsUInt16(); /* writes the eight sums to tmp[0..7] without a bounds check */ | ||
|
|
||
| dc[0] = (uint)(tmp[1] + tmp[0]); /* two column-pair sums make one 4x4 block total */ | ||
|
||
| dc[1] = (uint)(tmp[3] + tmp[2]); | ||
| dc[2] = (uint)(tmp[5] + tmp[4]); | ||
| dc[3] = (uint)(tmp[7] + tmp[6]); | ||
|
||
| } | ||
| else | ||
| #endif | ||
| { | ||
| for (int k = 0; k < 4; k++) /* scalar path: one iteration per 4x4 block */ | ||
| { | ||
| uint avg = 0; /* running sum of the block; no division is applied despite the name */ | ||
| for (int y = 0; y < 4; y++) | ||
| { | ||
| for (int x = 0; x < 4; x++) | ||
| { | ||
| avg += input[x + (y * WebpConstants.Bps)]; | ||
| } | ||
| } | ||
|
|
||
| [MethodImpl(InliningOptions.ShortMethod)] | ||
| public static int YuvToR(int y, int v) => Clip8(MultHi(y, 19077) + MultHi(v, 26149) - 14234); | ||
| dc[k] = avg; | ||
| input = input.Slice(4); // go to next 4x4 block. | ||
| } | ||
| } | ||
| } | ||
|
|
||
| [MethodImpl(InliningOptions.ShortMethod)] | ||
| public static byte Avg2(byte a, byte b) => (byte)((a + b + 1) >> 1); /* Average of a and b; the +1 before the shift rounds halves up. */ | ||
|
|
@@ -1163,9 +1199,6 @@ private static bool Hev(Span<byte> p, int offset, int step, int thresh) | |
| return WebpLookupTables.Abs0(p1 - p0) > thresh || WebpLookupTables.Abs0(q1 - q0) > thresh; | ||
| } | ||
|
|
||
| [MethodImpl(InliningOptions.ShortMethod)] | ||
| private static int MultHi(int v, int coeff) => (v * coeff) >> 8; /* Multiplies v by coeff and shifts the product right by 8 bits. */ | ||
|
|
||
| [MethodImpl(InliningOptions.ShortMethod)] | ||
| private static void Store(Span<byte> dst, int x, int y, int v) | ||
| { | ||
|
|
@@ -1188,13 +1221,6 @@ private static void Store2(Span<byte> dst, int y, int dc, int d, int c) | |
| [MethodImpl(InliningOptions.ShortMethod)] | ||
| private static int Mul2(int a) => (a * 35468) >> 16; /* Multiplies a by 35468 and shifts right by 16 bits, i.e. scales by the fixed-point factor 35468 / 65536. */ | ||
|
|
||
| [MethodImpl(InliningOptions.ShortMethod)] | ||
| private static byte Clip8(int v) /* Maps v to a byte: v >> 6 when v lies in [0, 0x3FFF], otherwise 0 for negative v and 255 for larger v. */ | ||
| { | ||
| int yuvMask = (256 << 6) - 1; /* 0x3FFF, the low 14 bits */ | ||
| return (byte)((v & ~yuvMask) == 0 ? v >> 6 : v < 0 ? 0 : 255); /* no bits outside the mask means v is in range; otherwise clamp by sign */ | ||
| } | ||
|
|
||
| [MethodImpl(InliningOptions.ShortMethod)] | ||
| private static void Put8x8uv(byte value, Span<byte> dst) | ||
| { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -299,5 +299,36 @@ private static int ClipUv(int uv, int rounding) | |
| uv = (uv + rounding + (128 << (YuvFix + 2))) >> (YuvFix + 2); | ||
| return (uv & ~0xff) == 0 ? uv : uv < 0 ? 0 : 255; | ||
| } | ||
|
|
||
| [MethodImpl(InliningOptions.ShortMethod)] | ||
| public static uint LoadUv(byte u, byte v) => | ||
| (uint)(u | (v << 16)); // Packs u into the low 16 bits and v into the high 16 bits of a single 32-bit value so both can be processed together. | ||
|
|
||
| [MethodImpl(InliningOptions.ShortMethod)] | ||
| public static void YuvToBgr(int y, int u, int v, Span<byte> bgr) /* Stores the results of YuvToB, YuvToG and YuvToR, each cast to byte, in bgr[0], bgr[1] and bgr[2]. */ | ||
| { | ||
| bgr[0] = (byte)YuvToB(y, u); | ||
|
||
| bgr[1] = (byte)YuvToG(y, u, v); | ||
| bgr[2] = (byte)YuvToR(y, v); | ||
| } | ||
|
|
||
| [MethodImpl(InliningOptions.ShortMethod)] | ||
| public static int YuvToB(int y, int u) => Clip8(MultHi(y, 19077) + MultHi(u, 33050) - 17685); /* Fixed-point combination of y and u, clamped to a byte by Clip8; presumably the blue component - confirm against callers. */ | ||
|
|
||
| [MethodImpl(InliningOptions.ShortMethod)] | ||
| public static int YuvToG(int y, int u, int v) => Clip8(MultHi(y, 19077) - MultHi(u, 6419) - MultHi(v, 13320) + 8708); /* Fixed-point combination of y, u and v, clamped to a byte by Clip8; presumably the green component - confirm against callers. */ | ||
|
|
||
| [MethodImpl(InliningOptions.ShortMethod)] | ||
| public static int YuvToR(int y, int v) => Clip8(MultHi(y, 19077) + MultHi(v, 26149) - 14234); /* Fixed-point combination of y and v, clamped to a byte by Clip8; presumably the red component - confirm against callers. */ | ||
|
|
||
| [MethodImpl(InliningOptions.ShortMethod)] | ||
| private static int MultHi(int v, int coeff) => (v * coeff) >> 8; /* Multiplies v by coeff and shifts the product right by 8 bits. */ | ||
|
|
||
| [MethodImpl(InliningOptions.ShortMethod)] | ||
| private static byte Clip8(int v) /* Maps v to a byte: v >> 6 when v lies in [0, 0x3FFF], otherwise 0 for negative v and 255 for larger v. */ | ||
| { | ||
| int yuvMask = (256 << 6) - 1; /* 0x3FFF, the low 14 bits */ | ||
| return (byte)((v & ~yuvMask) == 0 ? v >> 6 : v < 0 ? 0 : 255); /* no bits outside the mask means v is in range; otherwise clamp by sign */ | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this really needed? We overwrite the contents at the end anyway.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, I think you are right; it's not needed.