Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 54 additions & 28 deletions src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
{
internal static unsafe class LossyUtils
{
#if SUPPORTS_RUNTIME_INTRINSICS
// Every 16-bit lane of this vector holds 0x00ff. AND-ing a byte vector with it keeps the
// low 8 bits of each 16-bit lane and zeroes the high 8 bits (used by Mean16x4).
private static readonly Vector128<byte> Mean16x4Mask = Vector128.Create((short)0x00ff).AsByte();
#endif

/// <summary>
/// Forwards both buffers to <c>GetSse</c>, passing 16 for each of its two size arguments.
/// </summary>
/// <param name="a">The first buffer.</param>
/// <param name="b">The second buffer.</param>
/// <returns>The value returned by <c>GetSse</c>.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8Sse16X16(Span<byte> a, Span<byte> b)
{
    const int blockSize = 16;
    return GetSse(a, b, blockSize, blockSize);
}

Expand Down Expand Up @@ -938,26 +942,58 @@ public static void HFilter8i(Span<byte> u, Span<byte> v, int offset, int stride,
FilterLoop24(v, offsetPlus4, 1, stride, 8, thresh, ithresh, hevThresh);
}

/// <summary>
/// Packs a U and a V sample into one 32-bit value so both can be processed together:
/// <paramref name="u"/> occupies bits 0-7 and <paramref name="v"/> occupies bits 16-23.
/// </summary>
/// <param name="u">The U sample.</param>
/// <param name="v">The V sample.</param>
/// <returns>The packed value.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static uint LoadUv(byte u, byte v)
{
    // Each sample gets its own 16-bit half of the 32-bit word.
    int packed = (v << 16) | u;
    return (uint)packed;
}

[MethodImpl(InliningOptions.ShortMethod)]
public static void YuvToBgr(int y, int u, int v, Span<byte> bgr)
public static void Mean16x4(Span<byte> input, Span<uint> dc, Span<ushort> tmp)
{
bgr[0] = (byte)YuvToB(y, u);
bgr[1] = (byte)YuvToG(y, u, v);
bgr[2] = (byte)YuvToR(y, v);
}

/// <summary>
/// Combines the scaled <paramref name="y"/> and <paramref name="u"/> terms with a constant
/// offset and passes the sum through <c>Clip8</c>.
/// </summary>
/// <remarks>NOTE(review): the constants look like fixed-point conversion coefficients - confirm against the reference implementation.</remarks>
[MethodImpl(InliningOptions.ShortMethod)]
public static int YuvToB(int y, int u)
{
    int lumaTerm = MultHi(y, 19077);
    int chromaTerm = MultHi(u, 33050);
    return Clip8(lumaTerm + chromaTerm - 17685);
}

/// <summary>
/// Takes the scaled <paramref name="y"/> term, subtracts the scaled <paramref name="u"/> and
/// <paramref name="v"/> terms, adds a constant offset and passes the result through <c>Clip8</c>.
/// </summary>
/// <remarks>NOTE(review): the constants look like fixed-point conversion coefficients - confirm against the reference implementation.</remarks>
[MethodImpl(InliningOptions.ShortMethod)]
public static int YuvToG(int y, int u, int v)
{
    int lumaTerm = MultHi(y, 19077);
    int uTerm = MultHi(u, 6419);
    int vTerm = MultHi(v, 13320);
    return Clip8(lumaTerm - uTerm - vTerm + 8708);
}
#if SUPPORTS_RUNTIME_INTRINSICS
if (Sse2.IsSupported)
{
tmp.Clear();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this really needed? We overwrite the contents at the end anyway.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I think you are right; it's not needed.

Vector128<byte> a0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(input));
Vector128<byte> a1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps, 16)));
Vector128<byte> a2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps * 2, 16)));
Vector128<byte> a3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(input.Slice(WebpConstants.Bps * 3, 16)));
Vector128<short> b0 = Sse2.ShiftRightLogical(a0.AsInt16(), 8); // hi byte
Vector128<short> b1 = Sse2.ShiftRightLogical(a1.AsInt16(), 8);
Vector128<short> b2 = Sse2.ShiftRightLogical(a2.AsInt16(), 8);
Vector128<short> b3 = Sse2.ShiftRightLogical(a3.AsInt16(), 8);
Vector128<byte> c0 = Sse2.And(a0, Mean16x4Mask); // lo byte
Vector128<byte> c1 = Sse2.And(a1, Mean16x4Mask);
Vector128<byte> c2 = Sse2.And(a2, Mean16x4Mask);
Vector128<byte> c3 = Sse2.And(a3, Mean16x4Mask);
Vector128<int> d0 = Sse2.Add(b0.AsInt32(), c0.AsInt32());
Vector128<int> d1 = Sse2.Add(b1.AsInt32(), c1.AsInt32());
Vector128<int> d2 = Sse2.Add(b2.AsInt32(), c2.AsInt32());
Vector128<int> d3 = Sse2.Add(b3.AsInt32(), c3.AsInt32());
Vector128<int> e0 = Sse2.Add(d0, d1);
Vector128<int> e1 = Sse2.Add(d2, d3);
Vector128<int> f0 = Sse2.Add(e0, e1);
ref ushort outputRef = ref MemoryMarshal.GetReference(tmp);
Unsafe.As<ushort, Vector128<ushort>>(ref outputRef) = f0.AsUInt16();

dc[0] = (uint)(tmp[1] + tmp[0]);
Copy link
Member

@JimBobSquarePants JimBobSquarePants Nov 9, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks to me like if you reverse these span assignments you'll cut out 9 of 12 bounds checks.

dc[1] = (uint)(tmp[3] + tmp[2]);
dc[2] = (uint)(tmp[5] + tmp[4]);
dc[3] = (uint)(tmp[7] + tmp[6]);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this the same as _mm_hadd_epi16 aka. Ssse3.HorizontalAdd?

I'm afraid 12 span indexer bounds checks have a measurable impact here. All of them seem unnecessary, since tmp is always of size 16 and dc is always of size 4. If we can't find any matching HorizontalAdd for this, maybe we should consider passing tmp as a pointer and indexing dc with Unsafe.* stuff.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it is the same as Ssse3.HorizontalAdd. Good catch.

}
else
#endif
{
for (int k = 0; k < 4; k++)
{
uint avg = 0;
for (int y = 0; y < 4; y++)
{
for (int x = 0; x < 4; x++)
{
avg += input[x + (y * WebpConstants.Bps)];
}
}

/// <summary>
/// Combines the scaled <paramref name="y"/> and <paramref name="v"/> terms with a constant
/// offset and passes the sum through <c>Clip8</c>.
/// </summary>
/// <remarks>NOTE(review): the constants look like fixed-point conversion coefficients - confirm against the reference implementation.</remarks>
[MethodImpl(InliningOptions.ShortMethod)]
public static int YuvToR(int y, int v)
{
    int lumaTerm = MultHi(y, 19077);
    int chromaTerm = MultHi(v, 26149);
    return Clip8(lumaTerm + chromaTerm - 14234);
}
dc[k] = avg;
input = input.Slice(4); // go to next 4x4 block.
}
}
}

/// <summary>
/// Returns the average of two bytes, rounding halves up.
/// </summary>
/// <param name="a">The first value.</param>
/// <param name="b">The second value.</param>
/// <returns><c>(a + b + 1) / 2</c> as a byte.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static byte Avg2(byte a, byte b)
{
    // The +1 makes the halving round up; the sum is computed as int so it cannot wrap.
    int roundedSum = a + b + 1;
    return (byte)(roundedSum >> 1);
}
Expand Down Expand Up @@ -1163,9 +1199,6 @@ private static bool Hev(Span<byte> p, int offset, int step, int thresh)
return WebpLookupTables.Abs0(p1 - p0) > thresh || WebpLookupTables.Abs0(q1 - q0) > thresh;
}

/// <summary>
/// Multiplies <paramref name="v"/> by <paramref name="coeff"/> and drops the low 8 bits of the product.
/// </summary>
/// <param name="v">The value to scale.</param>
/// <param name="coeff">The multiplier.</param>
/// <returns>The product arithmetically shifted right by 8.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static int MultHi(int v, int coeff)
{
    int product = v * coeff;
    return product >> 8;
}

[MethodImpl(InliningOptions.ShortMethod)]
private static void Store(Span<byte> dst, int x, int y, int v)
{
Expand All @@ -1188,13 +1221,6 @@ private static void Store2(Span<byte> dst, int y, int dc, int d, int c)
/// <summary>
/// Multiplies <paramref name="a"/> by 35468 and drops the low 16 bits of the product.
/// </summary>
/// <param name="a">The value to scale.</param>
/// <returns>The product arithmetically shifted right by 16.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static int Mul2(int a)
{
    const int multiplier = 35468;
    int product = a * multiplier;
    return product >> 16;
}

/// <summary>
/// Reduces a value carrying 6 extra low bits to a byte, saturating when it is out of range.
/// </summary>
/// <param name="v">The value to clip.</param>
/// <returns>
/// <c>v &gt;&gt; 6</c> when <paramref name="v"/> lies in [0, 16383]; otherwise 0 for negative
/// input and 255 for anything larger.
/// </returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static byte Clip8(int v)
{
    // Low 14 bits: 8 bits of result plus the 6 bits that are shifted away.
    const int yuvMask = (256 << 6) - 1;

    bool outOfRange = (v & ~yuvMask) != 0;
    if (outOfRange)
    {
        return (byte)(v < 0 ? 0 : 255);
    }

    return (byte)(v >> 6);
}

[MethodImpl(InliningOptions.ShortMethod)]
private static void Put8x8uv(byte value, Span<byte> dst)
{
Expand Down
25 changes: 4 additions & 21 deletions src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -357,15 +357,16 @@ public int FastMbAnalyze(int quality)
int q = quality;
int kThreshold = 8 + ((17 - 8) * q / 100);
int k;
uint[] dc = new uint[16];
Span<uint> dc = stackalloc uint[16];
Span<ushort> tmp = stackalloc ushort[16];
uint m;
uint m2;
for (k = 0; k < 16; k += 4)
{
this.Mean16x4(this.YuvIn.AsSpan(YOffEnc + (k * WebpConstants.Bps)), dc.AsSpan(k));
LossyUtils.Mean16x4(this.YuvIn.AsSpan(YOffEnc + (k * WebpConstants.Bps)), dc.Slice(k, 4), tmp);
}

for (m = 0, m2 = 0, k = 0; k < 16; ++k)
for (m = 0, m2 = 0, k = 0; k < 16; k++)
{
m += dc[k];
m2 += dc[k] * dc[k];
Expand Down Expand Up @@ -823,24 +824,6 @@ public void BytesToNz()
this.Nz[this.nzIdx] = nz;
}

/// <summary>
/// Sums the samples of four horizontally adjacent 4x4 blocks and stores one sum per block.
/// </summary>
/// <param name="input">The source samples; consecutive rows are <c>WebpConstants.Bps</c> bytes apart.</param>
/// <param name="dc">Receives the four block sums at indices 0 to 3.</param>
private void Mean16x4(Span<byte> input, Span<uint> dc)
{
    for (int block = 0; block < 4; block++)
    {
        // Each block starts 4 bytes further along the row than the previous one.
        Span<byte> src = input.Slice(block * 4);
        uint sum = 0;
        for (int row = 0; row < 4; row++)
        {
            int rowStart = row * WebpConstants.Bps;
            for (int col = 0; col < 4; col++)
            {
                sum += src[rowStart + col];
            }
        }

        dc[block] = sum;
    }
}

private void ImportBlock(Span<byte> src, int srcStride, Span<byte> dst, int w, int h, int size)
{
int dstIdx = 0;
Expand Down
24 changes: 12 additions & 12 deletions src/ImageSharp/Formats/Webp/Lossy/WebpLossyDecoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -747,21 +747,21 @@ private void UpSample(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span
{
int xStep = 3;
int lastPixelPair = (len - 1) >> 1;
uint tluv = LossyUtils.LoadUv(topU[0], topV[0]); // top-left sample
uint luv = LossyUtils.LoadUv(curU[0], curV[0]); // left-sample
uint tluv = YuvConversion.LoadUv(topU[0], topV[0]); // top-left sample
uint luv = YuvConversion.LoadUv(curU[0], curV[0]); // left-sample
uint uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2;
LossyUtils.YuvToBgr(topY[0], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst);
YuvConversion.YuvToBgr(topY[0], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst);

if (bottomY != null)
{
uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2;
LossyUtils.YuvToBgr(bottomY[0], (int)uv0 & 0xff, (int)(uv0 >> 16), bottomDst);
YuvConversion.YuvToBgr(bottomY[0], (int)uv0 & 0xff, (int)(uv0 >> 16), bottomDst);
}

for (int x = 1; x <= lastPixelPair; x++)
{
uint tuv = LossyUtils.LoadUv(topU[x], topV[x]); // top sample
uint uv = LossyUtils.LoadUv(curU[x], curV[x]); // sample
uint tuv = YuvConversion.LoadUv(topU[x], topV[x]); // top sample
uint uv = YuvConversion.LoadUv(curU[x], curV[x]); // sample

// Precompute invariant values associated with first and second diagonals.
uint avg = tluv + tuv + luv + uv + 0x00080008u;
Expand All @@ -770,15 +770,15 @@ private void UpSample(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span
uv0 = (diag12 + tluv) >> 1;
uint uv1 = (diag03 + tuv) >> 1;
int xMul2 = x * 2;
LossyUtils.YuvToBgr(topY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((xMul2 - 1) * xStep));
LossyUtils.YuvToBgr(topY[xMul2 - 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), topDst.Slice((xMul2 - 0) * xStep));
YuvConversion.YuvToBgr(topY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((xMul2 - 1) * xStep));
YuvConversion.YuvToBgr(topY[xMul2 - 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), topDst.Slice((xMul2 - 0) * xStep));

if (bottomY != null)
{
uv0 = (diag03 + luv) >> 1;
uv1 = (diag12 + uv) >> 1;
LossyUtils.YuvToBgr(bottomY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((xMul2 - 1) * xStep));
LossyUtils.YuvToBgr(bottomY[xMul2 + 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), bottomDst.Slice((xMul2 + 0) * xStep));
YuvConversion.YuvToBgr(bottomY[xMul2 - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((xMul2 - 1) * xStep));
YuvConversion.YuvToBgr(bottomY[xMul2 + 0], (int)(uv1 & 0xff), (int)(uv1 >> 16), bottomDst.Slice((xMul2 + 0) * xStep));
}

tluv = tuv;
Expand All @@ -788,11 +788,11 @@ private void UpSample(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span
if ((len & 1) == 0)
{
uv0 = ((3 * tluv) + luv + 0x00020002u) >> 2;
LossyUtils.YuvToBgr(topY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((len - 1) * xStep));
YuvConversion.YuvToBgr(topY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), topDst.Slice((len - 1) * xStep));
if (bottomY != null)
{
uv0 = ((3 * luv) + tluv + 0x00020002u) >> 2;
LossyUtils.YuvToBgr(bottomY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((len - 1) * xStep));
YuvConversion.YuvToBgr(bottomY[len - 1], (int)(uv0 & 0xff), (int)(uv0 >> 16), bottomDst.Slice((len - 1) * xStep));
}
}
}
Expand Down
31 changes: 31 additions & 0 deletions src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
Original file line number Diff line number Diff line change
Expand Up @@ -299,5 +299,36 @@ private static int ClipUv(int uv, int rounding)
uv = (uv + rounding + (128 << (YuvFix + 2))) >> (YuvFix + 2);
return (uv & ~0xff) == 0 ? uv : uv < 0 ? 0 : 255;
}

/// <summary>
/// Stashes a U and a V sample in a single 32-bit value, 16 bits reserved for each:
/// <paramref name="u"/> lands in the low half and <paramref name="v"/> in the high half.
/// </summary>
/// <param name="u">The U sample.</param>
/// <param name="v">The V sample.</param>
/// <returns>The combined value.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static uint LoadUv(byte u, byte v)
{
    int highHalf = v << 16;
    int combined = u | highHalf;
    return (uint)combined;
}

[MethodImpl(InliningOptions.ShortMethod)]
public static void YuvToBgr(int y, int u, int v, Span<byte> bgr)
{
bgr[0] = (byte)YuvToB(y, u);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reverse these also.

bgr[1] = (byte)YuvToG(y, u, v);
bgr[2] = (byte)YuvToR(y, v);
}

/// <summary>
/// Adds the scaled <paramref name="y"/> and <paramref name="u"/> contributions, applies a
/// constant offset and clips the result with <c>Clip8</c>.
/// </summary>
/// <remarks>NOTE(review): the constants look like fixed-point conversion coefficients - confirm against the reference implementation.</remarks>
[MethodImpl(InliningOptions.ShortMethod)]
public static int YuvToB(int y, int u)
{
    int scaledY = MultHi(y, 19077);
    int scaledU = MultHi(u, 33050);
    int unclipped = scaledY + scaledU - 17685;
    return Clip8(unclipped);
}

/// <summary>
/// Starts from the scaled <paramref name="y"/> contribution, removes the scaled
/// <paramref name="u"/> and <paramref name="v"/> contributions, applies a constant offset and
/// clips the result with <c>Clip8</c>.
/// </summary>
/// <remarks>NOTE(review): the constants look like fixed-point conversion coefficients - confirm against the reference implementation.</remarks>
[MethodImpl(InliningOptions.ShortMethod)]
public static int YuvToG(int y, int u, int v)
{
    int scaledY = MultHi(y, 19077);
    int scaledU = MultHi(u, 6419);
    int scaledV = MultHi(v, 13320);
    int unclipped = scaledY - scaledU - scaledV + 8708;
    return Clip8(unclipped);
}

/// <summary>
/// Adds the scaled <paramref name="y"/> and <paramref name="v"/> contributions, applies a
/// constant offset and clips the result with <c>Clip8</c>.
/// </summary>
/// <remarks>NOTE(review): the constants look like fixed-point conversion coefficients - confirm against the reference implementation.</remarks>
[MethodImpl(InliningOptions.ShortMethod)]
public static int YuvToR(int y, int v)
{
    int scaledY = MultHi(y, 19077);
    int scaledV = MultHi(v, 26149);
    int unclipped = scaledY + scaledV - 14234;
    return Clip8(unclipped);
}

/// <summary>
/// Scales <paramref name="v"/> by <paramref name="coeff"/>, discarding the low 8 bits of the product.
/// </summary>
/// <param name="v">The value to scale.</param>
/// <param name="coeff">The multiplier.</param>
/// <returns><c>(v * coeff) &gt;&gt; 8</c>.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static int MultHi(int v, int coeff)
{
    int scaled = v * coeff;
    scaled >>= 8;
    return scaled;
}

/// <summary>
/// Converts a value with 6 extra low bits to a byte, saturating out-of-range input.
/// </summary>
/// <param name="v">The value to clip.</param>
/// <returns>
/// 0 for negative input, 255 for input above 16383, and <c>v &gt;&gt; 6</c> otherwise.
/// </returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static byte Clip8(int v)
{
    // Any bit set outside the low 14 means the shifted value would not fit in a byte.
    const int yuvMask = (256 << 6) - 1;

    if ((v & ~yuvMask) == 0)
    {
        return (byte)(v >> 6);
    }

    int saturated = v < 0 ? 0 : 255;
    return (byte)saturated;
}
}
}
35 changes: 34 additions & 1 deletion tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.

using System.Linq;
using SixLabors.ImageSharp.Formats.Webp.Lossy;
using SixLabors.ImageSharp.Tests.TestUtilities;
using Xunit;
Expand All @@ -10,6 +11,30 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP
[Trait("Format", "Webp")]
public class LossyUtilsTests
{
/// <summary>
/// Runs <c>LossyUtils.Mean16x4</c> over a fixed input and checks the four resulting block sums.
/// Shared by the plain test and the variants that toggle hardware intrinsics.
/// </summary>
private static void RunMean16x4Test()
{
    // arrange
    byte[] input =
    {
        154, 145, 102, 115, 127, 129, 126, 125, 126, 120, 133, 152, 157, 153, 119, 94, 104, 116, 111, 113,
        113, 109, 105, 124, 173, 175, 177, 170, 175, 172, 166, 164, 151, 141, 99, 114, 125, 126, 135, 150,
        133, 115, 127, 149, 141, 168, 100, 54, 110, 117, 115, 116, 119, 115, 117, 130, 174, 174, 174, 157,
        146, 171, 166, 158, 117, 140, 96, 111, 119, 119, 136, 171, 188, 134, 121, 126, 136, 119, 59, 77,
        109, 115, 113, 120, 120, 117, 128, 115, 174, 173, 173, 161, 152, 148, 153, 162, 105, 140, 96, 114,
        115, 122, 141, 173, 190, 190, 142, 106, 151, 78, 66, 141, 110, 117, 123, 136, 118, 124, 127, 114,
        173, 175, 166, 155, 155, 159, 159, 158
    };
    uint[] dc = new uint[4];

    // Scratch buffer handed to Mean16x4; eight ushorts hold exactly one Vector128<ushort>.
    ushort[] tmp = new ushort[8];
    uint[] expectedDc = { 1940, 2139, 2252, 1813 };

    // act
    LossyUtils.Mean16x4(input, dc, tmp);

    // assert
    // Assert.Equal reports the differing element on failure, unlike Assert.True on SequenceEqual.
    Assert.Equal(expectedDc, dc);
}

private static void RunHadamardTransformTest()
{
byte[] a =
Expand Down Expand Up @@ -37,16 +62,24 @@ private static void RunHadamardTransformTest()
Assert.Equal(expected, actual);
}

// Runs the Mean16x4 check with the default hardware feature set.
[Fact]
public void Mean16x4_Works()
{
    RunMean16x4Test();
}

// Runs the Hadamard transform check with the default hardware feature set.
[Fact]
public void HadamardTransform_Works()
{
    RunHadamardTransformTest();
}

#if SUPPORTS_RUNTIME_INTRINSICS
// Runs the Mean16x4 check through the feature test runner with HwIntrinsics.AllowAll.
[Fact]
public void Mean16x4_WithHardwareIntrinsics_Works()
{
    FeatureTestRunner.RunWithHwIntrinsicsFeature(
        RunMean16x4Test,
        HwIntrinsics.AllowAll);
}

// Runs the Mean16x4 check through the feature test runner with HwIntrinsics.DisableSSE2.
[Fact]
public void Mean16x4_WithoutSSE2_Works()
{
    FeatureTestRunner.RunWithHwIntrinsicsFeature(
        RunMean16x4Test,
        HwIntrinsics.DisableSSE2);
}

// Runs the Hadamard transform check through the feature test runner with HwIntrinsics.AllowAll.
[Fact]
public void HadamardTransform_WithHardwareIntrinsics_Works()
{
    FeatureTestRunner.RunWithHwIntrinsicsFeature(
        RunHadamardTransformTest,
        HwIntrinsics.AllowAll);
}

// Runs the Hadamard transform check through the feature test runner with HwIntrinsics.DisableHWIntrinsic.
[Fact]
public void HadamardTransform_WithoutHardwareIntrinsics_Works()
{
    FeatureTestRunner.RunWithHwIntrinsicsFeature(
        RunHadamardTransformTest,
        HwIntrinsics.DisableHWIntrinsic);
}
#endif

}
}