Skip to content

Commit fd688db

Browse files
Complete implementation and add tests/benchmark
1 parent 5e4f3ef commit fd688db

File tree

13 files changed

+18723
-605
lines changed

13 files changed

+18723
-605
lines changed

src/ImageSharp/Common/Helpers/Numerics.cs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -643,6 +643,20 @@ public static Vector256<float> UnPremultiply(Vector256<float> source, Vector256<
643643
return Avx.Blend(result, alpha, BlendAlphaControl);
644644
}
645645

646+
/// <summary>
/// Divides every lane of <paramref name="source"/> by the matching lane of <paramref name="alpha"/>,
/// then writes the <paramref name="alpha"/> value back into lanes 3, 7, 11 and 15.
/// </summary>
/// <param name="source">The vector to divide.</param>
/// <param name="alpha">The per-lane divisor; lanes equal to zero leave the source lane untouched.</param>
/// <returns>The resulting <see cref="Vector512{Single}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector512<float> UnPremultiply(Vector512<float> source, Vector512<float> alpha)
{
    // Every fourth lane carries -1 (all bits set); those lanes are taken from alpha in the final select.
    Vector512<float> alphaLanes = Vector512.Create(0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1).AsSingle();

    // Lanes whose alpha is exactly zero must not use the quotient (division by zero).
    Vector512<float> alphaIsZero = Vector512.Equals(alpha, Vector512<float>.Zero);
    Vector512<float> quotient = source / alpha;

    // Zero alpha keeps the source lane as-is, any other alpha takes the quotient.
    Vector512<float> unpremultiplied = Vector512.ConditionalSelect(alphaIsZero, source, quotient);

    // Restore the alpha value in the masked lanes so it passes through unchanged.
    return Vector512.ConditionalSelect(alphaLanes, alpha, unpremultiplied);
}
659+
646660
/// <summary>
647661
/// Permutes the given vector and returns a new instance with all the values set to <see cref="Vector4.W"/>.
648662
/// </summary>

src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -601,28 +601,6 @@ private static void Shuffle4Slice3(
601601
}
602602
}
603603

604-
/// <summary>
/// Subtracts the product of two <see cref="Vector256{Single}"/> values from a third.
/// </summary>
/// <remarks>ret = va - (vm0 * vm1)</remarks>
/// <param name="va">The vector the product is subtracted from.</param>
/// <param name="vm0">The first factor of the product.</param>
/// <param name="vm1">The second factor of the product.</param>
/// <returns>The <see cref="Vector256{T}"/>.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static Vector256<float> MultiplyAddNegated(
    Vector256<float> va,
    Vector256<float> vm0,
    Vector256<float> vm1)
    // Use the fused instruction when FMA is available; otherwise multiply and subtract separately with AVX.
    => Fma.IsSupported
        ? Fma.MultiplyAddNegated(vm0, vm1, va)
        : Avx.Subtract(va, Avx.Multiply(vm0, vm1));
625-
626604
/// <summary>
627605
/// Blend packed 8-bit integers from <paramref name="left"/> and <paramref name="right"/> using <paramref name="mask"/>.
628606
/// The high bit of each corresponding <paramref name="mask"/> byte determines the selection.

src/ImageSharp/Common/Helpers/Vector512Utilities.cs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,21 @@ public static Vector512<float> MultiplyAdd(
8787
Vector512<float> vm1)
8888
=> Avx512F.FusedMultiplyAdd(vm0, vm1, va);
8989

90+
/// <summary>
/// Performs a multiplication and a negated addition of the <see cref="Vector512{Single}"/>.
/// </summary>
/// <remarks>
/// ret = va - (vm0 * vm1).
/// When AVX-512F is available the operation is a single fused instruction; otherwise it is
/// computed as a separate multiply and subtract, so the intermediate product is rounded.
/// </remarks>
/// <param name="va">The vector to add to the negated intermediate result.</param>
/// <param name="vm0">The first vector to multiply.</param>
/// <param name="vm1">The second vector to multiply.</param>
/// <returns>The <see cref="Vector512{T}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector512<float> MultiplyAddNegated(
    Vector512<float> va,
    Vector512<float> vm0,
    Vector512<float> vm1)
{
    if (Avx512F.IsSupported)
    {
        return Avx512F.FusedMultiplyAddNegated(vm0, vm1, va);
    }

    // Calling the AVX-512F intrinsic on unsupported hardware throws PlatformNotSupportedException,
    // so fall back to the portable operators instead.
    return va - (vm0 * vm1);
}
104+
90105
/// <summary>
91106
/// Restricts a vector between a minimum and a maximum value.
92107
/// </summary>

src/ImageSharp/PixelFormats/PixelBlenders/DefaultPixelBlenders.Generated.cs

Lines changed: 16520 additions & 536 deletions
Large diffs are not rendered by default.

src/ImageSharp/PixelFormats/PixelBlenders/DefaultPixelBlenders.Generated.tt

Lines changed: 152 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,34 @@ var blenders = new []{
8989
{
9090
amount = Numerics.Clamp(amount, 0, 1);
9191

92-
if (Avx2.IsSupported && destination.Length >= 2)
92+
if (Avx512F.IsSupported && destination.Length >= 4)
93+
{
94+
// Divide by 4 as 4 elements per Vector4 and 16 per Vector512<float>
95+
ref Vector512<float> destinationBase = ref Unsafe.As<Vector4, Vector512<float>>(ref MemoryMarshal.GetReference(destination));
96+
ref Vector512<float> destinationLast = ref Unsafe.Add(ref destinationBase, (uint)destination.Length / 4u);
97+
98+
ref Vector512<float> backgroundBase = ref Unsafe.As<Vector4, Vector512<float>>(ref MemoryMarshal.GetReference(background));
99+
ref Vector512<float> sourceBase = ref Unsafe.As<Vector4, Vector512<float>>(ref MemoryMarshal.GetReference(source));
100+
Vector512<float> opacity = Vector512.Create(amount);
101+
102+
while (Unsafe.IsAddressLessThan(ref destinationBase, ref destinationLast))
103+
{
104+
destinationBase = PorterDuffFunctions.<#=blender_composer#>(backgroundBase, sourceBase, opacity);
105+
destinationBase = ref Unsafe.Add(ref destinationBase, 1);
106+
backgroundBase = ref Unsafe.Add(ref backgroundBase, 1);
107+
sourceBase = ref Unsafe.Add(ref sourceBase, 1);
108+
}
109+
110+
int remainder = Numerics.Modulo4(destination.Length);
111+
if (remainder != 0)
112+
{
113+
for (int i = destination.Length - remainder; i < destination.Length; i++)
114+
{
115+
destination[i] = PorterDuffFunctions.<#=blender_composer#>(background[i], source[i], amount);
116+
}
117+
}
118+
}
119+
else if (Avx2.IsSupported && destination.Length >= 2)
93120
{
94121
// Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
95122
ref Vector256<float> destinationBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
@@ -128,7 +155,37 @@ var blenders = new []{
128155
{
129156
amount = Numerics.Clamp(amount, 0, 1);
130157

131-
if (Avx2.IsSupported && destination.Length >= 2)
158+
if (Avx512F.IsSupported && destination.Length >= 4)
159+
{
160+
// Divide by 4 as 4 elements per Vector4 and 16 per Vector512<float>
161+
ref Vector512<float> destinationBase = ref Unsafe.As<Vector4, Vector512<float>>(ref MemoryMarshal.GetReference(destination));
162+
ref Vector512<float> destinationLast = ref Unsafe.Add(ref destinationBase, (uint)destination.Length / 4u);
163+
164+
ref Vector512<float> backgroundBase = ref Unsafe.As<Vector4, Vector512<float>>(ref MemoryMarshal.GetReference(background));
165+
Vector512<float> sourceBase = Vector512.Create(
166+
source.X, source.Y, source.Z, source.W,
167+
source.X, source.Y, source.Z, source.W,
168+
source.X, source.Y, source.Z, source.W,
169+
source.X, source.Y, source.Z, source.W);
170+
Vector512<float> opacity = Vector512.Create(amount);
171+
172+
while (Unsafe.IsAddressLessThan(ref destinationBase, ref destinationLast))
173+
{
174+
destinationBase = PorterDuffFunctions.<#=blender_composer#>(backgroundBase, sourceBase, opacity);
175+
destinationBase = ref Unsafe.Add(ref destinationBase, 1);
176+
backgroundBase = ref Unsafe.Add(ref backgroundBase, 1);
177+
}
178+
179+
int remainder = Numerics.Modulo4(destination.Length);
180+
if (remainder != 0)
181+
{
182+
for (int i = destination.Length - remainder; i < destination.Length; i++)
183+
{
184+
destination[i] = PorterDuffFunctions.<#=blender_composer#>(background[i], source, amount);
185+
}
186+
}
187+
}
188+
else if (Avx2.IsSupported && destination.Length >= 2)
132189
{
133190
// Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
134191
ref Vector256<float> destinationBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
@@ -164,7 +221,51 @@ var blenders = new []{
164221
/// <inheritdoc />
165222
protected override void BlendFunction(Span<Vector4> destination, ReadOnlySpan<Vector4> background, ReadOnlySpan<Vector4> source, ReadOnlySpan<float> amount)
166223
{
167-
if (Avx2.IsSupported && destination.Length >= 2)
224+
if (Avx512F.IsSupported && destination.Length >= 4)
225+
{
226+
// Divide by 4 as 4 elements per Vector4 and 16 per Vector512<float>
227+
ref Vector512<float> destinationBase = ref Unsafe.As<Vector4, Vector512<float>>(ref MemoryMarshal.GetReference(destination));
228+
ref Vector512<float> destinationLast = ref Unsafe.Add(ref destinationBase, (uint)destination.Length / 4u);
229+
230+
ref Vector512<float> backgroundBase = ref Unsafe.As<Vector4, Vector512<float>>(ref MemoryMarshal.GetReference(background));
231+
ref Vector512<float> sourceBase = ref Unsafe.As<Vector4, Vector512<float>>(ref MemoryMarshal.GetReference(source));
232+
ref float amountBase = ref MemoryMarshal.GetReference(amount);
233+
234+
Vector512<float> vOne = Vector512.Create(1F);
235+
236+
while (Unsafe.IsAddressLessThan(ref destinationBase, ref destinationLast))
237+
{
238+
float amount0 = amountBase;
239+
float amount1 = Unsafe.Add(ref amountBase, 1);
240+
float amount2 = Unsafe.Add(ref amountBase, 2);
241+
float amount3 = Unsafe.Add(ref amountBase, 3);
242+
243+
// We need to create a Vector512<float> containing the current four amount values
244+
// taking up each quarter of the Vector512<float> and then clamp them.
245+
Vector512<float> opacity = Vector512.Create(
246+
amount0, amount0, amount0, amount0,
247+
amount1, amount1, amount1, amount1,
248+
amount2, amount2, amount2, amount2,
249+
amount3, amount3, amount3, amount3);
250+
opacity = Vector512.Min(Vector512.Max(Vector512<float>.Zero, opacity), vOne);
251+
252+
destinationBase = PorterDuffFunctions.<#=blender_composer#>(backgroundBase, sourceBase, opacity);
253+
destinationBase = ref Unsafe.Add(ref destinationBase, 1);
254+
backgroundBase = ref Unsafe.Add(ref backgroundBase, 1);
255+
sourceBase = ref Unsafe.Add(ref sourceBase, 1);
256+
amountBase = ref Unsafe.Add(ref amountBase, 4);
257+
}
258+
259+
int remainder = Numerics.Modulo4(destination.Length);
260+
if (remainder != 0)
261+
{
262+
for (int i = destination.Length - remainder; i < destination.Length; i++)
263+
{
264+
destination[i] = PorterDuffFunctions.<#=blender_composer#>(background[i], source[i], Numerics.Clamp(amount[i], 0, 1F));
265+
}
266+
}
267+
}
268+
else if (Avx2.IsSupported && destination.Length >= 2)
168269
{
169270
// Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
170271
ref Vector256<float> destinationBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
@@ -211,7 +312,54 @@ var blenders = new []{
211312
/// <inheritdoc />
212313
protected override void BlendFunction(Span<Vector4> destination, ReadOnlySpan<Vector4> background, Vector4 source, ReadOnlySpan<float> amount)
213314
{
214-
if (Avx2.IsSupported && destination.Length >= 2)
315+
if (Avx512F.IsSupported && destination.Length >= 4)
316+
{
317+
// Divide by 4 as 4 elements per Vector4 and 16 per Vector512<float>
318+
ref Vector512<float> destinationBase = ref Unsafe.As<Vector4, Vector512<float>>(ref MemoryMarshal.GetReference(destination));
319+
ref Vector512<float> destinationLast = ref Unsafe.Add(ref destinationBase, (uint)destination.Length / 4u);
320+
321+
ref Vector512<float> backgroundBase = ref Unsafe.As<Vector4, Vector512<float>>(ref MemoryMarshal.GetReference(background));
322+
ref float amountBase = ref MemoryMarshal.GetReference(amount);
323+
324+
Vector512<float> sourceBase = Vector512.Create(
325+
source.X, source.Y, source.Z, source.W,
326+
source.X, source.Y, source.Z, source.W,
327+
source.X, source.Y, source.Z, source.W,
328+
source.X, source.Y, source.Z, source.W);
329+
Vector512<float> vOne = Vector512.Create(1F);
330+
331+
while (Unsafe.IsAddressLessThan(ref destinationBase, ref destinationLast))
332+
{
333+
float amount0 = amountBase;
334+
float amount1 = Unsafe.Add(ref amountBase, 1);
335+
float amount2 = Unsafe.Add(ref amountBase, 2);
336+
float amount3 = Unsafe.Add(ref amountBase, 3);
337+
338+
// We need to create a Vector512<float> containing the current four amount values
339+
// taking up each quarter of the Vector512<float> and then clamp them.
340+
Vector512<float> opacity = Vector512.Create(
341+
amount0, amount0, amount0, amount0,
342+
amount1, amount1, amount1, amount1,
343+
amount2, amount2, amount2, amount2,
344+
amount3, amount3, amount3, amount3);
345+
opacity = Vector512.Min(Vector512.Max(Vector512<float>.Zero, opacity), vOne);
346+
347+
destinationBase = PorterDuffFunctions.<#=blender_composer#>(backgroundBase, sourceBase, opacity);
348+
destinationBase = ref Unsafe.Add(ref destinationBase, 1);
349+
backgroundBase = ref Unsafe.Add(ref backgroundBase, 1);
350+
amountBase = ref Unsafe.Add(ref amountBase, 4);
351+
}
352+
353+
int remainder = Numerics.Modulo4(destination.Length);
354+
if (remainder != 0)
355+
{
356+
for (int i = destination.Length - remainder; i < destination.Length; i++)
357+
{
358+
destination[i] = PorterDuffFunctions.<#=blender_composer#>(background[i], source, Numerics.Clamp(amount[i], 0, 1F));
359+
}
360+
}
361+
}
362+
else if (Avx2.IsSupported && destination.Length >= 2)
215363
{
216364
// Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
217365
ref Vector256<float> destinationBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(destination));

0 commit comments

Comments
 (0)