Skip to content

Commit a9c43dc

Browse files
Convolution Replace parallel row iteration with sequential loops
1 parent 25b3567 commit a9c43dc

File tree

29 files changed

+164
-123
lines changed

29 files changed

+164
-123
lines changed

src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs

Lines changed: 60 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
// Copyright (c) Six Labors.
22
// Licensed under the Six Labors Split License.
33

4+
using System.Buffers;
45
using System.Numerics;
56
using System.Runtime.CompilerServices;
67
using System.Runtime.InteropServices;
78
using SixLabors.ImageSharp.Advanced;
9+
using SixLabors.ImageSharp.ColorProfiles.Companding;
810
using SixLabors.ImageSharp.Memory;
911
using SixLabors.ImageSharp.PixelFormats;
1012
using SixLabors.ImageSharp.Processing.Processors.Convolution.Parameters;
@@ -77,22 +79,36 @@ protected override void OnFrameApply(ImageFrame<TPixel> source)
7779
{
7880
Rectangle sourceRectangle = Rectangle.Intersect(this.SourceRectangle, source.Bounds);
7981

82+
MemoryAllocator allocator = this.Configuration.MemoryAllocator;
83+
84+
// Convolution is memory-bandwidth-bound with low arithmetic intensity.
85+
// Parallelization degrades performance due to cache line contention from
86+
// overlapping source row reads. See #3111.
87+
8088
// Preliminary gamma highlight pass
8189
if (this.gamma == 3F)
8290
{
8391
ApplyGamma3ExposureRowOperation gammaOperation = new(sourceRectangle, source.PixelBuffer, this.Configuration);
84-
ParallelRowIterator.IterateRows<ApplyGamma3ExposureRowOperation, Vector4>(
85-
this.Configuration,
86-
sourceRectangle,
87-
in gammaOperation);
92+
93+
using IMemoryOwner<Vector4> gammaBuffer = allocator.Allocate<Vector4>(gammaOperation.GetRequiredBufferLength(sourceRectangle));
94+
Span<Vector4> gammaSpan = gammaBuffer.Memory.Span;
95+
96+
for (int y = sourceRectangle.Top; y < sourceRectangle.Bottom; y++)
97+
{
98+
gammaOperation.Invoke(y, gammaSpan);
99+
}
88100
}
89101
else
90102
{
91103
ApplyGammaExposureRowOperation gammaOperation = new(sourceRectangle, source.PixelBuffer, this.Configuration, this.gamma);
92-
ParallelRowIterator.IterateRows<ApplyGammaExposureRowOperation, Vector4>(
93-
this.Configuration,
94-
sourceRectangle,
95-
in gammaOperation);
104+
105+
using IMemoryOwner<Vector4> gammaBuffer = allocator.Allocate<Vector4>(gammaOperation.GetRequiredBufferLength(sourceRectangle));
106+
Span<Vector4> gammaSpan = gammaBuffer.Memory.Span;
107+
108+
for (int y = sourceRectangle.Top; y < sourceRectangle.Bottom; y++)
109+
{
110+
gammaOperation.Invoke(y, gammaSpan);
111+
}
96112
}
97113

98114
// Create a 0-filled buffer to use to store the result of the component convolutions
@@ -105,18 +121,20 @@ protected override void OnFrameApply(ImageFrame<TPixel> source)
105121
if (this.gamma == 3F)
106122
{
107123
ApplyInverseGamma3ExposureRowOperation operation = new(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration);
108-
ParallelRowIterator.IterateRows(
109-
this.Configuration,
110-
sourceRectangle,
111-
in operation);
124+
125+
for (int y = sourceRectangle.Top; y < sourceRectangle.Bottom; y++)
126+
{
127+
operation.Invoke(y);
128+
}
112129
}
113130
else
114131
{
115-
ApplyInverseGammaExposureRowOperation operation = new(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration, 1 / this.gamma);
116-
ParallelRowIterator.IterateRows(
117-
this.Configuration,
118-
sourceRectangle,
119-
in operation);
132+
ApplyInverseGammaExposureRowOperation operation = new(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration, this.gamma);
133+
134+
for (int y = sourceRectangle.Top; y < sourceRectangle.Bottom; y++)
135+
{
136+
operation.Invoke(y);
137+
}
120138
}
121139
}
122140

@@ -169,10 +187,15 @@ private void OnFrameApplyCore(
169187
kernel,
170188
configuration);
171189

172-
ParallelRowIterator.IterateRows<FirstPassConvolutionRowOperation, Vector4>(
173-
configuration,
174-
sourceRectangle,
175-
in horizontalOperation);
190+
using (IMemoryOwner<Vector4> hBuffer = configuration.MemoryAllocator.Allocate<Vector4>(horizontalOperation.GetRequiredBufferLength(sourceRectangle)))
191+
{
192+
Span<Vector4> hSpan = hBuffer.Memory.Span;
193+
194+
for (int y = sourceRectangle.Top; y < sourceRectangle.Bottom; y++)
195+
{
196+
horizontalOperation.Invoke(y, hSpan);
197+
}
198+
}
176199

177200
// Vertical 1D convolutions to accumulate the partial results on the target buffer
178201
BokehBlurProcessor.SecondPassConvolutionRowOperation verticalOperation = new(
@@ -184,10 +207,10 @@ private void OnFrameApplyCore(
184207
parameters.Z,
185208
parameters.W);
186209

187-
ParallelRowIterator.IterateRows(
188-
configuration,
189-
sourceRectangle,
190-
in verticalOperation);
210+
for (int y = sourceRectangle.Top; y < sourceRectangle.Bottom; y++)
211+
{
212+
verticalOperation.Invoke(y);
213+
}
191214
}
192215
}
193216

@@ -305,15 +328,9 @@ public void Invoke(int y, Span<Vector4> span)
305328
{
306329
Span<TPixel> targetRowSpan = this.targetPixels.DangerousGetRowSpan(y)[this.bounds.X..];
307330
PixelOperations<TPixel>.Instance.ToVector4(this.configuration, targetRowSpan[..span.Length], span, PixelConversionModifiers.Premultiply);
308-
ref Vector4 baseRef = ref MemoryMarshal.GetReference(span);
309331

310-
for (int x = 0; x < this.bounds.Width; x++)
311-
{
312-
ref Vector4 v = ref Unsafe.Add(ref baseRef, (uint)x);
313-
v.X = MathF.Pow(v.X, this.gamma);
314-
v.Y = MathF.Pow(v.Y, this.gamma);
315-
v.Z = MathF.Pow(v.Z, this.gamma);
316-
}
332+
// Input is premultiplied [0,1] so the LUT is safe here.
333+
GammaCompanding.Expand(span[..this.bounds.Width], this.gamma);
317334

318335
PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, span, targetRowSpan);
319336
}
@@ -367,44 +384,34 @@ public void Invoke(int y, Span<Vector4> span)
367384
private readonly Buffer2D<TPixel> targetPixels;
368385
private readonly Buffer2D<Vector4> sourceValues;
369386
private readonly Configuration configuration;
370-
private readonly float inverseGamma;
387+
private readonly float gamma;
371388

372389
[MethodImpl(InliningOptions.ShortMethod)]
373390
public ApplyInverseGammaExposureRowOperation(
374391
Rectangle bounds,
375392
Buffer2D<TPixel> targetPixels,
376393
Buffer2D<Vector4> sourceValues,
377394
Configuration configuration,
378-
float inverseGamma)
395+
float gamma)
379396
{
380397
this.bounds = bounds;
381398
this.targetPixels = targetPixels;
382399
this.sourceValues = sourceValues;
383400
this.configuration = configuration;
384-
this.inverseGamma = inverseGamma;
401+
this.gamma = gamma;
385402
}
386403

387404
/// <inheritdoc/>
388405
[MethodImpl(InliningOptions.ShortMethod)]
389406
public void Invoke(int y)
390407
{
391-
Vector4 low = Vector4.Zero;
392-
Vector4 high = new(float.PositiveInfinity, float.PositiveInfinity, float.PositiveInfinity, float.PositiveInfinity);
393-
394408
Span<TPixel> targetPixelSpan = this.targetPixels.DangerousGetRowSpan(y)[this.bounds.X..];
395-
Span<Vector4> sourceRowSpan = this.sourceValues.DangerousGetRowSpan(y)[this.bounds.X..];
396-
ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceRowSpan);
409+
Span<Vector4> sourceRowSpan = this.sourceValues.DangerousGetRowSpan(y).Slice(this.bounds.X, this.bounds.Width);
397410

398-
for (int x = 0; x < this.bounds.Width; x++)
399-
{
400-
ref Vector4 v = ref Unsafe.Add(ref sourceRef, (uint)x);
401-
Vector4 clamp = Numerics.Clamp(v, low, high);
402-
v.X = MathF.Pow(clamp.X, this.inverseGamma);
403-
v.Y = MathF.Pow(clamp.Y, this.inverseGamma);
404-
v.Z = MathF.Pow(clamp.Z, this.inverseGamma);
405-
}
411+
Numerics.Clamp(MemoryMarshal.Cast<Vector4, float>(sourceRowSpan), 0, 1F);
412+
GammaCompanding.Compress(sourceRowSpan, this.gamma);
406413

407-
PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, sourceRowSpan[..this.bounds.Width], targetPixelSpan, PixelConversionModifiers.Premultiply);
414+
PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, sourceRowSpan, targetPixelSpan, PixelConversionModifiers.Premultiply);
408415
}
409416
}
410417

@@ -433,17 +440,16 @@ public ApplyInverseGamma3ExposureRowOperation(
433440

434441
/// <inheritdoc/>
435442
[MethodImpl(InliningOptions.ShortMethod)]
436-
public unsafe void Invoke(int y)
443+
public void Invoke(int y)
437444
{
438445
Span<Vector4> sourceRowSpan = this.sourceValues.DangerousGetRowSpan(y).Slice(this.bounds.X, this.bounds.Width);
439-
ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceRowSpan);
440446

441-
Numerics.Clamp(MemoryMarshal.Cast<Vector4, float>(sourceRowSpan), 0, float.PositiveInfinity);
447+
Numerics.Clamp(MemoryMarshal.Cast<Vector4, float>(sourceRowSpan), 0, 1F);
442448
Numerics.CubeRootOnXYZ(sourceRowSpan);
443449

444450
Span<TPixel> targetPixelSpan = this.targetPixels.DangerousGetRowSpan(y)[this.bounds.X..];
445451

446-
PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, sourceRowSpan[..this.bounds.Width], targetPixelSpan, PixelConversionModifiers.Premultiply);
452+
PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, sourceRowSpan, targetPixelSpan, PixelConversionModifiers.Premultiply);
447453
}
448454
}
449455
}

src/ImageSharp/Processing/Processors/Convolution/Convolution2DProcessor{TPixel}.cs

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
// Copyright (c) Six Labors.
22
// Licensed under the Six Labors Split License.
33

4+
using System.Buffers;
45
using System.Numerics;
5-
using SixLabors.ImageSharp.Advanced;
66
using SixLabors.ImageSharp.Memory;
77
using SixLabors.ImageSharp.PixelFormats;
88

@@ -79,10 +79,16 @@ protected override void OnFrameApply(ImageFrame<TPixel> source)
7979
this.Configuration,
8080
this.PreserveAlpha);
8181

82-
ParallelRowIterator.IterateRows<Convolution2DRowOperation<TPixel>, Vector4>(
83-
this.Configuration,
84-
interest,
85-
in operation);
82+
// Convolution is memory-bandwidth-bound with low arithmetic intensity.
83+
// Parallelization degrades performance due to cache line contention from
84+
// overlapping source row reads. See #3111.
85+
using IMemoryOwner<Vector4> buffer = allocator.Allocate<Vector4>(operation.GetRequiredBufferLength(interest));
86+
Span<Vector4> span = buffer.Memory.Span;
87+
88+
for (int y = interest.Top; y < interest.Bottom; y++)
89+
{
90+
operation.Invoke(y, span);
91+
}
8692
}
8793

8894
Buffer2D<TPixel>.SwapOrCopyContent(source.PixelBuffer, targetPixels);

src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// Copyright (c) Six Labors.
22
// Licensed under the Six Labors Split License.
33

4+
using System.Buffers;
45
using System.Numerics;
56
using System.Runtime.CompilerServices;
67
using System.Runtime.InteropServices;
@@ -106,6 +107,12 @@ protected override void OnFrameApply(ImageFrame<TPixel> source)
106107

107108
mapXY.BuildSamplingOffsetMap(this.KernelX.Length, this.KernelX.Length, interest, this.BorderWrapModeX, this.BorderWrapModeY);
108109

110+
MemoryAllocator allocator = this.Configuration.MemoryAllocator;
111+
112+
// Convolution is memory-bandwidth-bound with low arithmetic intensity.
113+
// Parallelization degrades performance due to cache line contention from
114+
// overlapping source row reads. See #3111.
115+
109116
// Horizontal convolution
110117
HorizontalConvolutionRowOperation horizontalOperation = new(
111118
interest,
@@ -116,10 +123,15 @@ protected override void OnFrameApply(ImageFrame<TPixel> source)
116123
this.Configuration,
117124
this.PreserveAlpha);
118125

119-
ParallelRowIterator.IterateRows<HorizontalConvolutionRowOperation, Vector4>(
120-
this.Configuration,
121-
interest,
122-
in horizontalOperation);
126+
using (IMemoryOwner<Vector4> hBuffer = allocator.Allocate<Vector4>(horizontalOperation.GetRequiredBufferLength(interest)))
127+
{
128+
Span<Vector4> hSpan = hBuffer.Memory.Span;
129+
130+
for (int y = interest.Top; y < interest.Bottom; y++)
131+
{
132+
horizontalOperation.Invoke(y, hSpan);
133+
}
134+
}
123135

124136
// Vertical convolution
125137
VerticalConvolutionRowOperation verticalOperation = new(
@@ -131,10 +143,15 @@ protected override void OnFrameApply(ImageFrame<TPixel> source)
131143
this.Configuration,
132144
this.PreserveAlpha);
133145

134-
ParallelRowIterator.IterateRows<VerticalConvolutionRowOperation, Vector4>(
135-
this.Configuration,
136-
interest,
137-
in verticalOperation);
146+
using (IMemoryOwner<Vector4> vBuffer = allocator.Allocate<Vector4>(verticalOperation.GetRequiredBufferLength(interest)))
147+
{
148+
Span<Vector4> vSpan = vBuffer.Memory.Span;
149+
150+
for (int y = interest.Top; y < interest.Bottom; y++)
151+
{
152+
verticalOperation.Invoke(y, vSpan);
153+
}
154+
}
138155
}
139156

140157
/// <summary>

src/ImageSharp/Processing/Processors/Convolution/ConvolutionProcessor{TPixel}.cs

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// Copyright (c) Six Labors.
22
// Licensed under the Six Labors Split License.
33

4+
using System.Buffers;
45
using System.Numerics;
56
using System.Runtime.CompilerServices;
67
using System.Runtime.InteropServices;
@@ -96,10 +97,17 @@ protected override void OnFrameApply(ImageFrame<TPixel> source)
9697
map.BuildSamplingOffsetMap(this.KernelXY.Rows, this.KernelXY.Columns, interest, this.BorderWrapModeX, this.BorderWrapModeY);
9798

9899
RowOperation operation = new(interest, targetPixels, source.PixelBuffer, map, this.KernelXY, this.Configuration, this.PreserveAlpha);
99-
ParallelRowIterator.IterateRows<RowOperation, Vector4>(
100-
this.Configuration,
101-
interest,
102-
in operation);
100+
101+
// Convolution is memory-bandwidth-bound with low arithmetic intensity.
102+
// Parallelization degrades performance due to cache line contention from
103+
// overlapping source row reads. See #3111.
104+
using IMemoryOwner<Vector4> buffer = allocator.Allocate<Vector4>(operation.GetRequiredBufferLength(interest));
105+
Span<Vector4> span = buffer.Memory.Span;
106+
107+
for (int y = interest.Top; y < interest.Bottom; y++)
108+
{
109+
operation.Invoke(y, span);
110+
}
103111
}
104112

105113
Buffer2D<TPixel>.SwapOrCopyContent(source.PixelBuffer, targetPixels);

src/ImageSharp/Processing/Processors/Convolution/EdgeDetectorCompassProcessor{TPixel}.cs

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -83,11 +83,15 @@ protected override void OnFrameApply(ImageFrame<TPixel> source)
8383
processor.Apply(pass);
8484
}
8585

86+
// Convolution is memory-bandwidth-bound with low arithmetic intensity.
87+
// Parallelization degrades performance due to cache line contention from
88+
// overlapping source row reads. See #3111.
8689
RowOperation operation = new(source.PixelBuffer, pass.PixelBuffer, interest);
87-
ParallelRowIterator.IterateRows(
88-
this.Configuration,
89-
interest,
90-
in operation);
90+
91+
for (int y = interest.Top; y < interest.Bottom; y++)
92+
{
93+
operation.Invoke(y);
94+
}
9195
}
9296
}
9397

Lines changed: 2 additions & 2 deletions
Loading
Lines changed: 2 additions & 2 deletions
Loading
Lines changed: 2 additions & 2 deletions
Loading
Lines changed: 2 additions & 2 deletions
Loading
Lines changed: 2 additions & 2 deletions
Loading

0 commit comments

Comments
 (0)