Skip to content

Commit 1ab5e95

Browse files
committed
rjpeg: honour supports_rgba in YCbCr conversion kernels
rjpeg_process_image declared a supports_rgba parameter but never read it. The decoder unconditionally emitted BGRA byte order (reads as ARGB32 on little-endian). When the video driver requires ABGR -- GLES without BGRA8888, which sets VIDEO_FLAG_USE_RGBA at gl2.c:4091 -- JPEG images rendered with red and blue swapped. Sibling decoders (rpng, rtga, rbmp) honoured supports_rgba; rjpeg was the outlier. This patch threads supports_rgba through the four YCbCr/upsample kernel signatures and branches at the critical byte-order decision inside each kernel. The branch lives out-of-loop (scalar) or at the cheap swap point (SIMD), so the hot path cost is negligible. Scalar rjpeg_YCbCr_to_RGB_row: two tight loops under an out-of-loop if (supports_rgba), matching rpng's pattern. SSE2 rjpeg_YCbCr_to_RGB_simd and rjpeg_upsample_YCbCr_to_BGRA_simd: ternary on the first _mm_packus_epi16 argument order -- packus(bw, rw) -> BGRA byte order (ARGB32 on LE) packus(rw, bw) -> RGBA byte order (ABGR32 on LE, supports_rgba) The rest of the unpack cascade is byte-order-invariant. NEON rjpeg_YCbCr_to_RGB_simd and rjpeg_upsample_YCbCr_to_BGRA_simd: vst4_u8 interleaves val[0..3] into successive output bytes, so the swap is a val[0] <-> val[2] exchange; val[1]=g and val[3]=alpha are invariant either way. The inlined scalar tail in rjpeg_YCbCr_to_RGB_simd is replaced with a delegated call to rjpeg_YCbCr_to_RGB_row, eliminating a duplicated copy that would otherwise need its own rgba branch. The fused kernel's existing scalar tail already routed through that function. supports_rgba arrives at rjpeg_process_image only; it is latched onto the rjpeg_t handle and read at the four kernel callsites (two active in rjpeg_process_image, two in the currently-disabled iter_resample_ready path). Grayscale paths are unaffected: R=G=B=Y so byte order is identity. Validated with a standalone test harness on three kernel paths x three test images (all nine combinations pass): paths: scalar (-mno-sse2), SSE2 (-msse2), aarch64 NEON (-static under qemu-aarch64) images: 32x32 4:2:0 -- exercises fused upsample SIMD + tail 36x16 4:2:0 -- chroma_w%8 != 0, forces the SIMD scalar tail delegation 32x32 4:4:4 -- use_fused=false, forces the non-fused main YCbCr kernel Two invariants are asserted: byte-swap equivalence across supports_rgba=false vs true at every pixel, and quadrant-dominance sanity in both BGRA and RGBA layouts (so that both decodes being wrong in the same way can't spuriously pass). Each SIMD rgba branch was additionally path-verified with a sabotage diagnostic: temporarily forcing val[0]=val[2]=0xaa in the rgba branch fails exactly the expected tests with bytes=(170,170,170,255) and the swap invariant on hundreds of pixels, while leaving the bgra tests and any non-targeted kernel untouched -- confirming the code is genuinely on the test path and not dead code shadowed by a fallback. Unrelated finding, worth a separate patch: rjpeg's NEON auto-detect tests defined(__ARM_NEON__) only, while modern aarch64-linux-gnu-gcc defines __ARM_NEON (no trailing underscore). Without an explicit -D__ARM_NEON__ or -DHAVE_NEON, rjpeg silently runs the scalar kernel on those builds. rpng accepts both spellings at libretro-common/formats/png/rpng.c:38; rjpeg should match.
1 parent aed4543 commit 1ab5e95

1 file changed

Lines changed: 144 additions & 78 deletions

File tree

  • libretro-common/formats/jpeg

libretro-common/formats/jpeg/rjpeg.c

Lines changed: 144 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -121,9 +121,14 @@ struct rjpeg
121121
* This overlaps the two phases and avoids the serial
122122
* entropy→done→resample pipeline. */
123123
rjpeg_resample iter_res[4]; /* per-component resample state */
124-
uint8_t *iter_output; /* BGRA output buffer */
124+
uint8_t *iter_output; /* RGBA8888 output buffer */
125125
unsigned iter_out_row; /* next output row to resample */
126126
int iter_resample_ready; /* 1 = resample state inited */
127+
128+
/* Output byte order selector. Latched from the rjpeg_process_image
129+
* parameter and consulted by the resample+colorconvert callsites.
130+
* false -> BGRA (ARGB32 on LE); true -> RGBA (ABGR32 on LE). */
131+
bool supports_rgba;
127132
};
128133

129134
#ifdef _MSC_VER
@@ -241,17 +246,18 @@ struct rjpeg_jpeg_s
241246
void (*dequant_idct_block_kernel)(uint8_t *out, int out_stride,
242247
short data[64], uint8_t *dequant);
243248
void (*YCbCr_to_RGB_kernel)(uint8_t *out, const uint8_t *y, const uint8_t *pcb,
244-
const uint8_t *pcr, int count, int step);
249+
const uint8_t *pcr, int count, int step, bool supports_rgba);
245250
uint8_t *(*resample_row_hv_2_kernel)(uint8_t *out, uint8_t *in_near,
246251
uint8_t *in_far, int w, int hs);
247-
/* Fused chroma upsample (hv_2) + YCbCr→BGRA.
252+
/* Fused chroma upsample (hv_2) + YCbCr->RGBA8888.
248253
* Eliminates the linebuf write+read round-trip by keeping upsampled
249254
* chroma in SIMD registers and feeding directly to color math.
255+
* Output byte order (BGRA vs RGBA) is selected by supports_rgba.
250256
* NULL = not available; use separate resample + colorconvert. */
251257
void (*upsample_YCbCr_to_BGRA_kernel)(uint8_t *out, const uint8_t *y_row,
252258
uint8_t *cb_near, uint8_t *cb_far,
253259
uint8_t *cr_near, uint8_t *cr_far,
254-
int chroma_w, int out_w);
260+
int chroma_w, int out_w, bool supports_rgba);
255261

256262
/* definition of jpeg image component */
257263
struct
@@ -3183,41 +3189,79 @@ static uint8_t *rjpeg_resample_row_generic(uint8_t *out,
31833189
#define FLOAT2FIXED(x) (((int) ((x) * 4096.0f + 0.5f)) << 8)
31843190
#endif
31853191

3192+
/* Scalar YCbCr -> RGBA8888 (BGRA byte order) or RGBA (RGBA byte order).
3193+
*
3194+
* Byte order is selected by supports_rgba:
3195+
* false -> out[0..3] = B,G,R,0xff (reads as ARGB32 on LE; default)
3196+
* true -> out[0..3] = R,G,B,0xff (reads as ABGR32 on LE; needed by
3197+
* GLES drivers without BGRA8888)
3198+
*
3199+
* The branch is hoisted out of the loop: two tight loops rather than
3200+
* per-pixel branching. Matches rpng's pattern. */
31863201
static void rjpeg_YCbCr_to_RGB_row(uint8_t *out, const uint8_t *y,
3187-
const uint8_t *pcb, const uint8_t *pcr, int count, int step)
3202+
const uint8_t *pcb, const uint8_t *pcr, int count, int step,
3203+
bool supports_rgba)
31883204
{
31893205
int i;
3190-
for (i = 0; i < count; ++i)
3206+
if (supports_rgba)
31913207
{
3192-
int y_fixed = (y[i] << 20) + (1<<19); /* rounding */
3193-
int cr = pcr[i] - 128;
3194-
int cb = pcb[i] - 128;
3195-
int r = y_fixed + cr* FLOAT2FIXED(1.40200f);
3196-
int g = y_fixed + (cr*-FLOAT2FIXED(0.71414f)) + ((cb*-FLOAT2FIXED(0.34414f)) & 0xffff0000);
3197-
int b = y_fixed + cb* FLOAT2FIXED(1.77200f);
3198-
r >>= 20;
3199-
g >>= 20;
3200-
b >>= 20;
3201-
if ((unsigned) r > 255)
3202-
r = (r < 0) ? 0 : 255;
3203-
if ((unsigned) g > 255)
3204-
g = (g < 0) ? 0 : 255;
3205-
if ((unsigned) b > 255)
3206-
b = (b < 0) ? 0 : 255;
3207-
/* Write BGRA byte order so the uint32 reads as ARGB --
3208-
* this fuses the old RGBA->ARGB swizzle pass into the
3209-
* color conversion itself. */
3210-
out[0] = (uint8_t)b;
3211-
out[1] = (uint8_t)g;
3212-
out[2] = (uint8_t)r;
3213-
out[3] = 255;
3214-
out += step;
3208+
for (i = 0; i < count; ++i)
3209+
{
3210+
int y_fixed = (y[i] << 20) + (1<<19); /* rounding */
3211+
int cr = pcr[i] - 128;
3212+
int cb = pcb[i] - 128;
3213+
int r = y_fixed + cr* FLOAT2FIXED(1.40200f);
3214+
int g = y_fixed + (cr*-FLOAT2FIXED(0.71414f)) + ((cb*-FLOAT2FIXED(0.34414f)) & 0xffff0000);
3215+
int b = y_fixed + cb* FLOAT2FIXED(1.77200f);
3216+
r >>= 20;
3217+
g >>= 20;
3218+
b >>= 20;
3219+
if ((unsigned) r > 255)
3220+
r = (r < 0) ? 0 : 255;
3221+
if ((unsigned) g > 255)
3222+
g = (g < 0) ? 0 : 255;
3223+
if ((unsigned) b > 255)
3224+
b = (b < 0) ? 0 : 255;
3225+
out[0] = (uint8_t)r;
3226+
out[1] = (uint8_t)g;
3227+
out[2] = (uint8_t)b;
3228+
out[3] = 255;
3229+
out += step;
3230+
}
3231+
}
3232+
else
3233+
{
3234+
for (i = 0; i < count; ++i)
3235+
{
3236+
int y_fixed = (y[i] << 20) + (1<<19); /* rounding */
3237+
int cr = pcr[i] - 128;
3238+
int cb = pcb[i] - 128;
3239+
int r = y_fixed + cr* FLOAT2FIXED(1.40200f);
3240+
int g = y_fixed + (cr*-FLOAT2FIXED(0.71414f)) + ((cb*-FLOAT2FIXED(0.34414f)) & 0xffff0000);
3241+
int b = y_fixed + cb* FLOAT2FIXED(1.77200f);
3242+
r >>= 20;
3243+
g >>= 20;
3244+
b >>= 20;
3245+
if ((unsigned) r > 255)
3246+
r = (r < 0) ? 0 : 255;
3247+
if ((unsigned) g > 255)
3248+
g = (g < 0) ? 0 : 255;
3249+
if ((unsigned) b > 255)
3250+
b = (b < 0) ? 0 : 255;
3251+
/* BGRA byte order -- reads as ARGB32 on LE. */
3252+
out[0] = (uint8_t)b;
3253+
out[1] = (uint8_t)g;
3254+
out[2] = (uint8_t)r;
3255+
out[3] = 255;
3256+
out += step;
3257+
}
32153258
}
32163259
}
32173260

32183261
#if defined(__SSE2__) || defined(RJPEG_NEON)
32193262
static void rjpeg_YCbCr_to_RGB_simd(uint8_t *out, const uint8_t *y,
3220-
const uint8_t *pcb, const uint8_t *pcr, int count, int step)
3263+
const uint8_t *pcb, const uint8_t *pcr, int count, int step,
3264+
bool supports_rgba)
32213265
{
32223266
int i = 0;
32233267

@@ -3267,11 +3311,14 @@ static void rjpeg_YCbCr_to_RGB_simd(uint8_t *out, const uint8_t *y,
32673311
__m128i bw = _mm_srai_epi16(bws, 4);
32683312
__m128i gw = _mm_srai_epi16(gws, 4);
32693313

3270-
/* back to byte, set up for transpose
3271-
* Pack B in low half, R in high half (was R,B) so the
3272-
* interleave produces BGRA byte order directly -- this
3273-
* eliminates the separate RGBA->ARGB swizzle pass. */
3274-
__m128i brb = _mm_packus_epi16(bw, rw);
3314+
/* Back to byte, set up for transpose. The first pack decides
3315+
* whether R or B goes into the low position of each output pixel:
3316+
* packus(bw, rw) -> BGRA byte order (ARGB32 on LE)
3317+
* packus(rw, bw) -> RGBA byte order (ABGR32 on LE, supports_rgba)
3318+
* The rest of the unpack cascade is identical either way. */
3319+
__m128i brb = supports_rgba
3320+
? _mm_packus_epi16(rw, bw)
3321+
: _mm_packus_epi16(bw, rw);
32753322
__m128i gxb = _mm_packus_epi16(gw, xw);
32763323

32773324
/* transpose to interleave channels */
@@ -3292,7 +3339,6 @@ static void rjpeg_YCbCr_to_RGB_simd(uint8_t *out, const uint8_t *y,
32923339
/* in this version, step=3 support would be easy to add. but is there demand? */
32933340
if (step == 4)
32943341
{
3295-
/* this is a fairly straightforward implementation and not super-optimized. */
32963342
uint8x8_t signflip = vdup_n_u8(0x80);
32973343
int16x8_t cr_const0 = vdupq_n_s16( (short) ( 1.40200f*4096.0f+0.5f));
32983344
int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
@@ -3324,45 +3370,40 @@ static void rjpeg_YCbCr_to_RGB_simd(uint8_t *out, const uint8_t *y,
33243370
int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
33253371
int16x8_t bws = vaddq_s16(yws, cb1);
33263372

3327-
/* undo scaling, round, convert to byte
3328-
* Output BGRA byte order directly to eliminate
3329-
* the separate RGBA->ARGB swizzle pass. */
3330-
o.val[0] = vqrshrun_n_s16(bws, 4);
3373+
/* Undo scaling, round, convert to byte. vst4_u8 interleaves
3374+
* val[0..3] into successive output bytes; whichever of R/B
3375+
* we put into val[0] becomes the pixel's low byte:
3376+
* val[0]=b, val[2]=r -> BGRA byte order (ARGB32 on LE)
3377+
* val[0]=r, val[2]=b -> RGBA byte order (ABGR32 on LE, supports_rgba)
3378+
* val[1]=g and val[3]=alpha are invariant. */
3379+
if (supports_rgba)
3380+
{
3381+
o.val[0] = vqrshrun_n_s16(rws, 4);
3382+
o.val[2] = vqrshrun_n_s16(bws, 4);
3383+
}
3384+
else
3385+
{
3386+
o.val[0] = vqrshrun_n_s16(bws, 4);
3387+
o.val[2] = vqrshrun_n_s16(rws, 4);
3388+
}
33313389
o.val[1] = vqrshrun_n_s16(gws, 4);
3332-
o.val[2] = vqrshrun_n_s16(rws, 4);
33333390
o.val[3] = vdup_n_u8(255);
33343391

3335-
/* store, interleaving b/g/r/a */
3392+
/* store, interleaving low/1/2/alpha */
33363393
vst4_u8(out, o);
33373394
out += 8*4;
33383395
}
33393396
}
33403397
#endif
33413398

3342-
for (; i < count; ++i)
3343-
{
3344-
int y_fixed = (y[i] << 20) + (1<<19); /* rounding */
3345-
int cr = pcr[i] - 128;
3346-
int cb = pcb[i] - 128;
3347-
int r = y_fixed + cr* FLOAT2FIXED(1.40200f);
3348-
int g = y_fixed + cr*-FLOAT2FIXED(0.71414f) + ((cb*-FLOAT2FIXED(0.34414f)) & 0xffff0000);
3349-
int b = y_fixed + cb* FLOAT2FIXED(1.77200f);
3350-
r >>= 20;
3351-
g >>= 20;
3352-
b >>= 20;
3353-
if ((unsigned) r > 255)
3354-
r = (r < 0) ? 0 : 255;
3355-
if ((unsigned) g > 255)
3356-
g = (g < 0) ? 0 : 255;
3357-
if ((unsigned) b > 255)
3358-
b = (b < 0) ? 0 : 255;
3359-
/* BGRA byte order -- matches the SIMD paths above */
3360-
out[0] = (uint8_t)b;
3361-
out[1] = (uint8_t)g;
3362-
out[2] = (uint8_t)r;
3363-
out[3] = 255;
3364-
out += step;
3365-
}
3399+
/* Scalar tail for the remaining pixels the SIMD loop couldn't
3400+
* consume: count%8 at step==4, or step!=4, or the whole row in
3401+
* rgba mode on NEON where the SIMD loop was skipped entirely.
3402+
* Delegating to the scalar kernel keeps the two paths bit-identical
3403+
* and honours supports_rgba without duplicating the branch here. */
3404+
if (i < count)
3405+
rjpeg_YCbCr_to_RGB_row(out, y + i, pcb + i, pcr + i,
3406+
count - i, step, supports_rgba);
33663407
}
33673408
#endif
33683409

@@ -3389,22 +3430,22 @@ static void rjpeg_YCbCr_to_RGB_simd(uint8_t *out, const uint8_t *y,
33893430
static void rjpeg_upsample_YCbCr_to_BGRA_row(uint8_t *out, const uint8_t *y_row,
33903431
uint8_t *cb_near, uint8_t *cb_far,
33913432
uint8_t *cr_near, uint8_t *cr_far,
3392-
int chroma_w, int out_w)
3433+
int chroma_w, int out_w, bool supports_rgba)
33933434
{
33943435
/* Stack buffers for upsampled chroma (chroma_w*2 output pixels).
33953436
* For typical RetroArch images, chroma_w <= 960 so 1920 bytes each. */
33963437
uint8_t cb_buf[1920], cr_buf[1920];
33973438

33983439
rjpeg_resample_row_hv_2(cb_buf, cb_near, cb_far, chroma_w, 2);
33993440
rjpeg_resample_row_hv_2(cr_buf, cr_near, cr_far, chroma_w, 2);
3400-
rjpeg_YCbCr_to_RGB_row(out, y_row, cb_buf, cr_buf, out_w, 4);
3441+
rjpeg_YCbCr_to_RGB_row(out, y_row, cb_buf, cr_buf, out_w, 4, supports_rgba);
34013442
}
34023443

34033444
#if defined(__SSE2__) || defined(RJPEG_NEON)
34043445
static void rjpeg_upsample_YCbCr_to_BGRA_simd(uint8_t *out, const uint8_t *y_row,
34053446
uint8_t *cb_near, uint8_t *cb_far,
34063447
uint8_t *cr_near, uint8_t *cr_far,
3407-
int chroma_w, int out_w)
3448+
int chroma_w, int out_w, bool supports_rgba)
34083449
{
34093450
int i = 0, px = 0;
34103451
int cb_carry = 3 * cb_near[0] + cb_far[0];
@@ -3525,7 +3566,11 @@ static void rjpeg_upsample_YCbCr_to_BGRA_simd(uint8_t *out, const uint8_t *y_row
35253566
bw = _mm_srai_epi16(bws, 4);
35263567
gw = _mm_srai_epi16(gws, 4);
35273568

3528-
brb = _mm_packus_epi16(bw, rw);
3569+
/* Swap pack arg order when emitting RGBA byte order; see
3570+
* rjpeg_YCbCr_to_RGB_simd for the reasoning. */
3571+
brb = supports_rgba
3572+
? _mm_packus_epi16(rw, bw)
3573+
: _mm_packus_epi16(bw, rw);
35293574
gxb = _mm_packus_epi16(gw, xw);
35303575

35313576
t0 = _mm_unpacklo_epi8(brb, gxb);
@@ -3561,7 +3606,9 @@ static void rjpeg_upsample_YCbCr_to_BGRA_simd(uint8_t *out, const uint8_t *y_row
35613606
bw = _mm_srai_epi16(bws, 4);
35623607
gw = _mm_srai_epi16(gws, 4);
35633608

3564-
brb = _mm_packus_epi16(bw, rw);
3609+
brb = supports_rgba
3610+
? _mm_packus_epi16(rw, bw)
3611+
: _mm_packus_epi16(bw, rw);
35653612
gxb = _mm_packus_epi16(gw, xw);
35663613

35673614
t0 = _mm_unpacklo_epi8(brb, gxb);
@@ -3653,10 +3700,22 @@ static void rjpeg_upsample_YCbCr_to_BGRA_simd(uint8_t *out, const uint8_t *y_row
36533700
int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
36543701
int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
36553702

3703+
/* Match rjpeg_YCbCr_to_RGB_simd: val[0]<->val[2] swap
3704+
* selects the output byte order. B goes into the
3705+
* (y + cb1) slot and R into (y + cr0) regardless;
3706+
* only which output lane they land in changes. */
36563707
uint8x8x4_t o;
3657-
o.val[0] = vqrshrun_n_s16(vaddq_s16(yws, cb1), 4);
3708+
if (supports_rgba)
3709+
{
3710+
o.val[0] = vqrshrun_n_s16(vaddq_s16(yws, cr0), 4);
3711+
o.val[2] = vqrshrun_n_s16(vaddq_s16(yws, cb1), 4);
3712+
}
3713+
else
3714+
{
3715+
o.val[0] = vqrshrun_n_s16(vaddq_s16(yws, cb1), 4);
3716+
o.val[2] = vqrshrun_n_s16(vaddq_s16(yws, cr0), 4);
3717+
}
36583718
o.val[1] = vqrshrun_n_s16(vaddq_s16(vaddq_s16(yws, cb0), cr1), 4);
3659-
o.val[2] = vqrshrun_n_s16(vaddq_s16(yws, cr0), 4);
36603719
o.val[3] = vdup_n_u8(255);
36613720

36623721
vst4_u8(out + (px + j*8)*4, o);
@@ -3706,7 +3765,7 @@ static void rjpeg_upsample_YCbCr_to_BGRA_simd(uint8_t *out, const uint8_t *y_row
37063765

37073766
if (tail_out > op) tail_out = op;
37083767
rjpeg_YCbCr_to_RGB_row(out + px*4, y_row + px,
3709-
cb_buf, cr_buf, tail_out, 4);
3768+
cb_buf, cr_buf, tail_out, 4, supports_rgba);
37103769
}
37113770
}
37123771
}
@@ -4030,12 +4089,14 @@ static void rjpeg_iterate_resample_rows(rjpeg_t *rjpeg, unsigned max_row)
40304089
z->upsample_YCbCr_to_BGRA_kernel(out, y,
40314090
fused_cb_near, fused_cb_far,
40324091
fused_cr_near, fused_cr_far,
4033-
rjpeg->iter_res[1].w_lores, z->img_x);
4092+
rjpeg->iter_res[1].w_lores, z->img_x,
4093+
rjpeg->supports_rgba);
40344094
}
40354095
else
40364096
{
40374097
z->YCbCr_to_RGB_kernel(out, y, coutput[1],
4038-
coutput[2], z->img_x, 4);
4098+
coutput[2], z->img_x, 4,
4099+
rjpeg->supports_rgba);
40394100
}
40404101
}
40414102
}
@@ -4454,6 +4515,9 @@ int rjpeg_process_image(rjpeg_t *rjpeg, void **buf_data,
44544515
if (!rjpeg)
44554516
return IMAGE_PROCESS_ERROR;
44564517

4518+
/* Latch for the kernel callsites below. */
4519+
rjpeg->supports_rgba = supports_rgba;
4520+
44574521
/* -----------------------------------------------------------
44584522
* Phase 0 -- DECODE: either use the already-decoded data from
44594523
* the iterative path (rjpeg_iterate_image), or fall back to
@@ -4693,12 +4757,14 @@ int rjpeg_process_image(rjpeg_t *rjpeg, void **buf_data,
46934757
z->upsample_YCbCr_to_BGRA_kernel(out, y,
46944758
fused_cb_near, fused_cb_far,
46954759
fused_cr_near, fused_cr_far,
4696-
proc->res_comp[1].w_lores, z->img_x);
4760+
proc->res_comp[1].w_lores, z->img_x,
4761+
rjpeg->supports_rgba);
46974762
}
46984763
else
46994764
{
47004765
z->YCbCr_to_RGB_kernel(out, y, proc->coutput[1],
4701-
proc->coutput[2], z->img_x, proc->n);
4766+
proc->coutput[2], z->img_x, proc->n,
4767+
rjpeg->supports_rgba);
47024768
}
47034769
}
47044770
else

0 commit comments

Comments
 (0)