rjpeg: honour supports_rgba in YCbCr conversion kernels

LibretroAdmin · LibretroAdmin · commit 1ab5e951c20b · 2026-04-21T19:44:38.000+02:00
rjpeg_process_image declared a supports_rgba parameter but never
read it. The decoder unconditionally emitted BGRA byte order
(reads as ARGB32 on little-endian). When the video driver requires
ABGR -- GLES without BGRA8888, which sets VIDEO_FLAG_USE_RGBA at
gl2.c:4091 -- JPEG images rendered with red and blue swapped.
Sibling decoders (rpng, rtga, rbmp) honoured supports_rgba; rjpeg
was the outlier.

This patch threads supports_rgba through the four YCbCr/upsample
kernel signatures and branches at the critical byte-order decision
inside each kernel. The branch lives out-of-loop (scalar) or at the
cheap swap point (SIMD), so the hot path cost is negligible.

Scalar rjpeg_YCbCr_to_RGB_row: two tight loops under an out-of-loop
if (supports_rgba), matching rpng's pattern.

SSE2 rjpeg_YCbCr_to_RGB_simd and rjpeg_upsample_YCbCr_to_BGRA_simd:
ternary on the first _mm_packus_epi16 argument order --
  packus(bw, rw) -> BGRA byte order (ARGB32 on LE)
  packus(rw, bw) -> RGBA byte order (ABGR32 on LE, supports_rgba)
The rest of the unpack cascade is byte-order-invariant.

NEON rjpeg_YCbCr_to_RGB_simd and rjpeg_upsample_YCbCr_to_BGRA_simd:
vst4_u8 interleaves val[0..3] into successive output bytes, so the
swap is a val[0] <-> val[2] exchange; val[1]=g and val[3]=alpha are
invariant either way.

The inlined scalar tail in rjpeg_YCbCr_to_RGB_simd is replaced with
a delegated call to rjpeg_YCbCr_to_RGB_row, eliminating a duplicated
copy that would otherwise need its own rgba branch. The fused
kernel's existing scalar tail already routed through that function.

supports_rgba arrives at rjpeg_process_image only; it is latched
onto the rjpeg_t handle and read at the four kernel callsites (two
active in rjpeg_process_image, two in the currently-disabled
iter_resample_ready path). Grayscale paths are unaffected: R=G=B=Y
so byte order is identity.

Validated with a standalone test harness on three kernel paths x
three test images (all nine combinations pass):

  paths:  scalar (-mno-sse2), SSE2 (-msse2),
          aarch64 NEON (-static under qemu-aarch64)
  images: 32x32 4:2:0 -- exercises fused upsample SIMD + tail
          36x16 4:2:0 -- chroma_w%8 != 0, forces the SIMD scalar
                         tail delegation
          32x32 4:4:4 -- use_fused=false, forces the non-fused
                         main YCbCr kernel

Two invariants are asserted: byte-swap equivalence across
supports_rgba=false vs true at every pixel, and quadrant-dominance
sanity in both BGRA and RGBA layouts (so that both decodes being
wrong in the same way can't spuriously pass).

Each SIMD rgba branch was additionally path-verified with a
sabotage diagnostic: temporarily forcing val[0]=val[2]=0xaa in
the rgba branch fails exactly the expected tests with
bytes=(170,170,170,255) and the swap invariant on hundreds of
pixels, while leaving the bgra tests and any non-targeted kernel
untouched -- confirming the code is genuinely on the test path
and not dead code shadowed by a fallback.

Unrelated finding, worth a separate patch: rjpeg's NEON auto-detect
tests defined(__ARM_NEON__) only, while modern aarch64-linux-gnu-gcc
defines __ARM_NEON (no trailing underscore). Without an explicit
-D__ARM_NEON__ or -DHAVE_NEON, rjpeg silently runs the scalar
kernel on those builds. rpng accepts both spellings at
libretro-common/formats/png/rpng.c:38; rjpeg should match.
diff --git a/libretro-common/formats/jpeg/rjpeg.c b/libretro-common/formats/jpeg/rjpeg.c
@@ -121,9 +121,14 @@ struct rjpeg
     * This overlaps the two phases and avoids the serial
     * entropy→done→resample pipeline. */
    rjpeg_resample          iter_res[4];       /* per-component resample state  */
-   uint8_t                *iter_output;       /* BGRA output buffer            */
+   uint8_t                *iter_output;       /* RGBA8888 output buffer        */
    unsigned                iter_out_row;      /* next output row to resample   */
    int                     iter_resample_ready; /* 1 = resample state inited   */
+
+   /* Output byte order selector. Latched from the rjpeg_process_image
+    * parameter and consulted by the resample+colorconvert callsites.
+    * false -> BGRA (ARGB32 on LE); true -> RGBA (ABGR32 on LE). */
+   bool                    supports_rgba;
 };
 
 #ifdef _MSC_VER
@@ -241,17 +246,18 @@ struct rjpeg_jpeg_s
    void (*dequant_idct_block_kernel)(uint8_t *out, int out_stride,
          short data[64], uint8_t *dequant);
    void (*YCbCr_to_RGB_kernel)(uint8_t *out, const uint8_t *y, const uint8_t *pcb,
-         const uint8_t *pcr, int count, int step);
+         const uint8_t *pcr, int count, int step, bool supports_rgba);
    uint8_t *(*resample_row_hv_2_kernel)(uint8_t *out, uint8_t *in_near,
          uint8_t *in_far, int w, int hs);
-   /* Fused chroma upsample (hv_2) + YCbCr→BGRA.
+   /* Fused chroma upsample (hv_2) + YCbCr->RGBA8888.
     * Eliminates the linebuf write+read round-trip by keeping upsampled
     * chroma in SIMD registers and feeding directly to color math.
+    * Output byte order (BGRA vs RGBA) is selected by supports_rgba.
     * NULL = not available; use separate resample + colorconvert. */
    void (*upsample_YCbCr_to_BGRA_kernel)(uint8_t *out, const uint8_t *y_row,
          uint8_t *cb_near, uint8_t *cb_far,
          uint8_t *cr_near, uint8_t *cr_far,
-         int chroma_w, int out_w);
+         int chroma_w, int out_w, bool supports_rgba);
 
    /* definition of jpeg image component */
    struct
@@ -3183,41 +3189,79 @@ static uint8_t *rjpeg_resample_row_generic(uint8_t *out,
 #define FLOAT2FIXED(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
 #endif
 
+/* Scalar YCbCr -> 32-bit pixel conversion in BGRA or RGBA byte order.
+ *
+ * Byte order is selected by supports_rgba:
+ *   false -> out[0..3] = B,G,R,0xff  (reads as ARGB32 on LE; default)
+ *   true  -> out[0..3] = R,G,B,0xff  (reads as ABGR32 on LE; needed by
+ *                                     GLES drivers without BGRA8888)
+ *
+ * The branch is hoisted out of the loop: two tight loops rather than
+ * per-pixel branching. Matches rpng's pattern. */
 static void rjpeg_YCbCr_to_RGB_row(uint8_t *out, const uint8_t *y,
-      const uint8_t *pcb, const uint8_t *pcr, int count, int step)
+      const uint8_t *pcb, const uint8_t *pcr, int count, int step,
+      bool supports_rgba)
 {
    int i;
-   for (i = 0; i < count; ++i)
+   if (supports_rgba)
    {
-      int y_fixed = (y[i] << 20) + (1<<19); /* rounding */
-      int cr = pcr[i] - 128;
-      int cb = pcb[i] - 128;
-      int r = y_fixed +  cr* FLOAT2FIXED(1.40200f);
-      int g = y_fixed + (cr*-FLOAT2FIXED(0.71414f)) + ((cb*-FLOAT2FIXED(0.34414f)) & 0xffff0000);
-      int b = y_fixed                               +   cb* FLOAT2FIXED(1.77200f);
-      r >>= 20;
-      g >>= 20;
-      b >>= 20;
-      if ((unsigned) r > 255)
-         r = (r < 0) ? 0 : 255;
-      if ((unsigned) g > 255)
-         g = (g < 0) ? 0 : 255;
-      if ((unsigned) b > 255)
-         b = (b < 0) ? 0 : 255;
-      /* Write BGRA byte order so the uint32 reads as ARGB --
-       * this fuses the old RGBA->ARGB swizzle pass into the
-       * color conversion itself. */
-      out[0] = (uint8_t)b;
-      out[1] = (uint8_t)g;
-      out[2] = (uint8_t)r;
-      out[3] = 255;
-      out += step;
+      for (i = 0; i < count; ++i)
+      {
+         int y_fixed = (y[i] << 20) + (1<<19); /* rounding */
+         int cr = pcr[i] - 128;
+         int cb = pcb[i] - 128;
+         int r = y_fixed +  cr* FLOAT2FIXED(1.40200f);
+         int g = y_fixed + (cr*-FLOAT2FIXED(0.71414f)) + ((cb*-FLOAT2FIXED(0.34414f)) & 0xffff0000);
+         int b = y_fixed                               +   cb* FLOAT2FIXED(1.77200f);
+         r >>= 20;
+         g >>= 20;
+         b >>= 20;
+         if ((unsigned) r > 255)
+            r = (r < 0) ? 0 : 255;
+         if ((unsigned) g > 255)
+            g = (g < 0) ? 0 : 255;
+         if ((unsigned) b > 255)
+            b = (b < 0) ? 0 : 255;
+         out[0] = (uint8_t)r;
+         out[1] = (uint8_t)g;
+         out[2] = (uint8_t)b;
+         out[3] = 255;
+         out += step;
+      }
+   }
+   else
+   {
+      for (i = 0; i < count; ++i)
+      {
+         int y_fixed = (y[i] << 20) + (1<<19); /* rounding */
+         int cr = pcr[i] - 128;
+         int cb = pcb[i] - 128;
+         int r = y_fixed +  cr* FLOAT2FIXED(1.40200f);
+         int g = y_fixed + (cr*-FLOAT2FIXED(0.71414f)) + ((cb*-FLOAT2FIXED(0.34414f)) & 0xffff0000);
+         int b = y_fixed                               +   cb* FLOAT2FIXED(1.77200f);
+         r >>= 20;
+         g >>= 20;
+         b >>= 20;
+         if ((unsigned) r > 255)
+            r = (r < 0) ? 0 : 255;
+         if ((unsigned) g > 255)
+            g = (g < 0) ? 0 : 255;
+         if ((unsigned) b > 255)
+            b = (b < 0) ? 0 : 255;
+         /* BGRA byte order -- reads as ARGB32 on LE. */
+         out[0] = (uint8_t)b;
+         out[1] = (uint8_t)g;
+         out[2] = (uint8_t)r;
+         out[3] = 255;
+         out += step;
+      }
    }
 }
 
 #if defined(__SSE2__) || defined(RJPEG_NEON)
 static void rjpeg_YCbCr_to_RGB_simd(uint8_t *out, const uint8_t *y,
-      const uint8_t *pcb, const uint8_t *pcr, int count, int step)
+      const uint8_t *pcb, const uint8_t *pcr, int count, int step,
+      bool supports_rgba)
 {
    int i = 0;
 
@@ -3267,11 +3311,14 @@ static void rjpeg_YCbCr_to_RGB_simd(uint8_t *out, const uint8_t *y,
          __m128i bw = _mm_srai_epi16(bws, 4);
          __m128i gw = _mm_srai_epi16(gws, 4);
 
-         /* back to byte, set up for transpose
-          * Pack B in low half, R in high half (was R,B) so the
-          * interleave produces BGRA byte order directly -- this
-          * eliminates the separate RGBA->ARGB swizzle pass. */
-         __m128i brb = _mm_packus_epi16(bw, rw);
+         /* Back to byte, set up for transpose. The first pack decides
+          * whether R or B goes into the low position of each output pixel:
+          *   packus(bw, rw) -> BGRA byte order (ARGB32 on LE)
+          *   packus(rw, bw) -> RGBA byte order (ABGR32 on LE, supports_rgba)
+          * The rest of the unpack cascade is identical either way. */
+         __m128i brb = supports_rgba
+            ? _mm_packus_epi16(rw, bw)
+            : _mm_packus_epi16(bw, rw);
          __m128i gxb = _mm_packus_epi16(gw, xw);
 
          /* transpose to interleave channels */
@@ -3292,7 +3339,6 @@ static void rjpeg_YCbCr_to_RGB_simd(uint8_t *out, const uint8_t *y,
    /* in this version, step=3 support would be easy to add. but is there demand? */
    if (step == 4)
    {
-      /* this is a fairly straightforward implementation and not super-optimized. */
       uint8x8_t signflip = vdup_n_u8(0x80);
       int16x8_t cr_const0 = vdupq_n_s16(   (short) ( 1.40200f*4096.0f+0.5f));
       int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
@@ -3324,45 +3370,40 @@ static void rjpeg_YCbCr_to_RGB_simd(uint8_t *out, const uint8_t *y,
          int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
          int16x8_t bws = vaddq_s16(yws, cb1);
 
-         /* undo scaling, round, convert to byte
-          * Output BGRA byte order directly to eliminate
-          * the separate RGBA->ARGB swizzle pass. */
-         o.val[0] = vqrshrun_n_s16(bws, 4);
+         /* Undo scaling, round, convert to byte. vst4_u8 interleaves
+          * val[0..3] into successive output bytes; whichever of R/B
+          * we put into val[0] becomes the pixel's low byte:
+          *   val[0]=b, val[2]=r -> BGRA byte order (ARGB32 on LE)
+          *   val[0]=r, val[2]=b -> RGBA byte order (ABGR32 on LE, supports_rgba)
+          * val[1]=g and val[3]=alpha are invariant. */
+         if (supports_rgba)
+         {
+            o.val[0] = vqrshrun_n_s16(rws, 4);
+            o.val[2] = vqrshrun_n_s16(bws, 4);
+         }
+         else
+         {
+            o.val[0] = vqrshrun_n_s16(bws, 4);
+            o.val[2] = vqrshrun_n_s16(rws, 4);
+         }
          o.val[1] = vqrshrun_n_s16(gws, 4);
-         o.val[2] = vqrshrun_n_s16(rws, 4);
          o.val[3] = vdup_n_u8(255);
 
-         /* store, interleaving b/g/r/a */
+         /* store, interleaving val[0..3]: b/g/r/a or r/g/b/a */
          vst4_u8(out, o);
          out += 8*4;
       }
    }
 #endif
 
-   for (; i < count; ++i)
-   {
-      int y_fixed = (y[i] << 20) + (1<<19); /* rounding */
-      int cr      = pcr[i] - 128;
-      int cb      = pcb[i] - 128;
-      int r       = y_fixed + cr* FLOAT2FIXED(1.40200f);
-      int g       = y_fixed + cr*-FLOAT2FIXED(0.71414f) + ((cb*-FLOAT2FIXED(0.34414f)) & 0xffff0000);
-      int b       = y_fixed                             +   cb* FLOAT2FIXED(1.77200f);
-      r >>= 20;
-      g >>= 20;
-      b >>= 20;
-      if ((unsigned) r > 255)
-         r = (r < 0) ? 0 : 255;
-      if ((unsigned) g > 255)
-         g = (g < 0) ? 0 : 255;
-      if ((unsigned) b > 255)
-         b = (b < 0) ? 0 : 255;
-      /* BGRA byte order -- matches the SIMD paths above */
-      out[0] = (uint8_t)b;
-      out[1] = (uint8_t)g;
-      out[2] = (uint8_t)r;
-      out[3] = 255;
-      out += step;
-   }
+   /* Scalar tail for the remaining pixels the SIMD loop couldn't
+    * consume: count%8 at step==4, or step!=4. Both SIMD paths
+    * handle supports_rgba themselves, so rgba mode never skips them.
+    * Delegating to the scalar kernel keeps the tail bit-identical to
+    * it and honours supports_rgba without duplicating the branch here. */
+   if (i < count)
+      rjpeg_YCbCr_to_RGB_row(out, y + i, pcb + i, pcr + i,
+            count - i, step, supports_rgba);
 }
 #endif
 
@@ -3389,22 +3430,22 @@ static void rjpeg_YCbCr_to_RGB_simd(uint8_t *out, const uint8_t *y,
 static void rjpeg_upsample_YCbCr_to_BGRA_row(uint8_t *out, const uint8_t *y_row,
       uint8_t *cb_near, uint8_t *cb_far,
       uint8_t *cr_near, uint8_t *cr_far,
-      int chroma_w, int out_w)
+      int chroma_w, int out_w, bool supports_rgba)
 {
    /* Stack buffers for upsampled chroma (chroma_w*2 output pixels).
     * For typical RetroArch images, chroma_w <= 960 so 1920 bytes each. */
    uint8_t cb_buf[1920], cr_buf[1920];
 
    rjpeg_resample_row_hv_2(cb_buf, cb_near, cb_far, chroma_w, 2);
    rjpeg_resample_row_hv_2(cr_buf, cr_near, cr_far, chroma_w, 2);
-   rjpeg_YCbCr_to_RGB_row(out, y_row, cb_buf, cr_buf, out_w, 4);
+   rjpeg_YCbCr_to_RGB_row(out, y_row, cb_buf, cr_buf, out_w, 4, supports_rgba);
 }
 
 #if defined(__SSE2__) || defined(RJPEG_NEON)
 static void rjpeg_upsample_YCbCr_to_BGRA_simd(uint8_t *out, const uint8_t *y_row,
       uint8_t *cb_near, uint8_t *cb_far,
       uint8_t *cr_near, uint8_t *cr_far,
-      int chroma_w, int out_w)
+      int chroma_w, int out_w, bool supports_rgba)
 {
    int i = 0, px = 0;
    int cb_carry = 3 * cb_near[0] + cb_far[0];
@@ -3525,7 +3566,11 @@ static void rjpeg_upsample_YCbCr_to_BGRA_simd(uint8_t *out, const uint8_t *y_row
             bw  = _mm_srai_epi16(bws, 4);
             gw  = _mm_srai_epi16(gws, 4);
 
-            brb = _mm_packus_epi16(bw, rw);
+            /* Swap pack arg order when emitting RGBA byte order; see
+             * rjpeg_YCbCr_to_RGB_simd for the reasoning. */
+            brb = supports_rgba
+               ? _mm_packus_epi16(rw, bw)
+               : _mm_packus_epi16(bw, rw);
             gxb = _mm_packus_epi16(gw, xw);
 
             t0 = _mm_unpacklo_epi8(brb, gxb);
@@ -3561,7 +3606,9 @@ static void rjpeg_upsample_YCbCr_to_BGRA_simd(uint8_t *out, const uint8_t *y_row
             bw  = _mm_srai_epi16(bws, 4);
             gw  = _mm_srai_epi16(gws, 4);
 
-            brb = _mm_packus_epi16(bw, rw);
+            brb = supports_rgba
+               ? _mm_packus_epi16(rw, bw)
+               : _mm_packus_epi16(bw, rw);
             gxb = _mm_packus_epi16(gw, xw);
 
             t0 = _mm_unpacklo_epi8(brb, gxb);
@@ -3653,10 +3700,22 @@ static void rjpeg_upsample_YCbCr_to_BGRA_simd(uint8_t *out, const uint8_t *y_row
                int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
                int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
 
+               /* Match rjpeg_YCbCr_to_RGB_simd: a val[0]<->val[2] swap
+                * selects the output byte order. B is always computed
+                * as (y + cb1) and R as (y + cr0); only the output
+                * lane each one lands in changes. */
                uint8x8x4_t o;
-               o.val[0] = vqrshrun_n_s16(vaddq_s16(yws, cb1), 4);
+               if (supports_rgba)
+               {
+                  o.val[0] = vqrshrun_n_s16(vaddq_s16(yws, cr0), 4);
+                  o.val[2] = vqrshrun_n_s16(vaddq_s16(yws, cb1), 4);
+               }
+               else
+               {
+                  o.val[0] = vqrshrun_n_s16(vaddq_s16(yws, cb1), 4);
+                  o.val[2] = vqrshrun_n_s16(vaddq_s16(yws, cr0), 4);
+               }
                o.val[1] = vqrshrun_n_s16(vaddq_s16(vaddq_s16(yws, cb0), cr1), 4);
-               o.val[2] = vqrshrun_n_s16(vaddq_s16(yws, cr0), 4);
                o.val[3] = vdup_n_u8(255);
 
                vst4_u8(out + (px + j*8)*4, o);
@@ -3706,7 +3765,7 @@ static void rjpeg_upsample_YCbCr_to_BGRA_simd(uint8_t *out, const uint8_t *y_row
 
          if (tail_out > op) tail_out = op;
          rjpeg_YCbCr_to_RGB_row(out + px*4, y_row + px,
-               cb_buf, cr_buf, tail_out, 4);
+               cb_buf, cr_buf, tail_out, 4, supports_rgba);
       }
    }
 }
@@ -4030,12 +4089,14 @@ static void rjpeg_iterate_resample_rows(rjpeg_t *rjpeg, unsigned max_row)
                z->upsample_YCbCr_to_BGRA_kernel(out, y,
                      fused_cb_near, fused_cb_far,
                      fused_cr_near, fused_cr_far,
-                     rjpeg->iter_res[1].w_lores, z->img_x);
+                     rjpeg->iter_res[1].w_lores, z->img_x,
+                     rjpeg->supports_rgba);
             }
             else
             {
                z->YCbCr_to_RGB_kernel(out, y, coutput[1],
-                     coutput[2], z->img_x, 4);
+                     coutput[2], z->img_x, 4,
+                     rjpeg->supports_rgba);
             }
          }
       }
@@ -4454,6 +4515,9 @@ int rjpeg_process_image(rjpeg_t *rjpeg, void **buf_data,
    if (!rjpeg)
       return IMAGE_PROCESS_ERROR;
 
+   /* Latch for the kernel callsites below. */
+   rjpeg->supports_rgba = supports_rgba;
+
    /* -----------------------------------------------------------
     * Phase 0 -- DECODE: either use the already-decoded data from
     * the iterative path (rjpeg_iterate_image), or fall back to
@@ -4693,12 +4757,14 @@ int rjpeg_process_image(rjpeg_t *rjpeg, void **buf_data,
                      z->upsample_YCbCr_to_BGRA_kernel(out, y,
                            fused_cb_near, fused_cb_far,
                            fused_cr_near, fused_cr_far,
-                           proc->res_comp[1].w_lores, z->img_x);
+                           proc->res_comp[1].w_lores, z->img_x,
+                           rjpeg->supports_rgba);
                   }
                   else
                   {
                      z->YCbCr_to_RGB_kernel(out, y, proc->coutput[1],
-                           proc->coutput[2], z->img_x, proc->n);
+                           proc->coutput[2], z->img_x, proc->n,
+                           rjpeg->supports_rgba);
                   }
                }
                else