@@ -121,9 +121,14 @@ struct rjpeg
121121 * This overlaps the two phases and avoids the serial
122122 * entropy→done→resample pipeline. */
123123 rjpeg_resample iter_res [4 ]; /* per-component resample state */
124- uint8_t * iter_output ; /* BGRA output buffer */
124+ uint8_t * iter_output ; /* RGBA8888 output buffer */
125125 unsigned iter_out_row ; /* next output row to resample */
126126 int iter_resample_ready ; /* 1 = resample state inited */
127+
128+ /* Output byte order selector. Latched from the rjpeg_process_image
129+ * parameter and consulted by the resample+colorconvert callsites.
130+ * false -> BGRA (ARGB32 on LE); true -> RGBA (ABGR32 on LE). */
131+ bool supports_rgba ;
127132};
128133
129134#ifdef _MSC_VER
@@ -241,17 +246,18 @@ struct rjpeg_jpeg_s
241246 void (* dequant_idct_block_kernel )(uint8_t * out , int out_stride ,
242247 short data [64 ], uint8_t * dequant );
243248 void (* YCbCr_to_RGB_kernel )(uint8_t * out , const uint8_t * y , const uint8_t * pcb ,
244- const uint8_t * pcr , int count , int step );
249+ const uint8_t * pcr , int count , int step , bool supports_rgba );
245250 uint8_t * (* resample_row_hv_2_kernel )(uint8_t * out , uint8_t * in_near ,
246251 uint8_t * in_far , int w , int hs );
247- /* Fused chroma upsample (hv_2) + YCbCr→BGRA .
252+ /* Fused chroma upsample (hv_2) + YCbCr->RGBA8888 .
248253 * Eliminates the linebuf write+read round-trip by keeping upsampled
249254 * chroma in SIMD registers and feeding directly to color math.
255+ * Output byte order (BGRA vs RGBA) is selected by supports_rgba.
250256 * NULL = not available; use separate resample + colorconvert. */
251257 void (* upsample_YCbCr_to_BGRA_kernel )(uint8_t * out , const uint8_t * y_row ,
252258 uint8_t * cb_near , uint8_t * cb_far ,
253259 uint8_t * cr_near , uint8_t * cr_far ,
254- int chroma_w , int out_w );
260+ int chroma_w , int out_w , bool supports_rgba );
255261
256262 /* definition of jpeg image component */
257263 struct
@@ -3183,41 +3189,79 @@ static uint8_t *rjpeg_resample_row_generic(uint8_t *out,
31833189#define FLOAT2FIXED (x ) (((int) ((x) * 4096.0f + 0.5f)) << 8)
31843190#endif
31853191
3192+ /* Scalar YCbCr -> RGBA8888 (BGRA byte order) or RGBA (RGBA byte order).
3193+ *
3194+ * Byte order is selected by supports_rgba:
3195+ * false -> out[0..3] = B,G,R,0xff (reads as ARGB32 on LE; default)
3196+ * true -> out[0..3] = R,G,B,0xff (reads as ABGR32 on LE; needed by
3197+ * GLES drivers without BGRA8888)
3198+ *
3199+ * The branch is hoisted out of the loop: two tight loops rather than
3200+ * per-pixel branching. Matches rpng's pattern. */
31863201static void rjpeg_YCbCr_to_RGB_row (uint8_t * out , const uint8_t * y ,
3187- const uint8_t * pcb , const uint8_t * pcr , int count , int step )
3202+ const uint8_t * pcb , const uint8_t * pcr , int count , int step ,
3203+ bool supports_rgba )
31883204{
31893205 int i ;
3190- for ( i = 0 ; i < count ; ++ i )
3206+ if ( supports_rgba )
31913207 {
3192- int y_fixed = (y [i ] << 20 ) + (1 <<19 ); /* rounding */
3193- int cr = pcr [i ] - 128 ;
3194- int cb = pcb [i ] - 128 ;
3195- int r = y_fixed + cr * FLOAT2FIXED (1.40200f );
3196- int g = y_fixed + (cr * - FLOAT2FIXED (0.71414f )) + ((cb * - FLOAT2FIXED (0.34414f )) & 0xffff0000 );
3197- int b = y_fixed + cb * FLOAT2FIXED (1.77200f );
3198- r >>= 20 ;
3199- g >>= 20 ;
3200- b >>= 20 ;
3201- if ((unsigned ) r > 255 )
3202- r = (r < 0 ) ? 0 : 255 ;
3203- if ((unsigned ) g > 255 )
3204- g = (g < 0 ) ? 0 : 255 ;
3205- if ((unsigned ) b > 255 )
3206- b = (b < 0 ) ? 0 : 255 ;
3207- /* Write BGRA byte order so the uint32 reads as ARGB --
3208- * this fuses the old RGBA->ARGB swizzle pass into the
3209- * color conversion itself. */
3210- out [0 ] = (uint8_t )b ;
3211- out [1 ] = (uint8_t )g ;
3212- out [2 ] = (uint8_t )r ;
3213- out [3 ] = 255 ;
3214- out += step ;
3208+ for (i = 0 ; i < count ; ++ i )
3209+ {
3210+ int y_fixed = (y [i ] << 20 ) + (1 <<19 ); /* rounding */
3211+ int cr = pcr [i ] - 128 ;
3212+ int cb = pcb [i ] - 128 ;
3213+ int r = y_fixed + cr * FLOAT2FIXED (1.40200f );
3214+ int g = y_fixed + (cr * - FLOAT2FIXED (0.71414f )) + ((cb * - FLOAT2FIXED (0.34414f )) & 0xffff0000 );
3215+ int b = y_fixed + cb * FLOAT2FIXED (1.77200f );
3216+ r >>= 20 ;
3217+ g >>= 20 ;
3218+ b >>= 20 ;
3219+ if ((unsigned ) r > 255 )
3220+ r = (r < 0 ) ? 0 : 255 ;
3221+ if ((unsigned ) g > 255 )
3222+ g = (g < 0 ) ? 0 : 255 ;
3223+ if ((unsigned ) b > 255 )
3224+ b = (b < 0 ) ? 0 : 255 ;
3225+ out [0 ] = (uint8_t )r ;
3226+ out [1 ] = (uint8_t )g ;
3227+ out [2 ] = (uint8_t )b ;
3228+ out [3 ] = 255 ;
3229+ out += step ;
3230+ }
3231+ }
3232+ else
3233+ {
3234+ for (i = 0 ; i < count ; ++ i )
3235+ {
3236+ int y_fixed = (y [i ] << 20 ) + (1 <<19 ); /* rounding */
3237+ int cr = pcr [i ] - 128 ;
3238+ int cb = pcb [i ] - 128 ;
3239+ int r = y_fixed + cr * FLOAT2FIXED (1.40200f );
3240+ int g = y_fixed + (cr * - FLOAT2FIXED (0.71414f )) + ((cb * - FLOAT2FIXED (0.34414f )) & 0xffff0000 );
3241+ int b = y_fixed + cb * FLOAT2FIXED (1.77200f );
3242+ r >>= 20 ;
3243+ g >>= 20 ;
3244+ b >>= 20 ;
3245+ if ((unsigned ) r > 255 )
3246+ r = (r < 0 ) ? 0 : 255 ;
3247+ if ((unsigned ) g > 255 )
3248+ g = (g < 0 ) ? 0 : 255 ;
3249+ if ((unsigned ) b > 255 )
3250+ b = (b < 0 ) ? 0 : 255 ;
3251+ /* BGRA byte order -- reads as ARGB32 on LE. */
3252+ out [0 ] = (uint8_t )b ;
3253+ out [1 ] = (uint8_t )g ;
3254+ out [2 ] = (uint8_t )r ;
3255+ out [3 ] = 255 ;
3256+ out += step ;
3257+ }
32153258 }
32163259}
32173260
32183261#if defined(__SSE2__ ) || defined(RJPEG_NEON )
32193262static void rjpeg_YCbCr_to_RGB_simd (uint8_t * out , const uint8_t * y ,
3220- const uint8_t * pcb , const uint8_t * pcr , int count , int step )
3263+ const uint8_t * pcb , const uint8_t * pcr , int count , int step ,
3264+ bool supports_rgba )
32213265{
32223266 int i = 0 ;
32233267
@@ -3267,11 +3311,14 @@ static void rjpeg_YCbCr_to_RGB_simd(uint8_t *out, const uint8_t *y,
32673311 __m128i bw = _mm_srai_epi16 (bws , 4 );
32683312 __m128i gw = _mm_srai_epi16 (gws , 4 );
32693313
3270- /* back to byte, set up for transpose
3271- * Pack B in low half, R in high half (was R,B) so the
3272- * interleave produces BGRA byte order directly -- this
3273- * eliminates the separate RGBA->ARGB swizzle pass. */
3274- __m128i brb = _mm_packus_epi16 (bw , rw );
3314+ /* Back to byte, set up for transpose. The first pack decides
3315+ * whether R or B goes into the low position of each output pixel:
3316+ * packus(bw, rw) -> BGRA byte order (ARGB32 on LE)
3317+ * packus(rw, bw) -> RGBA byte order (ABGR32 on LE, supports_rgba)
3318+ * The rest of the unpack cascade is identical either way. */
3319+ __m128i brb = supports_rgba
3320+ ? _mm_packus_epi16 (rw , bw )
3321+ : _mm_packus_epi16 (bw , rw );
32753322 __m128i gxb = _mm_packus_epi16 (gw , xw );
32763323
32773324 /* transpose to interleave channels */
@@ -3292,7 +3339,6 @@ static void rjpeg_YCbCr_to_RGB_simd(uint8_t *out, const uint8_t *y,
32923339 /* in this version, step=3 support would be easy to add. but is there demand? */
32933340 if (step == 4 )
32943341 {
3295- /* this is a fairly straightforward implementation and not super-optimized. */
32963342 uint8x8_t signflip = vdup_n_u8 (0x80 );
32973343 int16x8_t cr_const0 = vdupq_n_s16 ( (short ) ( 1.40200f * 4096.0f + 0.5f ));
32983344 int16x8_t cr_const1 = vdupq_n_s16 ( - (short ) ( 0.71414f * 4096.0f + 0.5f ));
@@ -3324,45 +3370,40 @@ static void rjpeg_YCbCr_to_RGB_simd(uint8_t *out, const uint8_t *y,
33243370 int16x8_t gws = vaddq_s16 (vaddq_s16 (yws , cb0 ), cr1 );
33253371 int16x8_t bws = vaddq_s16 (yws , cb1 );
33263372
3327- /* undo scaling, round, convert to byte
3328- * Output BGRA byte order directly to eliminate
3329- * the separate RGBA->ARGB swizzle pass. */
3330- o .val [0 ] = vqrshrun_n_s16 (bws , 4 );
3373+ /* Undo scaling, round, convert to byte. vst4_u8 interleaves
3374+ * val[0..3] into successive output bytes; whichever of R/B
3375+ * we put into val[0] becomes the pixel's low byte:
3376+ * val[0]=b, val[2]=r -> BGRA byte order (ARGB32 on LE)
3377+ * val[0]=r, val[2]=b -> RGBA byte order (ABGR32 on LE, supports_rgba)
3378+ * val[1]=g and val[3]=alpha are invariant. */
3379+ if (supports_rgba )
3380+ {
3381+ o .val [0 ] = vqrshrun_n_s16 (rws , 4 );
3382+ o .val [2 ] = vqrshrun_n_s16 (bws , 4 );
3383+ }
3384+ else
3385+ {
3386+ o .val [0 ] = vqrshrun_n_s16 (bws , 4 );
3387+ o .val [2 ] = vqrshrun_n_s16 (rws , 4 );
3388+ }
33313389 o .val [1 ] = vqrshrun_n_s16 (gws , 4 );
3332- o .val [2 ] = vqrshrun_n_s16 (rws , 4 );
33333390 o .val [3 ] = vdup_n_u8 (255 );
33343391
3335- /* store, interleaving b/g/r/a */
3392+ /* store, interleaving low/1/2/alpha */
33363393 vst4_u8 (out , o );
33373394 out += 8 * 4 ;
33383395 }
33393396 }
33403397#endif
33413398
3342- for (; i < count ; ++ i )
3343- {
3344- int y_fixed = (y [i ] << 20 ) + (1 <<19 ); /* rounding */
3345- int cr = pcr [i ] - 128 ;
3346- int cb = pcb [i ] - 128 ;
3347- int r = y_fixed + cr * FLOAT2FIXED (1.40200f );
3348- int g = y_fixed + cr * - FLOAT2FIXED (0.71414f ) + ((cb * - FLOAT2FIXED (0.34414f )) & 0xffff0000 );
3349- int b = y_fixed + cb * FLOAT2FIXED (1.77200f );
3350- r >>= 20 ;
3351- g >>= 20 ;
3352- b >>= 20 ;
3353- if ((unsigned ) r > 255 )
3354- r = (r < 0 ) ? 0 : 255 ;
3355- if ((unsigned ) g > 255 )
3356- g = (g < 0 ) ? 0 : 255 ;
3357- if ((unsigned ) b > 255 )
3358- b = (b < 0 ) ? 0 : 255 ;
3359- /* BGRA byte order -- matches the SIMD paths above */
3360- out [0 ] = (uint8_t )b ;
3361- out [1 ] = (uint8_t )g ;
3362- out [2 ] = (uint8_t )r ;
3363- out [3 ] = 255 ;
3364- out += step ;
3365- }
3399+ /* Scalar tail for the remaining pixels the SIMD loop couldn't
3400+ * consume: count%8 at step==4, or step!=4, or the whole row in
3401+ * rgba mode on NEON where the SIMD loop was skipped entirely.
3402+ * Delegating to the scalar kernel keeps the two paths bit-identical
3403+ * and honours supports_rgba without duplicating the branch here. */
3404+ if (i < count )
3405+ rjpeg_YCbCr_to_RGB_row (out , y + i , pcb + i , pcr + i ,
3406+ count - i , step , supports_rgba );
33663407}
33673408#endif
33683409
@@ -3389,22 +3430,22 @@ static void rjpeg_YCbCr_to_RGB_simd(uint8_t *out, const uint8_t *y,
33893430static void rjpeg_upsample_YCbCr_to_BGRA_row (uint8_t * out , const uint8_t * y_row ,
33903431 uint8_t * cb_near , uint8_t * cb_far ,
33913432 uint8_t * cr_near , uint8_t * cr_far ,
3392- int chroma_w , int out_w )
3433+ int chroma_w , int out_w , bool supports_rgba )
33933434{
33943435 /* Stack buffers for upsampled chroma (chroma_w*2 output pixels).
33953436 * For typical RetroArch images, chroma_w <= 960 so 1920 bytes each. */
33963437 uint8_t cb_buf [1920 ], cr_buf [1920 ];
33973438
33983439 rjpeg_resample_row_hv_2 (cb_buf , cb_near , cb_far , chroma_w , 2 );
33993440 rjpeg_resample_row_hv_2 (cr_buf , cr_near , cr_far , chroma_w , 2 );
3400- rjpeg_YCbCr_to_RGB_row (out , y_row , cb_buf , cr_buf , out_w , 4 );
3441+ rjpeg_YCbCr_to_RGB_row (out , y_row , cb_buf , cr_buf , out_w , 4 , supports_rgba );
34013442}
34023443
34033444#if defined(__SSE2__ ) || defined(RJPEG_NEON )
34043445static void rjpeg_upsample_YCbCr_to_BGRA_simd (uint8_t * out , const uint8_t * y_row ,
34053446 uint8_t * cb_near , uint8_t * cb_far ,
34063447 uint8_t * cr_near , uint8_t * cr_far ,
3407- int chroma_w , int out_w )
3448+ int chroma_w , int out_w , bool supports_rgba )
34083449{
34093450 int i = 0 , px = 0 ;
34103451 int cb_carry = 3 * cb_near [0 ] + cb_far [0 ];
@@ -3525,7 +3566,11 @@ static void rjpeg_upsample_YCbCr_to_BGRA_simd(uint8_t *out, const uint8_t *y_row
35253566 bw = _mm_srai_epi16 (bws , 4 );
35263567 gw = _mm_srai_epi16 (gws , 4 );
35273568
3528- brb = _mm_packus_epi16 (bw , rw );
3569+ /* Swap pack arg order when emitting RGBA byte order; see
3570+ * rjpeg_YCbCr_to_RGB_simd for the reasoning. */
3571+ brb = supports_rgba
3572+ ? _mm_packus_epi16 (rw , bw )
3573+ : _mm_packus_epi16 (bw , rw );
35293574 gxb = _mm_packus_epi16 (gw , xw );
35303575
35313576 t0 = _mm_unpacklo_epi8 (brb , gxb );
@@ -3561,7 +3606,9 @@ static void rjpeg_upsample_YCbCr_to_BGRA_simd(uint8_t *out, const uint8_t *y_row
35613606 bw = _mm_srai_epi16 (bws , 4 );
35623607 gw = _mm_srai_epi16 (gws , 4 );
35633608
3564- brb = _mm_packus_epi16 (bw , rw );
3609+ brb = supports_rgba
3610+ ? _mm_packus_epi16 (rw , bw )
3611+ : _mm_packus_epi16 (bw , rw );
35653612 gxb = _mm_packus_epi16 (gw , xw );
35663613
35673614 t0 = _mm_unpacklo_epi8 (brb , gxb );
@@ -3653,10 +3700,22 @@ static void rjpeg_upsample_YCbCr_to_BGRA_simd(uint8_t *out, const uint8_t *y_row
36533700 int16x8_t cr1 = vqdmulhq_s16 (crw , cr_const1 );
36543701 int16x8_t cb1 = vqdmulhq_s16 (cbw , cb_const1 );
36553702
3703+ /* Match rjpeg_YCbCr_to_RGB_simd: val[0]<->val[2] swap
3704+ * selects the output byte order. B goes into the
3705+ * (y + cb1) slot and R into (y + cr0) regardless;
3706+ * only which output lane they land in changes. */
36563707 uint8x8x4_t o ;
3657- o .val [0 ] = vqrshrun_n_s16 (vaddq_s16 (yws , cb1 ), 4 );
3708+ if (supports_rgba )
3709+ {
3710+ o .val [0 ] = vqrshrun_n_s16 (vaddq_s16 (yws , cr0 ), 4 );
3711+ o .val [2 ] = vqrshrun_n_s16 (vaddq_s16 (yws , cb1 ), 4 );
3712+ }
3713+ else
3714+ {
3715+ o .val [0 ] = vqrshrun_n_s16 (vaddq_s16 (yws , cb1 ), 4 );
3716+ o .val [2 ] = vqrshrun_n_s16 (vaddq_s16 (yws , cr0 ), 4 );
3717+ }
36583718 o .val [1 ] = vqrshrun_n_s16 (vaddq_s16 (vaddq_s16 (yws , cb0 ), cr1 ), 4 );
3659- o .val [2 ] = vqrshrun_n_s16 (vaddq_s16 (yws , cr0 ), 4 );
36603719 o .val [3 ] = vdup_n_u8 (255 );
36613720
36623721 vst4_u8 (out + (px + j * 8 )* 4 , o );
@@ -3706,7 +3765,7 @@ static void rjpeg_upsample_YCbCr_to_BGRA_simd(uint8_t *out, const uint8_t *y_row
37063765
37073766 if (tail_out > op ) tail_out = op ;
37083767 rjpeg_YCbCr_to_RGB_row (out + px * 4 , y_row + px ,
3709- cb_buf , cr_buf , tail_out , 4 );
3768+ cb_buf , cr_buf , tail_out , 4 , supports_rgba );
37103769 }
37113770 }
37123771}
@@ -4030,12 +4089,14 @@ static void rjpeg_iterate_resample_rows(rjpeg_t *rjpeg, unsigned max_row)
40304089 z -> upsample_YCbCr_to_BGRA_kernel (out , y ,
40314090 fused_cb_near , fused_cb_far ,
40324091 fused_cr_near , fused_cr_far ,
4033- rjpeg -> iter_res [1 ].w_lores , z -> img_x );
4092+ rjpeg -> iter_res [1 ].w_lores , z -> img_x ,
4093+ rjpeg -> supports_rgba );
40344094 }
40354095 else
40364096 {
40374097 z -> YCbCr_to_RGB_kernel (out , y , coutput [1 ],
4038- coutput [2 ], z -> img_x , 4 );
4098+ coutput [2 ], z -> img_x , 4 ,
4099+ rjpeg -> supports_rgba );
40394100 }
40404101 }
40414102 }
@@ -4454,6 +4515,9 @@ int rjpeg_process_image(rjpeg_t *rjpeg, void **buf_data,
44544515 if (!rjpeg )
44554516 return IMAGE_PROCESS_ERROR ;
44564517
4518+ /* Latch for the kernel callsites below. */
4519+ rjpeg -> supports_rgba = supports_rgba ;
4520+
44574521 /* -----------------------------------------------------------
44584522 * Phase 0 -- DECODE: either use the already-decoded data from
44594523 * the iterative path (rjpeg_iterate_image), or fall back to
@@ -4693,12 +4757,14 @@ int rjpeg_process_image(rjpeg_t *rjpeg, void **buf_data,
46934757 z -> upsample_YCbCr_to_BGRA_kernel (out , y ,
46944758 fused_cb_near , fused_cb_far ,
46954759 fused_cr_near , fused_cr_far ,
4696- proc -> res_comp [1 ].w_lores , z -> img_x );
4760+ proc -> res_comp [1 ].w_lores , z -> img_x ,
4761+ rjpeg -> supports_rgba );
46974762 }
46984763 else
46994764 {
47004765 z -> YCbCr_to_RGB_kernel (out , y , proc -> coutput [1 ],
4701- proc -> coutput [2 ], z -> img_x , proc -> n );
4766+ proc -> coutput [2 ], z -> img_x , proc -> n ,
4767+ rjpeg -> supports_rgba );
47024768 }
47034769 }
47044770 else
0 commit comments