@@ -67,24 +67,28 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 #include <windows.h>
 #endif
 
-/* SIMD support for str.translate optimization */
+/* SIMD support for str.translate optimization.
+ *
+ * Uses GCC/Clang vector extensions for portable SIMD code.
+ * Only the shuffle operation (table lookup with runtime indices)
+ * requires platform-specific intrinsics.
+ */
 #if defined(__GNUC__) || defined(__clang__)
-#  if defined(__x86_64__)
-#    include <emmintrin.h>    /* SSE2 - baseline for x86-64 */
-#    ifdef __SSSE3__
-#      include <tmmintrin.h>  /* SSSE3 for pshufb */
-#      define _Py_TRANSLATE_HAVE_SSSE3 1
-#    endif
-#    ifdef __AVX2__
-#      include <immintrin.h>
-#      define _Py_TRANSLATE_HAVE_AVX2 1
-#    endif
+#  if defined(__x86_64__) && defined(__SSSE3__)
+#    include <tmmintrin.h>
+#    define _Py_TRANSLATE_HAVE_SIMD 1
 #  elif defined(__aarch64__)
 #    include <arm_neon.h>
-#    define _Py_TRANSLATE_HAVE_NEON 1
+#    define _Py_TRANSLATE_HAVE_SIMD 1
 #  endif
 #endif
 
+#ifdef _Py_TRANSLATE_HAVE_SIMD
+/* Portable 16-byte vector types */
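+/* With GCC/Clang vector extensions, ordinary C operators (&, |, ~, ==)
+ * applied to these types compile to element-wise SIMD instructions, and
+ * aligned(1) permits loads and stores at unaligned addresses. */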
+typedef uint8_t _Py_vec16u8 __attribute__((vector_size(16), aligned(1)));
+typedef int8_t _Py_vec16i8 __attribute__((vector_size(16), aligned(1)));
+#endif
+
 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
 #  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
 #endif
@@ -9283,203 +9287,94 @@ unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
 }
 
 /*
- * SIMD-accelerated ASCII translation helpers.
- *
- * These use the nibble-split lookup technique to perform parallel table
- * lookups using SSSE3 pshufb, AVX2 vpshufb, or NEON tbl instructions.
+ * SIMD-accelerated ASCII translation helper.
  *
- * The 128-byte ASCII table is reorganized into 8 rows of 16 bytes each,
+ * Uses the nibble-split lookup technique to perform parallel table lookups.
+ * The 128-byte ASCII table is organized as 8 rows of 16 bytes each,
  * indexed by the high nibble (bits 6:4) to select the row and the low
  * nibble (bits 3:0) to select the column within the row.
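+ * For example, 'a' (0x61) has high nibble 6 and low nibble 1, so it is
+ * looked up at row 6, column 1 -- table[0x61] in the flat layout.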
+ *
+ * This implementation uses GCC/Clang vector extensions for portable code,
+ * with only the shuffle operation requiring platform-specific intrinsics.
  */
 
-#if defined(_Py_TRANSLATE_HAVE_SSSE3) || defined(_Py_TRANSLATE_HAVE_AVX2)
-/* Reorganize table for nibble-split lookup: 8 rows x 16 columns */
-static inline void
-_Py_translate_reorganize_table(const Py_UCS1 table[128],
-                               Py_UCS1 reorg[8][16])
-{
-    for (int row = 0; row < 8; row++) {
-        for (int col = 0; col < 16; col++) {
-            reorg[row][col] = table[row * 16 + col];
-        }
-    }
-}
-#endif
+#ifdef _Py_TRANSLATE_HAVE_SIMD
 
-#ifdef _Py_TRANSLATE_HAVE_SSSE3
-/*
- * SSSE3 implementation using pshufb for parallel table lookup.
- * Processes 16 bytes at a time.
- */
-static inline void
-_Py_translate_ssse3(const Py_UCS1 *in, Py_UCS1 *out, Py_ssize_t len,
-                    const Py_UCS1 reorg[8][16])
+/* Platform-specific shuffle: result[i] = table[indices[i] & 0x0F] */
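+/* Note: for indices outside 0..15 the two instructions differ (pshufb
+ * zeroes a lane whose index has bit 7 set, tbl zeroes lanes with index
+ * >= 16); callers mask indices to 0..15 first, so both behave as a
+ * plain 16-entry table lookup here. */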
+static inline _Py_vec16u8
+_Py_vec_shuffle(_Py_vec16u8 table, _Py_vec16u8 indices)
 {
-    /* Load the 8 lookup sub-tables */
-    __m128i tables[8];
-    for (int i = 0; i < 8; i++) {
-        tables[i] = _mm_loadu_si128((const __m128i *)reorg[i]);
-    }
-
-    __m128i low_nibble_mask = _mm_set1_epi8(0x0F);
-
-    Py_ssize_t i = 0;
-
-    /* Main SIMD loop - 16 bytes per iteration */
-    while (i + 16 <= len) {
-        __m128i input = _mm_loadu_si128((const __m128i *)(in + i));
-
-        /* Extract nibbles */
-        __m128i low_nibble = _mm_and_si128(input, low_nibble_mask);
-        __m128i high_nibble = _mm_and_si128(
-            _mm_srli_epi16(input, 4), low_nibble_mask);
-
-        /* Perform lookups and blend based on high nibble */
-        __m128i result = _mm_setzero_si128();
-
-        for (int r = 0; r < 8; r++) {
-            __m128i lookup = _mm_shuffle_epi8(tables[r], low_nibble);
-            __m128i row_val = _mm_set1_epi8(r);
-            __m128i mask = _mm_cmpeq_epi8(high_nibble, row_val);
-            result = _mm_or_si128(
-                _mm_and_si128(lookup, mask),
-                _mm_andnot_si128(mask, result)
-            );
-        }
-
-        _mm_storeu_si128((__m128i *)(out + i), result);
-        i += 16;
-    }
-
-    /* Scalar tail */
-    while (i < len) {
-        Py_UCS1 ch = in[i];
-        out[i] = reorg[ch >> 4][ch & 0x0F];
-        i++;
-    }
+#if defined(__x86_64__) && defined(__SSSE3__)
+    return (_Py_vec16u8)_mm_shuffle_epi8((__m128i)table, (__m128i)indices);
+#elif defined(__aarch64__)
+    return (_Py_vec16u8)vqtbl1q_u8((uint8x16_t)table, (uint8x16_t)indices);
+#endif
 }
-#endif /* _Py_TRANSLATE_HAVE_SSSE3 */
 
-#ifdef _Py_TRANSLATE_HAVE_AVX2
-/*
- * AVX2 implementation - processes 32 bytes at a time.
- * Uses the same nibble-split technique with 256-bit vectors.
- */
-static inline void
-_Py_translate_avx2(const Py_UCS1 *in, Py_UCS1 *out, Py_ssize_t len,
-                   const Py_UCS1 reorg[8][16])
+/* Platform-specific right shift by 4 bits */
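+/* (A plain "v >> 4" on the vector type would also work with current
+ * GCC/Clang; the explicit intrinsics keep the generated instruction
+ * predictable on both targets.) */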
+static inline _Py_vec16u8
+_Py_vec_shift_right_4(_Py_vec16u8 v)
 {
-    /* Load tables, duplicated for AVX2 */
-    __m256i tables[8];
-    for (int i = 0; i < 8; i++) {
-        __m128i t = _mm_loadu_si128((const __m128i *)reorg[i]);
-        tables[i] = _mm256_broadcastsi128_si256(t);
-    }
-
-    __m256i low_nibble_mask = _mm256_set1_epi8(0x0F);
-
-    Py_ssize_t i = 0;
-
-    /* Main SIMD loop - 32 bytes per iteration */
-    while (i + 32 <= len) {
-        __m256i input = _mm256_loadu_si256((const __m256i *)(in + i));
-
-        __m256i low_nibble = _mm256_and_si256(input, low_nibble_mask);
-        __m256i high_nibble = _mm256_and_si256(
-            _mm256_srli_epi16(input, 4), low_nibble_mask);
-
-        __m256i result = _mm256_setzero_si256();
-
-        for (int r = 0; r < 8; r++) {
-            __m256i lookup = _mm256_shuffle_epi8(tables[r], low_nibble);
-            __m256i row_val = _mm256_set1_epi8(r);
-            __m256i mask = _mm256_cmpeq_epi8(high_nibble, row_val);
-            result = _mm256_or_si256(
-                _mm256_and_si256(lookup, mask),
-                _mm256_andnot_si256(mask, result)
-            );
-        }
-
-        _mm256_storeu_si256((__m256i *)(out + i), result);
-        i += 32;
-    }
-
-    /* Handle remaining 16+ bytes with SSE */
-#ifdef _Py_TRANSLATE_HAVE_SSSE3
-    if (i + 16 <= len) {
-        __m128i tables128[8];
-        for (int j = 0; j < 8; j++) {
-            tables128[j] = _mm_loadu_si128((const __m128i *)reorg[j]);
-        }
-
-        __m128i input = _mm_loadu_si128((const __m128i *)(in + i));
-        __m128i low_nibble = _mm_and_si128(input, _mm_set1_epi8(0x0F));
-        __m128i high_nibble = _mm_and_si128(
-            _mm_srli_epi16(input, 4), _mm_set1_epi8(0x0F));
-
-        __m128i result = _mm_setzero_si128();
-        for (int r = 0; r < 8; r++) {
-            __m128i lookup = _mm_shuffle_epi8(tables128[r], low_nibble);
-            __m128i mask = _mm_cmpeq_epi8(high_nibble, _mm_set1_epi8(r));
-            result = _mm_or_si128(
-                _mm_and_si128(lookup, mask),
-                _mm_andnot_si128(mask, result)
-            );
-        }
-        _mm_storeu_si128((__m128i *)(out + i), result);
-        i += 16;
-    }
+#if defined(__x86_64__) && defined(__SSSE3__)
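+    /* SSE2 has no 8-bit shift, so shift 16-bit lanes instead; the bits
+     * dragged across byte boundaries are cleared by the caller's
+     * "& nibble_mask". */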
+    return (_Py_vec16u8)_mm_srli_epi16((__m128i)v, 4);
+#elif defined(__aarch64__)
+    return (_Py_vec16u8)vshrq_n_u8((uint8x16_t)v, 4);
 #endif
-
-    /* Scalar tail */
-    while (i < len) {
-        Py_UCS1 ch = in[i];
-        out[i] = reorg[ch >> 4][ch & 0x0F];
-        i++;
-    }
 }
-#endif /* _Py_TRANSLATE_HAVE_AVX2 */
 
-#ifdef _Py_TRANSLATE_HAVE_NEON
 /*
- * ARM NEON implementation using tbl instruction for table lookup.
+ * Unified SIMD translation - works on both x86 SSSE3+ and ARM NEON.
+ * Processes 16 bytes at a time using portable vector operations.
  */
 static inline void
-_Py_translate_neon(const Py_UCS1 *in, Py_UCS1 *out, Py_ssize_t len,
+_Py_translate_simd(const Py_UCS1 *in, Py_UCS1 *out, Py_ssize_t len,
                    const Py_UCS1 table[128])
 {
-    /* Load the table as 8 NEON vectors (16 bytes each) */
-    uint8x16_t tables[8];
+    /* Load the 8 lookup sub-tables (16 bytes each, by high nibble) */
+    _Py_vec16u8 tables[8];
     for (int i = 0; i < 8; i++) {
-        tables[i] = vld1q_u8(&table[i * 16]);
+        tables[i] = *(const _Py_vec16u8 *)&table[i * 16];
     }
 
-    uint8x16_t low_nibble_mask = vdupq_n_u8(0x0F);
+    /* Mask for extracting low nibble */
+    _Py_vec16u8 nibble_mask = (_Py_vec16u8){
+        0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
+        0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F
+    };
 
     Py_ssize_t i = 0;
 
+    /* Main SIMD loop - 16 bytes per iteration */
     while (i + 16 <= len) {
-        uint8x16_t input = vld1q_u8(in + i);
+        _Py_vec16u8 input = *(const _Py_vec16u8 *)(in + i);
 
         /* Extract nibbles */
-        uint8x16_t low_nibble = vandq_u8(input, low_nibble_mask);
-        uint8x16_t high_nibble = vandq_u8(vshrq_n_u8(input, 4), low_nibble_mask);
-
-        /* Perform lookups using vqtbl1q and blend */
-        uint8x16_t result = vdupq_n_u8(0);
-
-        for (int r = 0; r < 8; r++) {
-            uint8x16_t lookup = vqtbl1q_u8(tables[r], low_nibble);
-            uint8x16_t row_val = vdupq_n_u8(r);
-            uint8x16_t mask = vceqq_u8(high_nibble, row_val);
-            result = vorrq_u8(
-                vandq_u8(lookup, mask),
-                vbicq_u8(result, mask)
-            );
-        }
+        _Py_vec16u8 low_nibble = input & nibble_mask;
+        _Py_vec16u8 high_nibble = _Py_vec_shift_right_4(input) & nibble_mask;
 
-        vst1q_u8(out + i, result);
+        /* Perform lookups and blend based on high nibble */
+        _Py_vec16u8 result = (_Py_vec16u8){0};
+
+        /* Unrolled: lookup in each of 8 sub-tables, blend by high nibble match */
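+        /* Vector == compares lane-wise and yields 0x00 or 0xFF per lane,
+         * so the mask selects this row's lookup bytes where the high
+         * nibble matches and keeps the running result elsewhere. The
+         * unrolling makes r a literal constant in each expansion. */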
+#define _Py_TRANSLATE_BLEND(r) do { \
+        _Py_vec16u8 lookup = _Py_vec_shuffle(tables[r], low_nibble); \
+        _Py_vec16u8 row_bcast = (_Py_vec16u8){r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r}; \
+        _Py_vec16i8 mask = (_Py_vec16i8)(high_nibble == row_bcast); \
+        result = (lookup & (_Py_vec16u8)mask) | (result & ~(_Py_vec16u8)mask); \
+    } while (0)
+
+        _Py_TRANSLATE_BLEND(0);
+        _Py_TRANSLATE_BLEND(1);
+        _Py_TRANSLATE_BLEND(2);
+        _Py_TRANSLATE_BLEND(3);
+        _Py_TRANSLATE_BLEND(4);
+        _Py_TRANSLATE_BLEND(5);
+        _Py_TRANSLATE_BLEND(6);
+        _Py_TRANSLATE_BLEND(7);
+
+#undef _Py_TRANSLATE_BLEND
+
+        *(_Py_vec16u8 *)(out + i) = result;
         i += 16;
     }
 
@@ -9489,34 +9384,21 @@ _Py_translate_neon(const Py_UCS1 *in, Py_UCS1 *out, Py_ssize_t len,
         i++;
     }
 }
-#endif /* _Py_TRANSLATE_HAVE_NEON */
 
-/*
- * Main SIMD dispatch function for ASCII translation.
- * Called when the translation table is fully populated with no deletions.
- */
+#else /* !_Py_TRANSLATE_HAVE_SIMD */
+
+/* Scalar fallback when no SIMD is available */
 static inline void
 _Py_translate_simd(const Py_UCS1 *in, Py_UCS1 *out, Py_ssize_t len,
                    const Py_UCS1 table[128])
 {
-#if defined(_Py_TRANSLATE_HAVE_AVX2)
-    Py_UCS1 reorg[8][16];
-    _Py_translate_reorganize_table(table, reorg);
-    _Py_translate_avx2(in, out, len, reorg);
-#elif defined(_Py_TRANSLATE_HAVE_SSSE3)
-    Py_UCS1 reorg[8][16];
-    _Py_translate_reorganize_table(table, reorg);
-    _Py_translate_ssse3(in, out, len, reorg);
-#elif defined(_Py_TRANSLATE_HAVE_NEON)
-    _Py_translate_neon(in, out, len, table);
-#else
-    /* Scalar fallback */
     for (Py_ssize_t i = 0; i < len; i++) {
         out[i] = table[in[i]];
     }
-#endif
 }
 
+#endif /* _Py_TRANSLATE_HAVE_SIMD */
+
 /* Fast path for ascii => ascii translation. Return 1 if the whole string
    was translated into writer, return 0 if the input string was partially
    translated into writer, raise an exception and return -1 on error. */
@@ -9575,8 +9457,7 @@ unicode_fast_translate(PyObject *input, PyObject *mapping,
  *
  * The check is only done every 64 bytes to minimize overhead.
  */
-#if defined(_Py_TRANSLATE_HAVE_AVX2) || defined(_Py_TRANSLATE_HAVE_SSSE3) || \
-    defined(_Py_TRANSLATE_HAVE_NEON)
+#ifdef _Py_TRANSLATE_HAVE_SIMD
     if (!has_deletion &&
         (in - in_start) >= 64 &&
         ((in - in_start) & 63) == 0 &&   /* Check every 64 bytes */