@@ -67,6 +67,24 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
6767#include <windows.h>
6868#endif
6969
70+ /* SIMD support for str.translate optimization */
71+ #if defined(__GNUC__ ) || defined(__clang__ )
72+ # if defined(__x86_64__ )
73+ # include <emmintrin.h> /* SSE2 - baseline for x86-64 */
74+ # ifdef __SSSE3__
75+ # include <tmmintrin.h> /* SSSE3 for pshufb */
76+ # define _Py_TRANSLATE_HAVE_SSSE3 1
77+ # endif
78+ # ifdef __AVX2__
79+ # include <immintrin.h>
80+ # define _Py_TRANSLATE_HAVE_AVX2 1
81+ # endif
82+ # elif defined(__aarch64__ )
83+ # include <arm_neon.h>
84+ # define _Py_TRANSLATE_HAVE_NEON 1
85+ # endif
86+ #endif
87+
7088#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
7189# include "pycore_fileutils.h" // _Py_LocaleUsesNonUnicodeWchar()
7290#endif
@@ -9264,6 +9282,241 @@ unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
92649282 return ret ;
92659283}
92669284
9285+ /*
9286+ * SIMD-accelerated ASCII translation helpers.
9287+ *
9288+ * These use the nibble-split lookup technique to perform parallel table
9289+ * lookups using SSSE3 pshufb, AVX2 vpshufb, or NEON tbl instructions.
9290+ *
9291+ * The 128-byte ASCII table is reorganized into 8 rows of 16 bytes each,
9292+ * indexed by the high nibble (bits 6:4) to select the row and the low
9293+ * nibble (bits 3:0) to select the column within the row.
9294+ */
9295+
#if defined(_Py_TRANSLATE_HAVE_SSSE3) || defined(_Py_TRANSLATE_HAVE_AVX2)
/*
 * Reshape the 128-byte ASCII translation table into 8 rows of 16 bytes
 * for the nibble-split kernels: row = high nibble (0-7), column = low
 * nibble, i.e. reorg[r][c] == table[r*16 + c].
 *
 * Since the rows are laid out contiguously in row-major order, the
 * "reorganization" is a straight copy of all 128 bytes — use memcpy
 * instead of a hand-rolled double loop.
 */
static inline void
_Py_translate_reorganize_table(const Py_UCS1 table[128],
                               Py_UCS1 reorg[8][16])
{
    memcpy(reorg, table, 128);
}
#endif
9309+
#ifdef _Py_TRANSLATE_HAVE_SSSE3
/*
 * SSSE3 translation kernel: 16 input bytes per iteration via pshufb.
 *
 * Each byte's high nibble selects one of the 8 sub-tables and its low
 * nibble is the pshufb index into that sub-table.  A byte's high nibble
 * equals at most one of the row constants 0..7, so the per-row compare
 * masks are mutually exclusive: the row results can simply be
 * OR-accumulated, removing the serial and/andnot/or blend chain of the
 * previous version.  Bytes >= 0x80 match no row and produce 0, exactly
 * as before (the caller only passes ASCII data).
 */
static inline void
_Py_translate_ssse3(const Py_UCS1 *in, Py_UCS1 *out, Py_ssize_t len,
                    const Py_UCS1 reorg[8][16])
{
    /* Load the 8 lookup sub-tables once. */
    __m128i rows[8];
    for (int r = 0; r < 8; r++) {
        rows[r] = _mm_loadu_si128((const __m128i *)reorg[r]);
    }

    const __m128i nib_mask = _mm_set1_epi8(0x0F);
    Py_ssize_t pos = 0;

    /* Vector loop: 16 bytes at a time. */
    for (; pos + 16 <= len; pos += 16) {
        __m128i chunk = _mm_loadu_si128((const __m128i *)(in + pos));

        __m128i lo = _mm_and_si128(chunk, nib_mask);
        __m128i hi = _mm_and_si128(_mm_srli_epi16(chunk, 4), nib_mask);

        /* OR together the (disjoint) per-row lookups. */
        __m128i res = _mm_setzero_si128();
        for (int r = 0; r < 8; r++) {
            __m128i hit = _mm_cmpeq_epi8(hi, _mm_set1_epi8(r));
            res = _mm_or_si128(res,
                    _mm_and_si128(_mm_shuffle_epi8(rows[r], lo), hit));
        }

        _mm_storeu_si128((__m128i *)(out + pos), res);
    }

    /* Scalar tail for the last len % 16 bytes. */
    for (; pos < len; pos++) {
        Py_UCS1 ch = in[pos];
        out[pos] = reorg[ch >> 4][ch & 0x0F];
    }
}
#endif /* _Py_TRANSLATE_HAVE_SSSE3 */
9363+
#ifdef _Py_TRANSLATE_HAVE_AVX2
/*
 * AVX2 translation kernel: 32 input bytes per iteration.
 *
 * Same nibble-split technique as the SSSE3 kernel.  vpshufb shuffles
 * within each 128-bit lane independently, so every sub-table is
 * broadcast into both lanes.  The per-row masks are mutually exclusive
 * (high nibble equals at most one row constant), so row results are
 * OR-accumulated instead of blended.  The 16-byte SSE tail reuses the
 * low lane of the already-loaded broadcast tables rather than reloading
 * them from memory; _mm_shuffle_epi8 is always available here because
 * this path includes <immintrin.h>.
 */
static inline void
_Py_translate_avx2(const Py_UCS1 *in, Py_UCS1 *out, Py_ssize_t len,
                   const Py_UCS1 reorg[8][16])
{
    /* Each 16-byte sub-table duplicated into both 128-bit lanes. */
    __m256i rows[8];
    for (int r = 0; r < 8; r++) {
        rows[r] = _mm256_broadcastsi128_si256(
                      _mm_loadu_si128((const __m128i *)reorg[r]));
    }

    const __m256i nib_mask = _mm256_set1_epi8(0x0F);
    Py_ssize_t pos = 0;

    /* Vector loop: 32 bytes at a time. */
    for (; pos + 32 <= len; pos += 32) {
        __m256i chunk = _mm256_loadu_si256((const __m256i *)(in + pos));

        __m256i lo = _mm256_and_si256(chunk, nib_mask);
        __m256i hi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), nib_mask);

        __m256i res = _mm256_setzero_si256();
        for (int r = 0; r < 8; r++) {
            __m256i hit = _mm256_cmpeq_epi8(hi, _mm256_set1_epi8(r));
            res = _mm256_or_si256(res,
                    _mm256_and_si256(_mm256_shuffle_epi8(rows[r], lo), hit));
        }

        _mm256_storeu_si256((__m256i *)(out + pos), res);
    }

    /* One 16-byte SSE step for a remaining 16..31-byte chunk. */
    if (pos + 16 <= len) {
        const __m128i nib_mask128 = _mm_set1_epi8(0x0F);
        __m128i chunk = _mm_loadu_si128((const __m128i *)(in + pos));

        __m128i lo = _mm_and_si128(chunk, nib_mask128);
        __m128i hi = _mm_and_si128(_mm_srli_epi16(chunk, 4), nib_mask128);

        __m128i res = _mm_setzero_si128();
        for (int r = 0; r < 8; r++) {
            /* Low lane of the broadcast table == the original sub-table. */
            __m128i tab = _mm256_castsi256_si128(rows[r]);
            __m128i hit = _mm_cmpeq_epi8(hi, _mm_set1_epi8(r));
            res = _mm_or_si128(res,
                    _mm_and_si128(_mm_shuffle_epi8(tab, lo), hit));
        }

        _mm_storeu_si128((__m128i *)(out + pos), res);
        pos += 16;
    }

    /* Scalar tail for the last len % 16 bytes. */
    for (; pos < len; pos++) {
        Py_UCS1 ch = in[pos];
        out[pos] = reorg[ch >> 4][ch & 0x0F];
    }
}
#endif /* _Py_TRANSLATE_HAVE_AVX2 */
9443+
#ifdef _Py_TRANSLATE_HAVE_NEON
/*
 * AArch64 NEON translation kernel: 16 bytes per iteration.
 *
 * The TBL instruction indexes up to 64 table bytes at once and yields 0
 * for any out-of-range index, so the 128-byte table is covered by two
 * vqtbl4q_u8 lookups instead of eight single-table lookups plus
 * compare/blend:
 *   - lookup #1 indexes table[0..63] with the input byte directly
 *     (bytes >= 64 fall out of range and give 0);
 *   - lookup #2 indexes table[64..127] with input-64 (bytes < 64 wrap
 *     to >= 192, out of range, and give 0).
 * ORing the two yields table[ch] for ch < 128 and 0 otherwise — exactly
 * the result of the previous compare-and-blend implementation (the
 * caller only passes ASCII data anyway).
 */
static inline void
_Py_translate_neon(const Py_UCS1 *in, Py_UCS1 *out, Py_ssize_t len,
                   const Py_UCS1 table[128])
{
    uint8x16x4_t lo_tab, hi_tab;
    lo_tab.val[0] = vld1q_u8(table);
    lo_tab.val[1] = vld1q_u8(table + 16);
    lo_tab.val[2] = vld1q_u8(table + 32);
    lo_tab.val[3] = vld1q_u8(table + 48);
    hi_tab.val[0] = vld1q_u8(table + 64);
    hi_tab.val[1] = vld1q_u8(table + 80);
    hi_tab.val[2] = vld1q_u8(table + 96);
    hi_tab.val[3] = vld1q_u8(table + 112);

    const uint8x16_t bias = vdupq_n_u8(64);
    Py_ssize_t pos = 0;

    /* Vector loop: 16 bytes at a time, two TBL lookups per chunk. */
    for (; pos + 16 <= len; pos += 16) {
        uint8x16_t chunk = vld1q_u8(in + pos);
        uint8x16_t lo = vqtbl4q_u8(lo_tab, chunk);
        uint8x16_t hi = vqtbl4q_u8(hi_tab, vsubq_u8(chunk, bias));
        vst1q_u8(out + pos, vorrq_u8(lo, hi));
    }

    /* Scalar tail for the last len % 16 bytes. */
    for (; pos < len; pos++) {
        out[pos] = table[in[pos]];
    }
}
#endif /* _Py_TRANSLATE_HAVE_NEON */
9493+
9494+ /*
9495+ * Main SIMD dispatch function for ASCII translation.
9496+ * Called when the translation table is fully populated with no deletions.
9497+ */
9498+ static inline void
9499+ _Py_translate_simd (const Py_UCS1 * in , Py_UCS1 * out , Py_ssize_t len ,
9500+ const Py_UCS1 table [128 ])
9501+ {
9502+ #if defined(_Py_TRANSLATE_HAVE_AVX2 )
9503+ Py_UCS1 reorg [8 ][16 ];
9504+ _Py_translate_reorganize_table (table , reorg );
9505+ _Py_translate_avx2 (in , out , len , reorg );
9506+ #elif defined(_Py_TRANSLATE_HAVE_SSSE3 )
9507+ Py_UCS1 reorg [8 ][16 ];
9508+ _Py_translate_reorganize_table (table , reorg );
9509+ _Py_translate_ssse3 (in , out , len , reorg );
9510+ #elif defined(_Py_TRANSLATE_HAVE_NEON )
9511+ _Py_translate_neon (in , out , len , table );
9512+ #else
9513+ /* Scalar fallback */
9514+ for (Py_ssize_t i = 0 ; i < len ; i ++ ) {
9515+ out [i ] = table [in [i ]];
9516+ }
9517+ #endif
9518+ }
9519+
92679520/* Fast path for ascii => ascii translation. Return 1 if the whole string
92689521 was translated into writer, return 0 if the input string was partially
92699522 translated into writer, raise an exception and return -1 on error. */
@@ -9274,15 +9527,17 @@ unicode_fast_translate(PyObject *input, PyObject *mapping,
92749527{
92759528 Py_UCS1 ascii_table [128 ], ch , ch2 ;
92769529 Py_ssize_t len ;
9277- const Py_UCS1 * in , * end ;
9530+ const Py_UCS1 * in , * end , * in_start ;
92789531 Py_UCS1 * out ;
92799532 int res = 0 ;
9533+ int has_deletion = 0 ; /* Track if any 0xfe (deletion) markers seen */
92809534
92819535 len = PyUnicode_GET_LENGTH (input );
92829536
92839537 memset (ascii_table , 0xff , 128 );
92849538
92859539 in = PyUnicode_1BYTE_DATA (input );
9540+ in_start = in ;
92869541 end = in + len ;
92879542
92889543 assert (PyUnicode_IS_ASCII (writer -> buffer ));
@@ -9302,13 +9557,50 @@ unicode_fast_translate(PyObject *input, PyObject *mapping,
93029557 ch2 = ascii_table [ch ];
93039558 }
93049559 if (ch2 == 0xfe ) {
9560+ has_deletion = 1 ;
93059561 if (ignore )
93069562 continue ;
93079563 goto exit ;
93089564 }
93099565 assert (ch2 < 128 );
93109566 * out = ch2 ;
93119567 out ++ ;
9568+
9569+ /*
9570+ * SIMD optimization: After processing 64+ bytes without hitting
9571+ * any deletion markers, check if we can switch to SIMD for the
9572+ * remaining data. This requires:
9573+ * 1. No deletions in the translation
9574+ * 2. All remaining characters are in the already-populated table
9575+ *
9576+ * The check is only done every 64 bytes to minimize overhead.
9577+ */
9578+ #if defined(_Py_TRANSLATE_HAVE_AVX2 ) || defined(_Py_TRANSLATE_HAVE_SSSE3 ) || \
9579+ defined(_Py_TRANSLATE_HAVE_NEON )
9580+ if (!has_deletion &&
9581+ (in - in_start ) >= 64 &&
9582+ ((in - in_start ) & 63 ) == 0 && /* Check every 64 bytes */
9583+ (end - in ) >= 32 ) /* At least 32 bytes remaining */
9584+ {
9585+ /* Check if all remaining characters are already in the table */
9586+ const Py_UCS1 * check = in + 1 ;
9587+ int can_use_simd = 1 ;
9588+ while (check < end && can_use_simd ) {
9589+ if (ascii_table [* check ] >= 0xfe ) {
9590+ can_use_simd = 0 ;
9591+ }
9592+ check ++ ;
9593+ }
9594+
9595+ if (can_use_simd ) {
9596+ /* All remaining chars are known - use SIMD for the rest */
9597+ Py_ssize_t remaining = end - in - 1 ;
9598+ _Py_translate_simd (in + 1 , out , remaining , ascii_table );
9599+ out += remaining ;
9600+ in = end - 1 ; /* Will be incremented by loop */
9601+ }
9602+ }
9603+ #endif
93129604 }
93139605 res = 1 ;
93149606
0 commit comments