Skip to content

Commit 315845c

Browse files
committed
Unroll bytes/bytearray translate loop for better pipelining
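Optimize the no-deletion fast path of bytes.translate and bytearray.translate by unrolling the table-lookup loop 8×, with a scalar loop handling the remaining tail bytes. Unrolling reduces per-byte branch overhead and lets the CPU pipeline the independent table lookups. In bytes.translate, the per-byte change tracking is replaced by a single memcmp of the input against the output after translation, performed only for exact bytes objects, to decide whether the original object can be returned instead of the new one. https://claude.ai/code/session_0142fPYhFLFes4W9Tp6C3BhU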
1 parent 5faced6 commit 315845c

2 files changed

Lines changed: 37 additions & 12 deletions

File tree

Objects/bytearrayobject.c

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1683,10 +1683,21 @@ bytearray_translate_impl(PyByteArrayObject *self, PyObject *table,
16831683
input = PyByteArray_AS_STRING(input_obj);
16841684

16851685
if (vdel.len == 0 && table_chars != NULL) {
1686-
/* If no deletions are required, use faster code */
1687-
for (i = inlen; --i >= 0; ) {
1688-
c = Py_CHARMASK(*input++);
1689-
*output++ = table_chars[c];
1686+
/* If no deletions are required, use faster code.
1687+
* Unrolling allows better instruction pipelining. */
1688+
Py_ssize_t len8 = inlen & ~((Py_ssize_t)7);
1689+
for (i = 0; i < len8; i += 8) {
1690+
output[i+0] = table_chars[(unsigned char)input[i+0]];
1691+
output[i+1] = table_chars[(unsigned char)input[i+1]];
1692+
output[i+2] = table_chars[(unsigned char)input[i+2]];
1693+
output[i+3] = table_chars[(unsigned char)input[i+3]];
1694+
output[i+4] = table_chars[(unsigned char)input[i+4]];
1695+
output[i+5] = table_chars[(unsigned char)input[i+5]];
1696+
output[i+6] = table_chars[(unsigned char)input[i+6]];
1697+
output[i+7] = table_chars[(unsigned char)input[i+7]];
1698+
}
1699+
for (; i < inlen; i++) {
1700+
output[i] = table_chars[(unsigned char)input[i]];
16901701
}
16911702
goto done;
16921703
}

Objects/bytesobject.c

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2234,14 +2234,28 @@ bytes_translate_impl(PyBytesObject *self, PyObject *table,
22342234
input = PyBytes_AS_STRING(input_obj);
22352235

22362236
if (dellen == 0 && table_chars != NULL) {
2237-
/* If no deletions are required, use faster code */
2238-
for (i = inlen; --i >= 0; ) {
2239-
c = Py_CHARMASK(*input++);
2240-
if (Py_CHARMASK((*output++ = table_chars[c])) != c)
2241-
changed = 1;
2242-
}
2243-
if (!changed && PyBytes_CheckExact(input_obj)) {
2244-
Py_SETREF(result, Py_NewRef(input_obj));
2237+
/* If no deletions are required, use faster code.
2238+
* Unrolling allows better instruction pipelining. */
2239+
Py_ssize_t len8 = inlen & ~((Py_ssize_t)7);
2240+
for (i = 0; i < len8; i += 8) {
2241+
output[i+0] = table_chars[(unsigned char)input[i+0]];
2242+
output[i+1] = table_chars[(unsigned char)input[i+1]];
2243+
output[i+2] = table_chars[(unsigned char)input[i+2]];
2244+
output[i+3] = table_chars[(unsigned char)input[i+3]];
2245+
output[i+4] = table_chars[(unsigned char)input[i+4]];
2246+
output[i+5] = table_chars[(unsigned char)input[i+5]];
2247+
output[i+6] = table_chars[(unsigned char)input[i+6]];
2248+
output[i+7] = table_chars[(unsigned char)input[i+7]];
2249+
}
2250+
for (; i < inlen; i++) {
2251+
output[i] = table_chars[(unsigned char)input[i]];
2252+
}
2253+
/* Check if anything changed (for returning original object) */
2254+
if (PyBytes_CheckExact(input_obj)) {
2255+
changed = (memcmp(input, output, inlen) != 0);
2256+
if (!changed) {
2257+
Py_SETREF(result, Py_NewRef(input_obj));
2258+
}
22452259
}
22462260
PyBuffer_Release(&del_table_view);
22472261
PyBuffer_Release(&table_view);

0 commit comments

Comments
 (0)