1616#include "pyconfig.h"
1717#include "Python.h"
1818#include "hashlib.h"
19- #include "pycore_strhex.h" // _Py_strhex()
19+ #include "pycore_cpuinfo.h" // py_cpuid_features
20+ #include "pycore_strhex.h" // _Py_strhex()
2021#include "pycore_typeobject.h"
2122#include "pycore_moduleobject.h"
2223
23- // QUICK CPU AUTODETECTION
24- //
25- // See https://github.com/python/cpython/pull/119316 -- we only enable
26- // vectorized versions for Intel CPUs, even though HACL*'s "vec128" modules also
27- // run on ARM NEON. (We could enable them on POWER -- but I don't have access to
28- // a test machine to see if that speeds anything up.)
29- //
30- // Note that configure.ac and the rest of the build are written in such a way
31- // that if the configure script finds suitable flags to compile HACL's SIMD128
32- // (resp. SIMD256) files, then Hacl_Hash_Blake2b_Simd128.c (resp. ...) will be
33- // pulled into the build automatically, and then only the CPU autodetection will
34- // need to be updated here.
35-
36- #if defined(__x86_64__ ) && defined(__GNUC__ )
37- #include <cpuid.h>
38- #elif defined(_M_X64 )
39- #include <intrin.h>
40- #endif
41-
4224#include <stdbool.h>
4325
4426// SIMD256 can't be compiled on macOS ARM64, and performance of SIMD128 isn't
5133# undef HACL_CAN_COMPILE_SIMD256
5234#endif
5335
// Bit masks for the CPUID output registers examined by detect_cpu_features().
// ECX and EDX masks are applied to the leaf 1 results, the EBX mask to the
// leaf 7 result.
54- // ECX (CPUID leaf 1)
55- #define ECX_SSE3 (1 << 0)
// ECX_SSSE3 is only referenced from a commented-out line in
// detect_cpu_features().
56- #define ECX_SSSE3 (1 << 9)
57- #define ECX_SSE4_1 (1 << 19)
58- #define ECX_SSE4_2 (1 << 20)
59- #define ECX_AVX (1 << 28)
60-
61- // EBX (CPUID leaf 7, sub-leaf 0)
62- #define EBX_AVX2 (1 << 5)
63-
64- // EDX (CPUID leaf 1)
65- #define EDX_SSE (1 << 25)
66- #define EDX_SSE2 (1 << 26)
67- #define EDX_CMOV (1 << 15)
68-
// Cached result of the CPU feature probe performed by detect_cpu_features().
69- // zero-initialized by default
70- typedef struct {
// One boolean per CPUID feature bit tested by detect_cpu_features().
71- bool sse , sse2 , sse3 , sse41 , sse42 , cmov , avx , avx2 ;
// Set to true once the probe has run; detect_cpu_features() does nothing
// when it is already set.
72- bool done ;
73- } cpu_flags ;
74-
75- void detect_cpu_features (cpu_flags * flags ) {
76- if (!flags -> done ) {
77- int eax1 = 0 , ebx1 = 0 , ecx1 = 0 , edx1 = 0 ;
78- int eax7 = 0 , ebx7 = 0 , ecx7 = 0 , edx7 = 0 ;
79- #if defined(__x86_64__ ) && defined(__GNUC__ )
80- __cpuid_count (1 , 0 , eax1 , ebx1 , ecx1 , edx1 );
81- __cpuid_count (7 , 0 , eax7 , ebx7 , ecx7 , edx7 );
82- #elif defined(_M_X64 )
83- int info1 [4 ] = { 0 };
84- int info7 [4 ] = { 0 };
85- __cpuidex (info1 , 1 , 0 );
86- __cpuidex (info7 , 7 , 0 );
87- eax1 = info1 [0 ];
88- ebx1 = info1 [1 ];
89- ecx1 = info1 [2 ];
90- edx1 = info1 [3 ];
91- eax7 = info7 [0 ];
92- ebx7 = info7 [1 ];
93- ecx7 = info7 [2 ];
94- edx7 = info7 [3 ];
95- #endif
96- (void ) eax1 ; (void ) ebx1 ; (void ) ecx1 ; (void ) edx1 ;
97- (void ) eax7 ; (void ) ebx7 ; (void ) ecx7 ; (void ) edx7 ;
98-
99-
100- flags -> avx = (ecx1 & ECX_AVX ) != 0 ;
101-
102- flags -> avx2 = (ebx7 & EBX_AVX2 ) != 0 ;
103-
104- flags -> sse = (edx1 & EDX_SSE ) != 0 ;
105- flags -> sse2 = (edx1 & EDX_SSE2 ) != 0 ;
106- flags -> cmov = (edx1 & EDX_CMOV ) != 0 ;
107-
108- flags -> sse3 = (ecx1 & ECX_SSE3 ) != 0 ;
109- /* ssse3 = (ecx1 & ECX_SSSE3) != 0; */
110- flags -> sse41 = (ecx1 & ECX_SSE4_1 ) != 0 ;
111- flags -> sse42 = (ecx1 & ECX_SSE4_2 ) != 0 ;
112-
113- flags -> done = true;
114- }
115- }
116-
117- #ifdef HACL_CAN_COMPILE_SIMD128
118- static inline bool has_simd128 (cpu_flags * flags ) {
119- // For now this is Intel-only, could conceivably be #ifdef'd to something
120- // else.
121- return flags -> sse && flags -> sse2 && flags -> sse3 && flags -> sse41 && flags -> sse42 && flags -> cmov ;
122- }
123- #endif
124-
125- #ifdef HACL_CAN_COMPILE_SIMD256
126- static inline bool has_simd256 (cpu_flags * flags ) {
127- return flags -> avx && flags -> avx2 ;
128- }
129- #endif
130-
13136// Small mismatch between the variable names Python defines as part of configure
13237// and the ones HACL* expects to be set in order to enable those headers.
13338#define HACL_CAN_COMPILE_VEC128 HACL_CAN_COMPILE_SIMD128
@@ -154,9 +59,31 @@ PyDoc_STRVAR(blake2mod__doc__,
// Per-module state of the blake2 extension module.
15459typedef struct {
// Type objects for the two hash types; blake2b_type is created in
// blake2_exec() with PyType_FromModuleAndSpec().
15560 PyTypeObject * blake2b_type ;
15661 PyTypeObject * blake2s_type ;
// CPU feature flags filled in by detect_cpu_features().
157- cpu_flags flags ;
62+
// Runtime switches computed by blake2_init_cpu_features() and consulted by
// type_to_impl() to pick the vectorized variants (Blake2s_128 / Blake2b_256).
63+ bool can_run_simd128 ;
64+ bool can_run_simd256 ;
15865} Blake2State ;
15966
67+ static void
68+ blake2_init_cpu_features (Blake2State * state )
69+ {
70+ py_cpuid_features flags ;
71+ _Py_cpuid_detect_features (& flags );
72+ #if HACL_CAN_COMPILE_SIMD128
73+ state -> can_run_simd128 = flags .sse && flags .sse2 && flags .sse3
74+ && flags .sse41 && flags .sse42
75+ && flags .cmov ;
76+ #else
77+ state -> can_run_simd128 = false;
78+ #endif
79+
80+ #if HACL_CAN_COMPILE_SIMD256
81+ state -> can_run_simd256 = flags .avx && flags .avx2 ;
82+ #else
83+ state -> can_run_simd256 = false;
84+ #endif
85+ }
86+
16087static inline Blake2State *
16188blake2_get_state (PyObject * module )
16289{
@@ -224,10 +151,7 @@ static int
224151blake2_exec (PyObject * m )
225152{
226153 Blake2State * st = blake2_get_state (m );
227-
228- // This is called at module initialization-time, and so appears to be as
229- // good a place as any to probe the CPU flags.
230- detect_cpu_features (& st -> flags );
154+ blake2_init_cpu_features (st );
231155
232156 st -> blake2b_type = (PyTypeObject * )PyType_FromModuleAndSpec (
233157 m , & blake2b_type_spec , NULL );
@@ -332,14 +256,14 @@ static inline blake2_impl type_to_impl(PyTypeObject *type) {
332256#endif
333257 if (!strcmp (type -> tp_name , blake2b_type_spec .name )) {
334258#ifdef HACL_CAN_COMPILE_SIMD256
335- if (has_simd256 ( & st -> flags ) )
259+ if (st -> can_run_simd256 )
336260 return Blake2b_256 ;
337261 else
338262#endif
339263 return Blake2b ;
340264 } else if (!strcmp (type -> tp_name , blake2s_type_spec .name )) {
341265#ifdef HACL_CAN_COMPILE_SIMD128
342- if (has_simd128 ( & st -> flags ) )
266+ if (st -> can_run_simd128 )
343267 return Blake2s_128 ;
344268 else
345269#endif
0 commit comments