1212#include "../../common.h"
1313#include "../intrin.h"
1414
15- #include <immintrin.h>
16-
1715CGLM_INLINE
1816void
1917glm_mat4_scale_avx (mat4 m , float s ) {
20- __m256 y0 ;
21- y0 = _mm256_set1_ps (s );
18+ __m256 y0 , y1 , y2 , y3 , y4 ;
19+
20+ y0 = glmm_load256 (m [0 ]); /* h g f e d c b a */
21+ y1 = glmm_load256 (m [2 ]); /* p o n m l k j i */
22+
23+ y2 = _mm256_broadcast_ss (& s );
24+
25+ y3 = _mm256_mul_ps (y0 , y2 );
26+ y4 = _mm256_mul_ps (y1 , y2 );
27+
28+ glmm_store256 (m [0 ], y3 );
29+ glmm_store256 (m [2 ], y4 );
30+ }
31+
32+ /* TODO: this must be tested and compared to SSE version, may be slower!!! */
33+ CGLM_INLINE
34+ void
35+ glm_mat4_transp_avx (mat4 m , mat4 dest ) {
36+ __m256 y0 , y1 , y2 , y3 ;
37+
38+ y0 = glmm_load256 (m [0 ]); /* h g f e d c b a */
39+ y1 = glmm_load256 (m [2 ]); /* p o n m l k j i */
40+
41+ y2 = _mm256_unpacklo_ps (y0 , y1 ); /* n f m e j b i a */
42+ y3 = _mm256_unpackhi_ps (y0 , y1 ); /* p h o g l d k c */
2243
23- glmm_store256 (m [0 ], _mm256_mul_ps (y0 , glmm_load256 (m [0 ])));
24- glmm_store256 (m [2 ], _mm256_mul_ps (y0 , glmm_load256 (m [2 ])));
44+ y0 = _mm256_permute2f128_ps (y2 , y3 , 0x20 ); /* l d k c j b i a */
45+ y1 = _mm256_permute2f128_ps (y2 , y3 , 0x31 ); /* p h o g n f m e */
46+
47+ y2 = _mm256_unpacklo_ps (y0 , y1 ); /* o k g c m i e a */
48+ y3 = _mm256_unpackhi_ps (y0 , y1 ); /* p l h d n j f b */
49+
50+ y0 = _mm256_permute2f128_ps (y2 , y3 , 0x20 ); /* n j f b m i e a */
51+ y1 = _mm256_permute2f128_ps (y2 , y3 , 0x31 ); /* p l h d o k g c */
52+
53+ glmm_store256 (dest [0 ], y0 );
54+ glmm_store256 (dest [2 ], y1 );
2555}
2656
2757CGLM_INLINE
2858void
2959glm_mat4_mul_avx (mat4 m1 , mat4 m2 , mat4 dest ) {
3060 /* D = R * L (Column-Major) */
3161
32- __m256 y0 , y1 , y2 , y3 , y4 , y5 , y6 , y7 , y8 , y9 ;
62+ __m256 y0 , y1 , y2 , y3 , y4 , y5 , y6 , y7 , y8 , y9 , y10 , y11 , y12 , y13 ;
63+ __m256i yi0 , yi1 , yi2 , yi3 ;
3364
3465 y0 = glmm_load256 (m2 [0 ]); /* h g f e d c b a */
3566 y1 = glmm_load256 (m2 [2 ]); /* p o n m l k j i */
@@ -41,35 +72,43 @@ glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
4172 y4 = _mm256_permute2f128_ps (y2 , y2 , 0x03 ); /* d c b a h g f e */
4273 y5 = _mm256_permute2f128_ps (y3 , y3 , 0x03 ); /* l k j i p o n m */
4374
75+ yi0 = _mm256_set_epi32 (1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 );
76+ yi1 = _mm256_set_epi32 (3 , 3 , 3 , 3 , 2 , 2 , 2 , 2 );
77+ yi2 = _mm256_set_epi32 (0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 );
78+ yi3 = _mm256_set_epi32 (2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 );
79+
4480 /* f f f f a a a a */
4581 /* h h h h c c c c */
4682 /* e e e e b b b b */
4783 /* g g g g d d d d */
48- y6 = _mm256_permutevar_ps (y0 , _mm256_set_epi32 (1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 ));
49- y7 = _mm256_permutevar_ps (y0 , _mm256_set_epi32 (3 , 3 , 3 , 3 , 2 , 2 , 2 , 2 ));
50- y8 = _mm256_permutevar_ps (y0 , _mm256_set_epi32 (0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 ));
51- y9 = _mm256_permutevar_ps (y0 , _mm256_set_epi32 (2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 ));
52-
53- glmm_store256 (dest [0 ],
54- _mm256_add_ps (_mm256_add_ps (_mm256_mul_ps (y2 , y6 ),
55- _mm256_mul_ps (y3 , y7 )),
56- _mm256_add_ps (_mm256_mul_ps (y4 , y8 ),
57- _mm256_mul_ps (y5 , y9 ))));
84+ y6 = _mm256_permutevar_ps (y0 , yi0 );
85+ y7 = _mm256_permutevar_ps (y0 , yi1 );
86+ y8 = _mm256_permutevar_ps (y0 , yi2 );
87+ y9 = _mm256_permutevar_ps (y0 , yi3 );
5888
5989 /* n n n n i i i i */
6090 /* p p p p k k k k */
6191 /* m m m m j j j j */
6292 /* o o o o l l l l */
63- y6 = _mm256_permutevar_ps (y1 , _mm256_set_epi32 (1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 ));
64- y7 = _mm256_permutevar_ps (y1 , _mm256_set_epi32 (3 , 3 , 3 , 3 , 2 , 2 , 2 , 2 ));
65- y8 = _mm256_permutevar_ps (y1 , _mm256_set_epi32 (0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 ));
66- y9 = _mm256_permutevar_ps (y1 , _mm256_set_epi32 (2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 ));
67-
68- glmm_store256 (dest [2 ],
69- _mm256_add_ps (_mm256_add_ps (_mm256_mul_ps (y2 , y6 ),
70- _mm256_mul_ps (y3 , y7 )),
71- _mm256_add_ps (_mm256_mul_ps (y4 , y8 ),
72- _mm256_mul_ps (y5 , y9 ))));
93+ y10 = _mm256_permutevar_ps (y1 , yi0 );
94+ y11 = _mm256_permutevar_ps (y1 , yi1 );
95+ y12 = _mm256_permutevar_ps (y1 , yi2 );
96+ y13 = _mm256_permutevar_ps (y1 , yi3 );
97+
98+ y0 = _mm256_mul_ps (y2 , y6 );
99+ y1 = _mm256_mul_ps (y2 , y10 );
100+
101+ y0 = glmm256_fmadd (y3 , y7 , y0 );
102+ y1 = glmm256_fmadd (y3 , y11 , y1 );
103+
104+ y0 = glmm256_fmadd (y4 , y8 , y0 );
105+ y1 = glmm256_fmadd (y4 , y12 , y1 );
106+
107+ y0 = glmm256_fmadd (y5 , y9 , y0 );
108+ y1 = glmm256_fmadd (y5 , y13 , y1 );
109+
110+ glmm_store256 (dest [0 ], y0 );
111+ glmm_store256 (dest [2 ], y1 );
73112}
74113
75114#endif
0 commit comments