Skip to content

Commit fb4eac2

Browse files
authored
Merge pull request #413 from recp/optimize-inv
WIP: More Optimizations and SIMD fixes for MSVC & ARM
2 parents 3bfd31a + 4d0a0a7 commit fb4eac2

File tree

14 files changed

+498
-126
lines changed

14 files changed

+498
-126
lines changed

docs/source/opt.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ SSE and SSE2 Shuffle Option
7676
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
7777
**_mm_shuffle_ps** generates **shufps** instruction even if registers are same.
7878
You can force it to generate **pshufd** instruction by defining
79-
**CGLM_USE_INT_DOMAIN** macro. As default it is not defined.
79+
**CGLM_NO_INT_DOMAIN** macro. As default it is not defined.
8080

8181
SSE3 and SSE4 Dot Product Options
8282
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

include/cglm/mat2.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ glm_mat2_scale(mat2 m, float s) {
235235
glmm_store(m[0], wasm_f32x4_mul(wasm_v128_load(m[0]),
236236
wasm_f32x4_splat(s)));
237237
#elif defined( __SSE__ ) || defined( __SSE2__ )
238-
glmm_store(m[0], _mm_mul_ps(_mm_loadu_ps(m[0]), _mm_set1_ps(s)));
238+
glmm_store(m[0], _mm_mul_ps(_mm_loadu_ps(m[0]), glmm_set1(s)));
239239
#elif defined(CGLM_NEON_FP)
240240
vst1q_f32(m[0], vmulq_f32(vld1q_f32(m[0]), vdupq_n_f32(s)));
241241
#else

include/cglm/mat3.h

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,7 @@ glm_mat3_det(mat3 mat) {
334334
d = mat[1][0], e = mat[1][1], f = mat[1][2],
335335
g = mat[2][0], h = mat[2][1], i = mat[2][2];
336336

337-
return a * (e * i - h * f) - d * (b * i - c * h) + g * (b * f - c * e);
337+
return a * (e * i - h * f) - d * (b * i - h * c) + g * (b * f - e * c);
338338
}
339339

340340
/*!
@@ -346,24 +346,22 @@ glm_mat3_det(mat3 mat) {
346346
CGLM_INLINE
347347
void
348348
glm_mat3_inv(mat3 mat, mat3 dest) {
349-
float det;
350349
float a = mat[0][0], b = mat[0][1], c = mat[0][2],
351350
d = mat[1][0], e = mat[1][1], f = mat[1][2],
352-
g = mat[2][0], h = mat[2][1], i = mat[2][2];
353-
354-
dest[0][0] = e * i - f * h;
355-
dest[0][1] = -(b * i - h * c);
356-
dest[0][2] = b * f - e * c;
357-
dest[1][0] = -(d * i - g * f);
358-
dest[1][1] = a * i - c * g;
359-
dest[1][2] = -(a * f - d * c);
360-
dest[2][0] = d * h - g * e;
361-
dest[2][1] = -(a * h - g * b);
362-
dest[2][2] = a * e - b * d;
363-
364-
det = 1.0f / (a * dest[0][0] + b * dest[1][0] + c * dest[2][0]);
365-
366-
glm_mat3_scale(dest, det);
351+
g = mat[2][0], h = mat[2][1], i = mat[2][2],
352+
353+
c1 = e * i - f * h, c2 = d * i - g * f, c3 = d * h - g * e,
354+
idt = 1.0f / (a * c1 - b * c2 + c * c3), ndt = -idt;
355+
356+
dest[0][0] = idt * c1;
357+
dest[0][1] = ndt * (b * i - h * c);
358+
dest[0][2] = idt * (b * f - e * c);
359+
dest[1][0] = ndt * c2;
360+
dest[1][1] = idt * (a * i - g * c);
361+
dest[1][2] = ndt * (a * f - d * c);
362+
dest[2][0] = idt * c3;
363+
dest[2][1] = ndt * (a * h - g * b);
364+
dest[2][2] = idt * (a * e - d * b);
367365
}
368366

369367
/*!

include/cglm/mat4.h

Lines changed: 32 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -520,6 +520,8 @@ void
520520
glm_mat4_transpose_to(mat4 m, mat4 dest) {
521521
#if defined(__wasm__) && defined(__wasm_simd128__)
522522
glm_mat4_transp_wasm(m, dest);
523+
#elif defined(__AVX__)
524+
glm_mat4_transp_avx(m, dest);
523525
#elif defined( __SSE__ ) || defined( __SSE2__ )
524526
glm_mat4_transp_sse2(m, dest);
525527
#elif defined(CGLM_NEON_FP)
@@ -546,6 +548,8 @@ void
546548
glm_mat4_transpose(mat4 m) {
547549
#if defined(__wasm__) && defined(__wasm_simd128__)
548550
glm_mat4_transp_wasm(m, m);
551+
#elif defined(__AVX__)
552+
glm_mat4_transp_avx(m, m);
549553
#elif defined( __SSE__ ) || defined( __SSE2__ )
550554
glm_mat4_transp_sse2(m, m);
551555
#elif defined(CGLM_NEON_FP)
@@ -652,46 +656,37 @@ glm_mat4_inv(mat4 mat, mat4 dest) {
652656
#elif defined(CGLM_NEON_FP)
653657
glm_mat4_inv_neon(mat, dest);
654658
#else
655-
float t[6];
656-
float det;
657659
float a = mat[0][0], b = mat[0][1], c = mat[0][2], d = mat[0][3],
658660
e = mat[1][0], f = mat[1][1], g = mat[1][2], h = mat[1][3],
659661
i = mat[2][0], j = mat[2][1], k = mat[2][2], l = mat[2][3],
660-
m = mat[3][0], n = mat[3][1], o = mat[3][2], p = mat[3][3];
661-
662-
t[0] = k * p - o * l; t[1] = j * p - n * l; t[2] = j * o - n * k;
663-
t[3] = i * p - m * l; t[4] = i * o - m * k; t[5] = i * n - m * j;
664-
665-
dest[0][0] = f * t[0] - g * t[1] + h * t[2];
666-
dest[1][0] =-(e * t[0] - g * t[3] + h * t[4]);
667-
dest[2][0] = e * t[1] - f * t[3] + h * t[5];
668-
dest[3][0] =-(e * t[2] - f * t[4] + g * t[5]);
669-
670-
dest[0][1] =-(b * t[0] - c * t[1] + d * t[2]);
671-
dest[1][1] = a * t[0] - c * t[3] + d * t[4];
672-
dest[2][1] =-(a * t[1] - b * t[3] + d * t[5]);
673-
dest[3][1] = a * t[2] - b * t[4] + c * t[5];
674-
675-
t[0] = g * p - o * h; t[1] = f * p - n * h; t[2] = f * o - n * g;
676-
t[3] = e * p - m * h; t[4] = e * o - m * g; t[5] = e * n - m * f;
677-
678-
dest[0][2] = b * t[0] - c * t[1] + d * t[2];
679-
dest[1][2] =-(a * t[0] - c * t[3] + d * t[4]);
680-
dest[2][2] = a * t[1] - b * t[3] + d * t[5];
681-
dest[3][2] =-(a * t[2] - b * t[4] + c * t[5]);
682-
683-
t[0] = g * l - k * h; t[1] = f * l - j * h; t[2] = f * k - j * g;
684-
t[3] = e * l - i * h; t[4] = e * k - i * g; t[5] = e * j - i * f;
685-
686-
dest[0][3] =-(b * t[0] - c * t[1] + d * t[2]);
687-
dest[1][3] = a * t[0] - c * t[3] + d * t[4];
688-
dest[2][3] =-(a * t[1] - b * t[3] + d * t[5]);
689-
dest[3][3] = a * t[2] - b * t[4] + c * t[5];
690-
691-
det = 1.0f / (a * dest[0][0] + b * dest[1][0]
692-
+ c * dest[2][0] + d * dest[3][0]);
693-
694-
glm_mat4_scale_p(dest, det);
662+
m = mat[3][0], n = mat[3][1], o = mat[3][2], p = mat[3][3],
663+
664+
c1 = k * p - l * o, c2 = c * h - d * g, c3 = i * p - l * m,
665+
c4 = a * h - d * e, c5 = j * p - l * n, c6 = b * h - d * f,
666+
c7 = i * n - j * m, c8 = a * f - b * e, c9 = j * o - k * n,
667+
c10 = b * g - c * f, c11 = i * o - k * m, c12 = a * g - c * e,
668+
669+
idt = 1.0f/(c8*c1+c4*c9+c10*c3+c2*c7-c12*c5-c6*c11), ndt = -idt;
670+
671+
dest[0][0] = (f * c1 - g * c5 + h * c9) * idt;
672+
dest[0][1] = (b * c1 - c * c5 + d * c9) * ndt;
673+
dest[0][2] = (n * c2 - o * c6 + p * c10) * idt;
674+
dest[0][3] = (j * c2 - k * c6 + l * c10) * ndt;
675+
676+
dest[1][0] = (e * c1 - g * c3 + h * c11) * ndt;
677+
dest[1][1] = (a * c1 - c * c3 + d * c11) * idt;
678+
dest[1][2] = (m * c2 - o * c4 + p * c12) * ndt;
679+
dest[1][3] = (i * c2 - k * c4 + l * c12) * idt;
680+
681+
dest[2][0] = (e * c5 - f * c3 + h * c7) * idt;
682+
dest[2][1] = (a * c5 - b * c3 + d * c7) * ndt;
683+
dest[2][2] = (m * c6 - n * c4 + p * c8) * idt;
684+
dest[2][3] = (i * c6 - j * c4 + l * c8) * ndt;
685+
686+
dest[3][0] = (e * c9 - f * c11 + g * c7) * ndt;
687+
dest[3][1] = (a * c9 - b * c11 + c * c7) * idt;
688+
dest[3][2] = (m * c10 - n * c12 + o * c8) * ndt;
689+
dest[3][3] = (i * c10 - j * c12 + k * c8) * idt;
695690
#endif
696691
}
697692

include/cglm/simd/arm.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,21 @@ static inline float32x4_t glmm_max(float32x4_t a, float32x4_t b) { return vmaxq_
6363
static inline
6464
float32x4_t
6565
glmm_vhadd(float32x4_t v) {
66+
#if CGLM_ARM64
67+
float32x4_t p;
68+
p = vpaddq_f32(v, v); /* [a+b, c+d, a+b, c+d] */
69+
return vpaddq_f32(p, p); /* [t, t, t, t] */;
70+
#else
71+
return vaddq_f32(vaddq_f32(glmm_splat_x(v), glmm_splat_y(v)),
72+
vaddq_f32(glmm_splat_z(v), glmm_splat_w(v)));
73+
#endif
74+
/* TODO: measure speed of this compare to above */
75+
/* return vdupq_n_f32(vaddvq_f32(v)); */
76+
77+
/*
6678
return vaddq_f32(vaddq_f32(glmm_splat_x(v), glmm_splat_y(v)),
6779
vaddq_f32(glmm_splat_z(v), glmm_splat_w(v)));
80+
*/
6881
/*
6982
this seems slower:
7083
v = vaddq_f32(v, vrev64q_f32(v));
@@ -108,6 +121,12 @@ glmm_dot(float32x4_t a, float32x4_t b) {
108121
return glmm_hadd(vmulq_f32(a, b));
109122
}
110123

124+
static inline
125+
float32x4_t
126+
glmm_vdot(float32x4_t a, float32x4_t b) {
127+
return glmm_vhadd(vmulq_f32(a, b));
128+
}
129+
111130
static inline
112131
float
113132
glmm_norm(float32x4_t a) {

include/cglm/simd/avx/mat4.h

Lines changed: 66 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -12,24 +12,55 @@
1212
#include "../../common.h"
1313
#include "../intrin.h"
1414

15-
#include <immintrin.h>
16-
1715
CGLM_INLINE
1816
void
1917
glm_mat4_scale_avx(mat4 m, float s) {
20-
__m256 y0;
21-
y0 = _mm256_set1_ps(s);
18+
__m256 y0, y1, y2, y3, y4;
19+
20+
y0 = glmm_load256(m[0]); /* h g f e d c b a */
21+
y1 = glmm_load256(m[2]); /* p o n m l k j i */
22+
23+
y2 = _mm256_broadcast_ss(&s);
24+
25+
y3 = _mm256_mul_ps(y0, y2);
26+
y4 = _mm256_mul_ps(y1, y2);
27+
28+
glmm_store256(m[0], y3);
29+
glmm_store256(m[2], y4);
30+
}
31+
32+
/* TODO: this must be tested and compared to SSE version, may be slower!!! */
33+
CGLM_INLINE
34+
void
35+
glm_mat4_transp_avx(mat4 m, mat4 dest) {
36+
__m256 y0, y1, y2, y3;
37+
38+
y0 = glmm_load256(m[0]); /* h g f e d c b a */
39+
y1 = glmm_load256(m[2]); /* p o n m l k j i */
40+
41+
y2 = _mm256_unpacklo_ps(y0, y1); /* n f m e j b i a */
42+
y3 = _mm256_unpackhi_ps(y0, y1); /* p h o g l d k c */
2243

23-
glmm_store256(m[0], _mm256_mul_ps(y0, glmm_load256(m[0])));
24-
glmm_store256(m[2], _mm256_mul_ps(y0, glmm_load256(m[2])));
44+
y0 = _mm256_permute2f128_ps(y2, y3, 0x20); /* l d k c j b i a */
45+
y1 = _mm256_permute2f128_ps(y2, y3, 0x31); /* p h o g n f m e */
46+
47+
y2 = _mm256_unpacklo_ps(y0, y1); /* o k g c m i e a */
48+
y3 = _mm256_unpackhi_ps(y0, y1); /* p l h d n j f b */
49+
50+
y0 = _mm256_permute2f128_ps(y2, y3, 0x20); /* n j f b m i e a */
51+
y1 = _mm256_permute2f128_ps(y2, y3, 0x31); /* p l h d o k g c */
52+
53+
glmm_store256(dest[0], y0);
54+
glmm_store256(dest[2], y1);
2555
}
2656

2757
CGLM_INLINE
2858
void
2959
glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
3060
/* D = R * L (Column-Major) */
3161

32-
__m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;
62+
__m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13;
63+
__m256i yi0, yi1, yi2, yi3;
3364

3465
y0 = glmm_load256(m2[0]); /* h g f e d c b a */
3566
y1 = glmm_load256(m2[2]); /* p o n m l k j i */
@@ -41,35 +72,43 @@ glm_mat4_mul_avx(mat4 m1, mat4 m2, mat4 dest) {
4172
y4 = _mm256_permute2f128_ps(y2, y2, 0x03); /* d c b a h g f e */
4273
y5 = _mm256_permute2f128_ps(y3, y3, 0x03); /* l k j i p o n m */
4374

75+
yi0 = _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0);
76+
yi1 = _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2);
77+
yi2 = _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1);
78+
yi3 = _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3);
79+
4480
/* f f f f a a a a */
4581
/* h h h h c c c c */
4682
/* e e e e b b b b */
4783
/* g g g g d d d d */
48-
y6 = _mm256_permutevar_ps(y0, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0));
49-
y7 = _mm256_permutevar_ps(y0, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2));
50-
y8 = _mm256_permutevar_ps(y0, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
51-
y9 = _mm256_permutevar_ps(y0, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3));
52-
53-
glmm_store256(dest[0],
54-
_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
55-
_mm256_mul_ps(y3, y7)),
56-
_mm256_add_ps(_mm256_mul_ps(y4, y8),
57-
_mm256_mul_ps(y5, y9))));
84+
y6 = _mm256_permutevar_ps(y0, yi0);
85+
y7 = _mm256_permutevar_ps(y0, yi1);
86+
y8 = _mm256_permutevar_ps(y0, yi2);
87+
y9 = _mm256_permutevar_ps(y0, yi3);
5888

5989
/* n n n n i i i i */
6090
/* p p p p k k k k */
6191
/* m m m m j j j j */
6292
/* o o o o l l l l */
63-
y6 = _mm256_permutevar_ps(y1, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0));
64-
y7 = _mm256_permutevar_ps(y1, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2));
65-
y8 = _mm256_permutevar_ps(y1, _mm256_set_epi32(0, 0, 0, 0, 1, 1, 1, 1));
66-
y9 = _mm256_permutevar_ps(y1, _mm256_set_epi32(2, 2, 2, 2, 3, 3, 3, 3));
67-
68-
glmm_store256(dest[2],
69-
_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(y2, y6),
70-
_mm256_mul_ps(y3, y7)),
71-
_mm256_add_ps(_mm256_mul_ps(y4, y8),
72-
_mm256_mul_ps(y5, y9))));
93+
y10 = _mm256_permutevar_ps(y1, yi0);
94+
y11 = _mm256_permutevar_ps(y1, yi1);
95+
y12 = _mm256_permutevar_ps(y1, yi2);
96+
y13 = _mm256_permutevar_ps(y1, yi3);
97+
98+
y0 = _mm256_mul_ps(y2, y6);
99+
y1 = _mm256_mul_ps(y2, y10);
100+
101+
y0 = glmm256_fmadd(y3, y7, y0);
102+
y1 = glmm256_fmadd(y3, y11, y1);
103+
104+
y0 = glmm256_fmadd(y4, y8, y0);
105+
y1 = glmm256_fmadd(y4, y12, y1);
106+
107+
y0 = glmm256_fmadd(y5, y9, y0);
108+
y1 = glmm256_fmadd(y5, y13, y1);
109+
110+
glmm_store256(dest[0], y0);
111+
glmm_store256(dest[2], y1);
73112
}
74113

75114
#endif

include/cglm/simd/intrin.h

Lines changed: 28 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
#ifndef cglm_intrin_h
99
#define cglm_intrin_h
1010

11-
#if defined( _MSC_VER )
11+
#if defined(_MSC_VER) && !defined(_M_ARM64EC)
1212
# if (defined(_M_AMD64) || defined(_M_X64)) || _M_IX86_FP == 2
1313
# ifndef __SSE__
1414
# define __SSE__
@@ -20,13 +20,37 @@
2020
# ifndef __SSE__
2121
# define __SSE__
2222
# endif
23-
#endif
23+
# endif
2424
/* do not use alignment for older visual studio versions */
25-
# if _MSC_VER < 1913 /* Visual Studio 2017 version 15.6 */
25+
/* ARM32 also causes a similar error; disable it for now on ARM32 too */
26+
# if _MSC_VER < 1913 || _M_ARM /* Visual Studio 2017 version 15.6 */
2627
# define CGLM_ALL_UNALIGNED
2728
# endif
2829
#endif
2930

31+
#ifdef __AVX__
32+
# include <immintrin.h>
33+
# define CGLM_AVX_FP 1
34+
# ifndef __SSE2__
35+
# define __SSE2__
36+
# endif
37+
# ifndef __SSE3__
38+
# define __SSE3__
39+
# endif
40+
# ifndef __SSE4__
41+
# define __SSE4__
42+
# endif
43+
# ifndef __SSE4_1__
44+
# define __SSE4_1__
45+
# endif
46+
# ifndef __SSE4_2__
47+
# define __SSE4_2__
48+
# endif
49+
# ifndef CGLM_SIMD_x86
50+
# define CGLM_SIMD_x86
51+
# endif
52+
#endif
53+
3054
#if defined(__SSE__)
3155
# include <xmmintrin.h>
3256
# define CGLM_SSE_FP 1
@@ -64,14 +88,6 @@
6488
# endif
6589
#endif
6690

67-
#ifdef __AVX__
68-
# include <immintrin.h>
69-
# define CGLM_AVX_FP 1
70-
# ifndef CGLM_SIMD_x86
71-
# define CGLM_SIMD_x86
72-
# endif
73-
#endif
74-
7591
/* ARM Neon */
7692
#if defined(_WIN32) && defined(_MSC_VER)
7793
/* TODO: non-ARM stuff already inported, will this be better option */
@@ -100,7 +116,7 @@
100116
#else /* non-windows */
101117
# if defined(__ARM_NEON) || defined(__ARM_NEON__)
102118
# include <arm_neon.h>
103-
# if defined(__ARM_NEON_FP)
119+
# if defined(__ARM_NEON_FP) || defined(__ARM_FP)
104120
# define CGLM_NEON_FP 1
105121
# endif
106122
# ifndef CGLM_SIMD_ARM

0 commit comments

Comments
 (0)