use AVX2 when available, cleanup simd

asrael 2025-11-11 08:23:05 -06:00
parent 2abc6c9486
commit 670cd3c97e
2 changed files with 40 additions and 13 deletions


@@ -810,11 +810,11 @@ void pxl8_3d_clear_zbuffer(pxl8_gfx* gfx) {
     i32 count = gfx->zbuffer_width * gfx->zbuffer_height;
     const f32 far_z = 1e30f;
-#if defined(PXL8_SIMD_NEON)
-    float32x4_t far_vec = vdupq_n_f32(far_z);
+#if defined(PXL8_SIMD_AVX2)
+    __m256 far_vec = _mm256_set1_ps(far_z);
     i32 i = 0;
-    for (; i + 3 < count; i += 4) {
-        vst1q_f32(&gfx->zbuffer[i], far_vec);
+    for (; i + 7 < count; i += 8) {
+        _mm256_storeu_ps(&gfx->zbuffer[i], far_vec);
     }
     for (; i < count; i++) {
         gfx->zbuffer[i] = far_z;
@@ -823,7 +823,16 @@ void pxl8_3d_clear_zbuffer(pxl8_gfx* gfx) {
     __m128 far_vec = _mm_set1_ps(far_z);
     i32 i = 0;
     for (; i + 3 < count; i += 4) {
-        _mm_store_ps(&gfx->zbuffer[i], far_vec);
+        _mm_storeu_ps(&gfx->zbuffer[i], far_vec);
     }
     for (; i < count; i++) {
         gfx->zbuffer[i] = far_z;
+    }
+#elif defined(PXL8_SIMD_NEON)
+    float32x4_t far_vec = vdupq_n_f32(far_z);
+    i32 i = 0;
+    for (; i + 3 < count; i += 4) {
+        vst1q_f32(&gfx->zbuffer[i], far_vec);
+    }
+    for (; i < count; i++) {
+        gfx->zbuffer[i] = far_z;
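
The PXL8_SIMD_AVX2 / PXL8_SIMD_SSE2 / PXL8_SIMD_NEON switches used above are compile-time backend selectors. A minimal sketch of how such macros are commonly derived from compiler feature flags follows; only the PXL8_SIMD_* names come from this diff, while the detection logic and headers are assumptions about the build setup, not code from this repository.

/* Sketch only: one conventional way to pick the PXL8_SIMD_* backend at
 * compile time. The feature macros tested here (__AVX2__, __SSE2__,
 * __ARM_NEON) are assumptions, not taken from this repository. */
#if defined(__AVX2__)
    #define PXL8_SIMD_AVX2 1
    #include <immintrin.h>   /* __m256, _mm256_*, plus the SSE _mm_* used above */
#elif defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
    #define PXL8_SIMD_SSE2 1
    #include <emmintrin.h>   /* __m128, _mm_* */
#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
    #define PXL8_SIMD_NEON 1
    #include <arm_neon.h>    /* float32x4_t, vdupq_n_f32, vst1q_f32 */
#endif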


@@ -192,7 +192,7 @@ typedef union {
 #if defined(PXL8_SIMD_AVX2)
     __m256 avx2;
     __m128 sse;
-#elif defined(PXL8_SIMD_SSE2) || defined(__SSE__)
+#elif defined(PXL8_SIMD_SSE2)
     __m128 sse;
 #elif defined(PXL8_SIMD_NEON)
     float32x4_t neon;
@@ -204,7 +204,7 @@ static inline pxl8_simd_vec_f32 pxl8_simd_set_f32(f32 x, f32 y, f32 z, f32 w) {
     pxl8_simd_vec_f32 result;
 #if defined(PXL8_SIMD_AVX2)
     result.avx2 = _mm256_set_ps(0, 0, 0, 0, w, z, y, x);
-#elif defined(PXL8_SIMD_SSE2) || defined(__SSE__)
+#elif defined(PXL8_SIMD_SSE2)
     result.sse = _mm_set_ps(w, z, y, x);
 #elif defined(PXL8_SIMD_NEON)
     f32 data[4] = {x, y, z, w};
@@ -222,7 +222,7 @@ static inline pxl8_simd_vec_f32 pxl8_simd_add_f32(pxl8_simd_vec_f32 a, pxl8_simd
     pxl8_simd_vec_f32 result;
 #if defined(PXL8_SIMD_AVX2)
     result.avx2 = _mm256_add_ps(a.avx2, b.avx2);
-#elif defined(PXL8_SIMD_SSE2) || defined(__SSE__)
+#elif defined(PXL8_SIMD_SSE2)
     result.sse = _mm_add_ps(a.sse, b.sse);
 #elif defined(PXL8_SIMD_NEON)
     result.neon = vaddq_f32(a.neon, b.neon);
@@ -236,7 +236,7 @@ static inline pxl8_simd_vec_f32 pxl8_simd_sub_f32(pxl8_simd_vec_f32 a, pxl8_simd
     pxl8_simd_vec_f32 result;
 #if defined(PXL8_SIMD_AVX2)
     result.avx2 = _mm256_sub_ps(a.avx2, b.avx2);
-#elif defined(PXL8_SIMD_SSE2) || defined(__SSE__)
+#elif defined(PXL8_SIMD_SSE2)
     result.sse = _mm_sub_ps(a.sse, b.sse);
 #elif defined(PXL8_SIMD_NEON)
     result.neon = vsubq_f32(a.neon, b.neon);
@@ -250,7 +250,7 @@ static inline pxl8_simd_vec_f32 pxl8_simd_mul_f32(pxl8_simd_vec_f32 a, pxl8_simd
     pxl8_simd_vec_f32 result;
 #if defined(PXL8_SIMD_AVX2)
     result.avx2 = _mm256_mul_ps(a.avx2, b.avx2);
-#elif defined(PXL8_SIMD_SSE2) || defined(__SSE__)
+#elif defined(PXL8_SIMD_SSE2)
     result.sse = _mm_mul_ps(a.sse, b.sse);
 #elif defined(PXL8_SIMD_NEON)
     result.neon = vmulq_f32(a.neon, b.neon);
@@ -264,7 +264,7 @@ static inline pxl8_simd_vec_f32 pxl8_simd_scale_f32(pxl8_simd_vec_f32 v, f32 s)
     pxl8_simd_vec_f32 result;
 #if defined(PXL8_SIMD_AVX2)
     result.avx2 = _mm256_mul_ps(v.avx2, _mm256_set1_ps(s));
-#elif defined(PXL8_SIMD_SSE2) || defined(__SSE__)
+#elif defined(PXL8_SIMD_SSE2)
     result.sse = _mm_mul_ps(v.sse, _mm_set1_ps(s));
 #elif defined(PXL8_SIMD_NEON)
     result.neon = vmulq_n_f32(v.neon, s);
@@ -275,7 +275,16 @@ static inline pxl8_simd_vec_f32 pxl8_simd_scale_f32(pxl8_simd_vec_f32 v, f32 s)
 }
 
 static inline f32 pxl8_simd_dot3_f32(pxl8_simd_vec_f32 a, pxl8_simd_vec_f32 b) {
-#if defined(PXL8_SIMD_SSE2) || defined(__SSE__)
+#if defined(PXL8_SIMD_AVX2)
+    __m128 a_low = _mm256_castps256_ps128(a.avx2);
+    __m128 b_low = _mm256_castps256_ps128(b.avx2);
+    __m128 mul = _mm_mul_ps(a_low, b_low);
+    __m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 1, 0, 3));
+    __m128 sums = _mm_add_ps(mul, shuf);
+    shuf = _mm_movehl_ps(shuf, sums);
+    sums = _mm_add_ss(sums, shuf);
+    return _mm_cvtss_f32(sums);
+#elif defined(PXL8_SIMD_SSE2)
     __m128 mul = _mm_mul_ps(a.sse, b.sse);
     __m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 1, 0, 3));
     __m128 sums = _mm_add_ps(mul, shuf);
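
The shuffle/movehl/add_ss sequence in the new AVX2 dot paths mirrors the existing SSE2 code: it reduces the four per-lane products to one scalar sum. The dot4 variant below uses _MM_SHUFFLE(2, 3, 0, 1) but arrives at the same four-lane total. An illustrative scalar trace follows; the m0..m3 names and the reference function are mine, for explanation only, not repository code.

/* Illustrative trace of the horizontal add above.
 * Lane order is x, y, z, w; m0..m3 are the per-lane products of a and b.
 *   mul  = { m0,     m1,     m2,     m3     }
 *   shuf = { m3,     m0,     m1,     m2     }   _MM_SHUFFLE(2, 1, 0, 3)
 *   sums = { m0+m3,  m1+m0,  m2+m1,  m3+m2  }   mul + shuf
 *   movehl: lane 0 of shuf becomes lane 2 of sums, i.e. m2+m1
 *   add_ss: lane 0 = (m0+m3) + (m2+m1) = m0+m1+m2+m3                    */
static inline float hsum4_ref(const float m[4]) {
    return (m[0] + m[3]) + (m[2] + m[1]);   /* same grouping as the intrinsics */
}
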
@@ -295,7 +304,16 @@ static inline f32 pxl8_simd_dot3_f32(pxl8_simd_vec_f32 a, pxl8_simd_vec_f32 b) {
 }
 
 static inline f32 pxl8_simd_dot4_f32(pxl8_simd_vec_f32 a, pxl8_simd_vec_f32 b) {
-#if defined(PXL8_SIMD_SSE2) || defined(__SSE__)
+#if defined(PXL8_SIMD_AVX2)
+    __m128 a_low = _mm256_castps256_ps128(a.avx2);
+    __m128 b_low = _mm256_castps256_ps128(b.avx2);
+    __m128 mul = _mm_mul_ps(a_low, b_low);
+    __m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
+    __m128 sums = _mm_add_ps(mul, shuf);
+    shuf = _mm_movehl_ps(shuf, sums);
+    sums = _mm_add_ss(sums, shuf);
+    return _mm_cvtss_f32(sums);
+#elif defined(PXL8_SIMD_SSE2)
     __m128 mul = _mm_mul_ps(a.sse, b.sse);
     __m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
     __m128 sums = _mm_add_ps(mul, shuf);
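
A hypothetical smoke test for the new AVX2 path through the helpers shown above; the test values, the main function, and the -mavx2 build flag are illustrative assumptions, not part of this commit.

/* Hypothetical usage sketch: exercises pxl8_simd_set_f32 and
 * pxl8_simd_dot4_f32 from the header changed above. Assumes that header is
 * included and that the build selects PXL8_SIMD_AVX2 (e.g. cc -mavx2). */
#include <stdio.h>

int main(void) {
    pxl8_simd_vec_f32 a = pxl8_simd_set_f32(1.0f, 2.0f, 3.0f, 4.0f);
    pxl8_simd_vec_f32 b = pxl8_simd_set_f32(5.0f, 6.0f, 7.0f, 8.0f);
    /* 1*5 + 2*6 + 3*7 + 4*8 = 70 */
    printf("dot4 = %f\n", (double)pxl8_simd_dot4_f32(a, b));
    return 0;
}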