From 670cd3c97ebe670b8979eaa4bb9d7e995de1a8db Mon Sep 17 00:00:00 2001
From: asrael
Date: Tue, 11 Nov 2025 08:23:05 -0600
Subject: [PATCH] use AVX2 when available, cleanup simd

---
 src/pxl8_gfx.c  | 19 ++++++++++++++-----
 src/pxl8_simd.h | 34 ++++++++++++++++++++++++++--------
 2 files changed, 40 insertions(+), 13 deletions(-)

diff --git a/src/pxl8_gfx.c b/src/pxl8_gfx.c
index a8a01ab..809b149 100644
--- a/src/pxl8_gfx.c
+++ b/src/pxl8_gfx.c
@@ -810,11 +810,11 @@ void pxl8_3d_clear_zbuffer(pxl8_gfx* gfx) {
   i32 count = gfx->zbuffer_width * gfx->zbuffer_height;
   const f32 far_z = 1e30f;
 
-#if defined(PXL8_SIMD_NEON)
-  float32x4_t far_vec = vdupq_n_f32(far_z);
+#if defined(PXL8_SIMD_AVX2)
+  __m256 far_vec = _mm256_set1_ps(far_z);
   i32 i = 0;
-  for (; i + 3 < count; i += 4) {
-    vst1q_f32(&gfx->zbuffer[i], far_vec);
+  for (; i + 7 < count; i += 8) {
+    _mm256_storeu_ps(&gfx->zbuffer[i], far_vec);
   }
   for (; i < count; i++) {
     gfx->zbuffer[i] = far_z;
@@ -823,7 +823,16 @@ void pxl8_3d_clear_zbuffer(pxl8_gfx* gfx) {
   __m128 far_vec = _mm_set1_ps(far_z);
   i32 i = 0;
   for (; i + 3 < count; i += 4) {
-    _mm_store_ps(&gfx->zbuffer[i], far_vec);
+    _mm_storeu_ps(&gfx->zbuffer[i], far_vec);
+  }
+  for (; i < count; i++) {
+    gfx->zbuffer[i] = far_z;
+  }
+#elif defined(PXL8_SIMD_NEON)
+  float32x4_t far_vec = vdupq_n_f32(far_z);
+  i32 i = 0;
+  for (; i + 3 < count; i += 4) {
+    vst1q_f32(&gfx->zbuffer[i], far_vec);
   }
   for (; i < count; i++) {
     gfx->zbuffer[i] = far_z;
diff --git a/src/pxl8_simd.h b/src/pxl8_simd.h
index 35eaa73..6d13397 100644
--- a/src/pxl8_simd.h
+++ b/src/pxl8_simd.h
@@ -192,7 +192,7 @@ typedef union {
 #if defined(PXL8_SIMD_AVX2)
   __m256 avx2;
   __m128 sse;
-#elif defined(PXL8_SIMD_SSE2) || defined(__SSE__)
+#elif defined(PXL8_SIMD_SSE2)
   __m128 sse;
 #elif defined(PXL8_SIMD_NEON)
   float32x4_t neon;
@@ -204,7 +204,7 @@ static inline pxl8_simd_vec_f32 pxl8_simd_set_f32(f32 x, f32 y, f32 z, f32 w) {
   pxl8_simd_vec_f32 result;
 #if defined(PXL8_SIMD_AVX2)
   result.avx2 = _mm256_set_ps(0, 0, 0, 0, w, z, y, x);
-#elif defined(PXL8_SIMD_SSE2) || defined(__SSE__)
+#elif defined(PXL8_SIMD_SSE2)
   result.sse = _mm_set_ps(w, z, y, x);
 #elif defined(PXL8_SIMD_NEON)
   f32 data[4] = {x, y, z, w};
@@ -222,7 +222,7 @@ static inline pxl8_simd_vec_f32 pxl8_simd_add_f32(pxl8_simd_vec_f32 a, pxl8_simd
   pxl8_simd_vec_f32 result;
 #if defined(PXL8_SIMD_AVX2)
   result.avx2 = _mm256_add_ps(a.avx2, b.avx2);
-#elif defined(PXL8_SIMD_SSE2) || defined(__SSE__)
+#elif defined(PXL8_SIMD_SSE2)
   result.sse = _mm_add_ps(a.sse, b.sse);
 #elif defined(PXL8_SIMD_NEON)
   result.neon = vaddq_f32(a.neon, b.neon);
@@ -236,7 +236,7 @@ static inline pxl8_simd_vec_f32 pxl8_simd_sub_f32(pxl8_simd_vec_f32 a, pxl8_simd
   pxl8_simd_vec_f32 result;
 #if defined(PXL8_SIMD_AVX2)
   result.avx2 = _mm256_sub_ps(a.avx2, b.avx2);
-#elif defined(PXL8_SIMD_SSE2) || defined(__SSE__)
+#elif defined(PXL8_SIMD_SSE2)
   result.sse = _mm_sub_ps(a.sse, b.sse);
 #elif defined(PXL8_SIMD_NEON)
   result.neon = vsubq_f32(a.neon, b.neon);
@@ -250,7 +250,7 @@ static inline pxl8_simd_vec_f32 pxl8_simd_mul_f32(pxl8_simd_vec_f32 a, pxl8_simd
   pxl8_simd_vec_f32 result;
 #if defined(PXL8_SIMD_AVX2)
   result.avx2 = _mm256_mul_ps(a.avx2, b.avx2);
-#elif defined(PXL8_SIMD_SSE2) || defined(__SSE__)
+#elif defined(PXL8_SIMD_SSE2)
   result.sse = _mm_mul_ps(a.sse, b.sse);
 #elif defined(PXL8_SIMD_NEON)
   result.neon = vmulq_f32(a.neon, b.neon);
@@ -264,7 +264,7 @@ static inline pxl8_simd_vec_f32 pxl8_simd_scale_f32(pxl8_simd_vec_f32 v, f32 s)
   pxl8_simd_vec_f32 result;
 #if defined(PXL8_SIMD_AVX2)
   result.avx2 = _mm256_mul_ps(v.avx2, _mm256_set1_ps(s));
-#elif defined(PXL8_SIMD_SSE2) || defined(__SSE__)
+#elif defined(PXL8_SIMD_SSE2)
   result.sse = _mm_mul_ps(v.sse, _mm_set1_ps(s));
 #elif defined(PXL8_SIMD_NEON)
   result.neon = vmulq_n_f32(v.neon, s);
@@ -275,7 +275,16 @@ static inline pxl8_simd_vec_f32 pxl8_simd_scale_f32(pxl8_simd_vec_f32 v, f32 s)
 }
 
 static inline f32 pxl8_simd_dot3_f32(pxl8_simd_vec_f32 a, pxl8_simd_vec_f32 b) {
-#if defined(PXL8_SIMD_SSE2) || defined(__SSE__)
+#if defined(PXL8_SIMD_AVX2)
+  __m128 a_low = _mm256_castps256_ps128(a.avx2);
+  __m128 b_low = _mm256_castps256_ps128(b.avx2);
+  __m128 mul = _mm_mul_ps(a_low, b_low);
+  __m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 1, 0, 3));
+  __m128 sums = _mm_add_ps(mul, shuf);
+  shuf = _mm_movehl_ps(shuf, sums);
+  sums = _mm_add_ss(sums, shuf);
+  return _mm_cvtss_f32(sums);
+#elif defined(PXL8_SIMD_SSE2)
   __m128 mul = _mm_mul_ps(a.sse, b.sse);
   __m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 1, 0, 3));
   __m128 sums = _mm_add_ps(mul, shuf);
@@ -295,7 +304,16 @@ static inline f32 pxl8_simd_dot3_f32(pxl8_simd_vec_f32 a, pxl8_simd_vec_f32 b) {
 }
 
 static inline f32 pxl8_simd_dot4_f32(pxl8_simd_vec_f32 a, pxl8_simd_vec_f32 b) {
-#if defined(PXL8_SIMD_SSE2) || defined(__SSE__)
+#if defined(PXL8_SIMD_AVX2)
+  __m128 a_low = _mm256_castps256_ps128(a.avx2);
+  __m128 b_low = _mm256_castps256_ps128(b.avx2);
+  __m128 mul = _mm_mul_ps(a_low, b_low);
+  __m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
+  __m128 sums = _mm_add_ps(mul, shuf);
+  shuf = _mm_movehl_ps(shuf, sums);
+  sums = _mm_add_ss(sums, shuf);
+  return _mm_cvtss_f32(sums);
+#elif defined(PXL8_SIMD_SSE2)
   __m128 mul = _mm_mul_ps(a.sse, b.sse);
   __m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
   __m128 sums = _mm_add_ps(mul, shuf);
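
Notes (after the diff, not part of the commit):

The AVX2 clear in pxl8_3d_clear_zbuffer fills eight floats per store and
falls back to a scalar tail for the count % 8 leftovers. A minimal
standalone check of that tail logic, with the pxl8 types (i32/f32) swapped
for plain int/float so it compiles on its own (e.g. gcc -mavx2):

    #include <immintrin.h>
    #include <stdio.h>

    /* Standalone copy of the patched AVX2 clear loop. */
    static void clear_zbuffer(float* zbuffer, int count) {
      const float far_z = 1e30f;
      __m256 far_vec = _mm256_set1_ps(far_z);
      int i = 0;
      for (; i + 7 < count; i += 8) {    /* full 8-wide stores */
        _mm256_storeu_ps(&zbuffer[i], far_vec);
      }
      for (; i < count; i++) {           /* scalar tail */
        zbuffer[i] = far_z;
      }
    }

    int main(void) {
      float buf[13] = {0};               /* 13 = one vector store + 5 tail floats */
      clear_zbuffer(buf, 13);
      for (int i = 0; i < 13; i++) {
        if (buf[i] != 1e30f) { printf("miss at %d\n", i); return 1; }
      }
      printf("ok\n");
      return 0;
    }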
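
Dropping the "|| defined(__SSE__)" fallback makes PXL8_SIMD_SSE2 the single
x86 gate below AVX2. The diff does not show how the PXL8_SIMD_* macros get
set; a plausible detection block, assuming they come straight from
compiler-provided macros (the real one in pxl8_simd.h may differ):

    /* Hypothetical sketch only; not taken from pxl8_simd.h. */
    #if defined(__AVX2__)
      #define PXL8_SIMD_AVX2 1
      #include <immintrin.h>
    #elif defined(__SSE2__) || defined(_M_X64) || \
        (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
      #define PXL8_SIMD_SSE2 1
      #include <emmintrin.h>
    #elif defined(__ARM_NEON) || defined(__ARM_NEON__)
      #define PXL8_SIMD_NEON 1
      #include <arm_neon.h>
    #endif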
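
Both new dot paths reduce the low 128 bits with the usual
shuffle/movehl/add_ss horizontal sum; only the shuffle mask differs. Traced
lane by lane, the dot3 mask _MM_SHUFFLE(2, 1, 0, 3) still sums all four
lanes ((m0+m3) + (m1+m2)), same as dot4, so dot3 is exact only when the w
lanes multiply to zero, which holds when 3-vectors are built through
pxl8_simd_set_f32 with w = 0. A standalone trace of the dot4 reduction:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
      __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); /* lanes m0..m3 = 1,2,3,4 */
      __m128 b = _mm_set1_ps(1.0f);
      __m128 mul = _mm_mul_ps(a, b);                 /* [m0, m1, m2, m3] */
      /* _MM_SHUFFLE(2, 3, 0, 1) swaps adjacent pairs: [m1, m0, m3, m2] */
      __m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
      __m128 sums = _mm_add_ps(mul, shuf);           /* [m0+m1, _, m2+m3, _] */
      shuf = _mm_movehl_ps(shuf, sums);              /* lane 0 = m2+m3 */
      sums = _mm_add_ss(sums, shuf);                 /* lane 0 = full sum */
      printf("%f\n", _mm_cvtss_f32(sums));           /* prints 10.000000 */
      return 0;
    }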