diff --git a/src/pxl8_gfx.c b/src/pxl8_gfx.c index 809b149..0054e72 100644 --- a/src/pxl8_gfx.c +++ b/src/pxl8_gfx.c @@ -810,29 +810,11 @@ void pxl8_3d_clear_zbuffer(pxl8_gfx* gfx) { i32 count = gfx->zbuffer_width * gfx->zbuffer_height; const f32 far_z = 1e30f; -#if defined(PXL8_SIMD_AVX2) - __m256 far_vec = _mm256_set1_ps(far_z); +#if !defined(PXL8_SIMD_SCALAR) + /* Fill the z-buffer with far_z one vector (PXL8_SIMD_WIDTH_F32 floats) at a time; the vector loop only runs while a whole vector still fits, and the scalar loop after it writes the remaining tail elements. */ pxl8_simd_vec_f32 far_vec = pxl8_simd_set1_f32(far_z); i32 i = 0; - for (; i + 7 < count; i += 8) { - _mm256_storeu_ps(&gfx->zbuffer[i], far_vec); - } - for (; i < count; i++) { - gfx->zbuffer[i] = far_z; - } -#elif defined(PXL8_SIMD_SSE2) - __m128 far_vec = _mm_set1_ps(far_z); - i32 i = 0; - for (; i + 3 < count; i += 4) { - _mm_storeu_ps(&gfx->zbuffer[i], far_vec); - } - for (; i < count; i++) { - gfx->zbuffer[i] = far_z; - } -#elif defined(PXL8_SIMD_NEON) - float32x4_t far_vec = vdupq_n_f32(far_z); - i32 i = 0; - for (; i + 3 < count; i += 4) { - vst1q_f32(&gfx->zbuffer[i], far_vec); + for (; i + PXL8_SIMD_WIDTH_F32 <= count; i += PXL8_SIMD_WIDTH_F32) { + pxl8_simd_store_f32(&gfx->zbuffer[i], far_vec); } for (; i < count; i++) { gfx->zbuffer[i] = far_z; diff --git a/src/pxl8_simd.h b/src/pxl8_simd.h index 6d13397..e7cf0b1 100644 --- a/src/pxl8_simd.h +++ b/src/pxl8_simd.h @@ -7,20 +7,24 @@ #define PXL8_SIMD_AVX2 1 #define PXL8_SIMD_WIDTH_U8 32 #define PXL8_SIMD_WIDTH_U32 8 + #define PXL8_SIMD_WIDTH_F32 8 #elif defined(__SSE2__) #include #define PXL8_SIMD_SSE2 1 #define PXL8_SIMD_WIDTH_U8 16 #define PXL8_SIMD_WIDTH_U32 4 + #define PXL8_SIMD_WIDTH_F32 4 #elif defined(__ARM_NEON) #include #define PXL8_SIMD_NEON 1 #define PXL8_SIMD_WIDTH_U8 16 #define PXL8_SIMD_WIDTH_U32 4 + #define PXL8_SIMD_WIDTH_F32 4 #else #define PXL8_SIMD_SCALAR 1 #define PXL8_SIMD_WIDTH_U8 1 #define PXL8_SIMD_WIDTH_U32 1 + #define PXL8_SIMD_WIDTH_F32 1 #endif typedef union { @@ -218,6 +222,32 @@ static inline pxl8_simd_vec_f32 pxl8_simd_set_f32(f32 x, f32 y, f32 z, f32 w) { return result; } +/* Builds a pxl8_simd_vec_f32 holding `value` in every lane of the active backend's register (AVX2, SSE2 or NEON). NOTE(review): relies on the union exposing .avx2/.sse/.neon/.f32_array members - the union definition is not visible in this hunk, confirm. */ static inline pxl8_simd_vec_f32 
pxl8_simd_set1_f32(f32 value) { + pxl8_simd_vec_f32 result; +#if defined(PXL8_SIMD_AVX2) + result.avx2 = _mm256_set1_ps(value); +#elif defined(PXL8_SIMD_SSE2) + result.sse = _mm_set1_ps(value); +#elif defined(PXL8_SIMD_NEON) + result.neon = vdupq_n_f32(value); +#else + /* Scalar fallback: only element 0 is written; any further elements of f32_array (if the union has them) are left unset. */ result.f32_array[0] = value; +#endif + return result; +} + +/* Writes `vec` to memory starting at `dest`: one full backend register on AVX2/SSE2/NEON (matching PXL8_SIMD_WIDTH_F32 floats), or just dest[0] in the scalar fallback. The x86 paths use the unaligned-store intrinsics, so `dest` does not need vector alignment there. */ static inline void pxl8_simd_store_f32(f32* dest, pxl8_simd_vec_f32 vec) { +#if defined(PXL8_SIMD_AVX2) + _mm256_storeu_ps(dest, vec.avx2); +#elif defined(PXL8_SIMD_SSE2) + _mm_storeu_ps(dest, vec.sse); +#elif defined(PXL8_SIMD_NEON) + vst1q_f32(dest, vec.neon); +#else + dest[0] = vec.f32_array[0]; +#endif +} + static inline pxl8_simd_vec_f32 pxl8_simd_add_f32(pxl8_simd_vec_f32 a, pxl8_simd_vec_f32 b) { pxl8_simd_vec_f32 result; #if defined(PXL8_SIMD_AVX2)