Clean up some more f32 SIMD code

This commit is contained in:
asrael 2025-11-11 09:27:29 -06:00
parent 670cd3c97e
commit e2c7998663
2 changed files with 34 additions and 22 deletions

View file

@ -810,29 +810,11 @@ void pxl8_3d_clear_zbuffer(pxl8_gfx* gfx) {
i32 count = gfx->zbuffer_width * gfx->zbuffer_height;
const f32 far_z = 1e30f;
#if defined(PXL8_SIMD_AVX2)
__m256 far_vec = _mm256_set1_ps(far_z);
#if !defined(PXL8_SIMD_SCALAR)
pxl8_simd_vec_f32 far_vec = pxl8_simd_set1_f32(far_z);
i32 i = 0;
for (; i + 7 < count; i += 8) {
_mm256_storeu_ps(&gfx->zbuffer[i], far_vec);
}
for (; i < count; i++) {
gfx->zbuffer[i] = far_z;
}
#elif defined(PXL8_SIMD_SSE2)
__m128 far_vec = _mm_set1_ps(far_z);
i32 i = 0;
for (; i + 3 < count; i += 4) {
_mm_storeu_ps(&gfx->zbuffer[i], far_vec);
}
for (; i < count; i++) {
gfx->zbuffer[i] = far_z;
}
#elif defined(PXL8_SIMD_NEON)
float32x4_t far_vec = vdupq_n_f32(far_z);
i32 i = 0;
for (; i + 3 < count; i += 4) {
vst1q_f32(&gfx->zbuffer[i], far_vec);
for (; i + PXL8_SIMD_WIDTH_F32 <= count; i += PXL8_SIMD_WIDTH_F32) {
pxl8_simd_store_f32(&gfx->zbuffer[i], far_vec);
}
for (; i < count; i++) {
gfx->zbuffer[i] = far_z;

View file

@ -7,20 +7,24 @@
#define PXL8_SIMD_AVX2 1
#define PXL8_SIMD_WIDTH_U8 32
#define PXL8_SIMD_WIDTH_U32 8
#define PXL8_SIMD_WIDTH_F32 8
#elif defined(__SSE2__)
#include <emmintrin.h>
#define PXL8_SIMD_SSE2 1
#define PXL8_SIMD_WIDTH_U8 16
#define PXL8_SIMD_WIDTH_U32 4
#define PXL8_SIMD_WIDTH_F32 4
#elif defined(__ARM_NEON)
#include <arm_neon.h>
#define PXL8_SIMD_NEON 1
#define PXL8_SIMD_WIDTH_U8 16
#define PXL8_SIMD_WIDTH_U32 4
#define PXL8_SIMD_WIDTH_F32 4
#else
#define PXL8_SIMD_SCALAR 1
#define PXL8_SIMD_WIDTH_U8 1
#define PXL8_SIMD_WIDTH_U32 1
#define PXL8_SIMD_WIDTH_F32 1
#endif
typedef union {
@ -218,6 +222,32 @@ static inline pxl8_simd_vec_f32 pxl8_simd_set_f32(f32 x, f32 y, f32 z, f32 w) {
return result;
}
/*
 * Build an f32 vector from a single scalar.
 *
 * AVX2, SSE2 and NEON builds replicate `value` into every lane with the
 * matching broadcast intrinsic. The fallback build writes `value` to
 * f32_array[0] only; if the union holds further elements they are not
 * written here.
 */
static inline pxl8_simd_vec_f32 pxl8_simd_set1_f32(f32 value) {
    pxl8_simd_vec_f32 splat;

#ifdef PXL8_SIMD_AVX2
    splat.avx2 = _mm256_set1_ps(value);
#elif defined(PXL8_SIMD_SSE2)
    splat.sse = _mm_set1_ps(value);
#elif defined(PXL8_SIMD_NEON)
    splat.neon = vdupq_n_f32(value);
#else
    splat.f32_array[0] = value;
#endif

    return splat;
}
/*
 * Write the lanes of `vec` to memory beginning at `dest`.
 *
 * The x86 paths use the unaligned-store intrinsics and the NEON path uses
 * vst1q_f32. The number of floats written is 8 on AVX2, 4 on SSE2 and NEON,
 * and 1 in the fallback build, so `dest` must have room for that many.
 */
static inline void pxl8_simd_store_f32(f32* dest, pxl8_simd_vec_f32 vec) {
#ifdef PXL8_SIMD_AVX2
    _mm256_storeu_ps(dest, vec.avx2);
#elif defined(PXL8_SIMD_SSE2)
    _mm_storeu_ps(dest, vec.sse);
#elif defined(PXL8_SIMD_NEON)
    vst1q_f32(dest, vec.neon);
#else
    /* Fallback: a single float is stored. */
    *dest = vec.f32_array[0];
#endif
}
static inline pxl8_simd_vec_f32 pxl8_simd_add_f32(pxl8_simd_vec_f32 a, pxl8_simd_vec_f32 b) {
pxl8_simd_vec_f32 result;
#if defined(PXL8_SIMD_AVX2)