use AVX2 when available, clean up SIMD

This commit is contained in:
asrael 2025-11-11 08:23:05 -06:00
parent 2abc6c9486
commit 670cd3c97e
2 changed files with 40 additions and 13 deletions

View file

@ -810,11 +810,11 @@ void pxl8_3d_clear_zbuffer(pxl8_gfx* gfx) {
i32 count = gfx->zbuffer_width * gfx->zbuffer_height;
const f32 far_z = 1e30f;
#if defined(PXL8_SIMD_NEON)
float32x4_t far_vec = vdupq_n_f32(far_z);
#if defined(PXL8_SIMD_AVX2)
__m256 far_vec = _mm256_set1_ps(far_z);
i32 i = 0;
for (; i + 3 < count; i += 4) {
vst1q_f32(&gfx->zbuffer[i], far_vec);
for (; i + 7 < count; i += 8) {
_mm256_storeu_ps(&gfx->zbuffer[i], far_vec);
}
for (; i < count; i++) {
gfx->zbuffer[i] = far_z;
@ -823,7 +823,16 @@ void pxl8_3d_clear_zbuffer(pxl8_gfx* gfx) {
__m128 far_vec = _mm_set1_ps(far_z);
i32 i = 0;
for (; i + 3 < count; i += 4) {
_mm_store_ps(&gfx->zbuffer[i], far_vec);
_mm_storeu_ps(&gfx->zbuffer[i], far_vec);
}
for (; i < count; i++) {
gfx->zbuffer[i] = far_z;
}
#elif defined(PXL8_SIMD_NEON)
float32x4_t far_vec = vdupq_n_f32(far_z);
i32 i = 0;
for (; i + 3 < count; i += 4) {
vst1q_f32(&gfx->zbuffer[i], far_vec);
}
for (; i < count; i++) {
gfx->zbuffer[i] = far_z;