use AVX2 when available, cleanup simd
parent 2abc6c9486
commit 670cd3c97e

2 changed files with 40 additions and 13 deletions
@@ -810,11 +810,11 @@ void pxl8_3d_clear_zbuffer(pxl8_gfx* gfx) {
     i32 count = gfx->zbuffer_width * gfx->zbuffer_height;
     const f32 far_z = 1e30f;

-#if defined(PXL8_SIMD_NEON)
-    float32x4_t far_vec = vdupq_n_f32(far_z);
+#if defined(PXL8_SIMD_AVX2)
+    __m256 far_vec = _mm256_set1_ps(far_z);
     i32 i = 0;
-    for (; i + 3 < count; i += 4) {
-        vst1q_f32(&gfx->zbuffer[i], far_vec);
+    for (; i + 7 < count; i += 8) {
+        _mm256_storeu_ps(&gfx->zbuffer[i], far_vec);
     }
     for (; i < count; i++) {
         gfx->zbuffer[i] = far_z;
@@ -823,7 +823,16 @@ void pxl8_3d_clear_zbuffer(pxl8_gfx* gfx) {
     __m128 far_vec = _mm_set1_ps(far_z);
     i32 i = 0;
     for (; i + 3 < count; i += 4) {
-        _mm_store_ps(&gfx->zbuffer[i], far_vec);
+        _mm_storeu_ps(&gfx->zbuffer[i], far_vec);
     }
     for (; i < count; i++) {
         gfx->zbuffer[i] = far_z;
+    }
+#elif defined(PXL8_SIMD_NEON)
+    float32x4_t far_vec = vdupq_n_f32(far_z);
+    i32 i = 0;
+    for (; i + 3 < count; i += 4) {
+        vst1q_f32(&gfx->zbuffer[i], far_vec);
+    }
+    for (; i < count; i++) {
+        gfx->zbuffer[i] = far_z;
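Note: this diff assumes the PXL8_SIMD_AVX2 / PXL8_SIMD_SSE2 / PXL8_SIMD_NEON macros are defined elsewhere in the project; the detection logic is not part of this commit. A minimal sketch of how such compile-time gating is commonly done (everything except the PXL8_* names uses standard compiler-predefined macros and is not taken from this repo):

    /* Hypothetical feature gating -- not part of this diff. */
    #if defined(__AVX2__)
        #define PXL8_SIMD_AVX2
    #elif defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
        #define PXL8_SIMD_SSE2
    #elif defined(__ARM_NEON) || defined(__ARM_NEON__)
        #define PXL8_SIMD_NEON
    #endif

The switch from _mm_store_ps to _mm_storeu_ps in the SSE2 branch also drops the 16-byte alignment requirement on gfx->zbuffer: the aligned store faults on a misaligned address, the unaligned one does not, and the new AVX2 branch likewise uses the unaligned _mm256_storeu_ps, presumably because the buffer's alignment is not guaranteed.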
@@ -192,7 +192,7 @@ typedef union {
 #if defined(PXL8_SIMD_AVX2)
     __m256 avx2;
     __m128 sse;
-#elif defined(PXL8_SIMD_SSE2) || defined(__SSE__)
+#elif defined(PXL8_SIMD_SSE2)
     __m128 sse;
 #elif defined(PXL8_SIMD_NEON)
     float32x4_t neon;
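Note: under PXL8_SIMD_AVX2 the union keeps an __m128 sse member alongside __m256 avx2, presumably so the existing 128-bit helpers can keep reading the same four floats. Since union members share storage, sse aliases the low 128 bits of avx2. A small illustration (values and surrounding code are not from the repo):

    pxl8_simd_vec_f32 v;
    v.avx2 = _mm256_set_ps(0, 0, 0, 0, 4.0f, 3.0f, 2.0f, 1.0f);  /* {1, 2, 3, 4} in the low lanes */
    __m128 low = v.sse;                                          /* same four floats, no copy */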
@@ -204,7 +204,7 @@ static inline pxl8_simd_vec_f32 pxl8_simd_set_f32(f32 x, f32 y, f32 z, f32 w) {
     pxl8_simd_vec_f32 result;
 #if defined(PXL8_SIMD_AVX2)
     result.avx2 = _mm256_set_ps(0, 0, 0, 0, w, z, y, x);
-#elif defined(PXL8_SIMD_SSE2) || defined(__SSE__)
+#elif defined(PXL8_SIMD_SSE2)
     result.sse = _mm_set_ps(w, z, y, x);
 #elif defined(PXL8_SIMD_NEON)
     f32 data[4] = {x, y, z, w};
@@ -222,7 +222,7 @@ static inline pxl8_simd_vec_f32 pxl8_simd_add_f32(pxl8_simd_vec_f32 a, pxl8_simd
     pxl8_simd_vec_f32 result;
 #if defined(PXL8_SIMD_AVX2)
     result.avx2 = _mm256_add_ps(a.avx2, b.avx2);
-#elif defined(PXL8_SIMD_SSE2) || defined(__SSE__)
+#elif defined(PXL8_SIMD_SSE2)
     result.sse = _mm_add_ps(a.sse, b.sse);
 #elif defined(PXL8_SIMD_NEON)
     result.neon = vaddq_f32(a.neon, b.neon);
@@ -236,7 +236,7 @@ static inline pxl8_simd_vec_f32 pxl8_simd_sub_f32(pxl8_simd_vec_f32 a, pxl8_simd
     pxl8_simd_vec_f32 result;
 #if defined(PXL8_SIMD_AVX2)
     result.avx2 = _mm256_sub_ps(a.avx2, b.avx2);
-#elif defined(PXL8_SIMD_SSE2) || defined(__SSE__)
+#elif defined(PXL8_SIMD_SSE2)
     result.sse = _mm_sub_ps(a.sse, b.sse);
 #elif defined(PXL8_SIMD_NEON)
     result.neon = vsubq_f32(a.neon, b.neon);
@@ -250,7 +250,7 @@ static inline pxl8_simd_vec_f32 pxl8_simd_mul_f32(pxl8_simd_vec_f32 a, pxl8_simd
     pxl8_simd_vec_f32 result;
 #if defined(PXL8_SIMD_AVX2)
     result.avx2 = _mm256_mul_ps(a.avx2, b.avx2);
-#elif defined(PXL8_SIMD_SSE2) || defined(__SSE__)
+#elif defined(PXL8_SIMD_SSE2)
     result.sse = _mm_mul_ps(a.sse, b.sse);
 #elif defined(PXL8_SIMD_NEON)
     result.neon = vmulq_f32(a.neon, b.neon);
@@ -264,7 +264,7 @@ static inline pxl8_simd_vec_f32 pxl8_simd_scale_f32(pxl8_simd_vec_f32 v, f32 s)
     pxl8_simd_vec_f32 result;
 #if defined(PXL8_SIMD_AVX2)
     result.avx2 = _mm256_mul_ps(v.avx2, _mm256_set1_ps(s));
-#elif defined(PXL8_SIMD_SSE2) || defined(__SSE__)
+#elif defined(PXL8_SIMD_SSE2)
     result.sse = _mm_mul_ps(v.sse, _mm_set1_ps(s));
 #elif defined(PXL8_SIMD_NEON)
     result.neon = vmulq_n_f32(v.neon, s);
@@ -275,7 +275,16 @@ static inline pxl8_simd_vec_f32 pxl8_simd_scale_f32(pxl8_simd_vec_f32 v, f32 s)
 }

 static inline f32 pxl8_simd_dot3_f32(pxl8_simd_vec_f32 a, pxl8_simd_vec_f32 b) {
-#if defined(PXL8_SIMD_SSE2) || defined(__SSE__)
+#if defined(PXL8_SIMD_AVX2)
+    __m128 a_low = _mm256_castps256_ps128(a.avx2);
+    __m128 b_low = _mm256_castps256_ps128(b.avx2);
+    __m128 mul = _mm_mul_ps(a_low, b_low);
+    __m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 1, 0, 3));
+    __m128 sums = _mm_add_ps(mul, shuf);
+    shuf = _mm_movehl_ps(shuf, sums);
+    sums = _mm_add_ss(sums, shuf);
+    return _mm_cvtss_f32(sums);
+#elif defined(PXL8_SIMD_SSE2)
     __m128 mul = _mm_mul_ps(a.sse, b.sse);
     __m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 1, 0, 3));
     __m128 sums = _mm_add_ps(mul, shuf);
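Note: in the new AVX2 dot branches the 4-component vector lives in the low 128 bits (see the set_ps with zeroed high lanes above), so the reduction is done on the low half via _mm256_castps256_ps128. Lane-by-lane, the horizontal add used here works out as follows, writing mul = {m0, m1, m2, m3}:

    /* shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 1, 0, 3));  -> {m3, m0, m1, m2}            */
    /* sums = _mm_add_ps(mul, shuf);                              -> {m0+m3, m1+m0, m2+m1, m3+m2} */
    /* shuf = _mm_movehl_ps(shuf, sums);                          -> {m2+m1, m3+m2, m1, m2}       */
    /* sums = _mm_add_ss(sums, shuf);                             -> lane 0 = m0+m1+m2+m3         */
    /* _mm_cvtss_f32(sums) returns that lane-0 sum.                                               */

All four lane products are summed, so for dot3 this presumably relies on the w lane contributing zero (as produced by pxl8_simd_set_f32 when called with w = 0).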
@@ -295,7 +304,16 @@ static inline f32 pxl8_simd_dot3_f32(pxl8_simd_vec_f32 a, pxl8_simd_vec_f32 b) {
 }

 static inline f32 pxl8_simd_dot4_f32(pxl8_simd_vec_f32 a, pxl8_simd_vec_f32 b) {
-#if defined(PXL8_SIMD_SSE2) || defined(__SSE__)
+#if defined(PXL8_SIMD_AVX2)
+    __m128 a_low = _mm256_castps256_ps128(a.avx2);
+    __m128 b_low = _mm256_castps256_ps128(b.avx2);
+    __m128 mul = _mm_mul_ps(a_low, b_low);
+    __m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
+    __m128 sums = _mm_add_ps(mul, shuf);
+    shuf = _mm_movehl_ps(shuf, sums);
+    sums = _mm_add_ss(sums, shuf);
+    return _mm_cvtss_f32(sums);
+#elif defined(PXL8_SIMD_SSE2)
     __m128 mul = _mm_mul_ps(a.sse, b.sse);
     __m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
     __m128 sums = _mm_add_ps(mul, shuf);
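pxl8_simd_dot4_f32 uses the same reduction with _MM_SHUFFLE(2, 3, 0, 1), which pairs lanes 0+1 and 2+3 before the final add; the end result is again the sum of all four lane products. For reference, a hypothetical usage of the helpers touched by this commit (only the function and type names come from the diff; the values are illustrative):

    pxl8_simd_vec_f32 a = pxl8_simd_set_f32(1.0f, 2.0f, 3.0f, 0.0f);
    pxl8_simd_vec_f32 b = pxl8_simd_set_f32(4.0f, 5.0f, 6.0f, 0.0f);
    pxl8_simd_vec_f32 sum = pxl8_simd_add_f32(a, b);   /* {5, 7, 9, 0}            */
    f32 d3 = pxl8_simd_dot3_f32(a, b);                 /* 1*4 + 2*5 + 3*6 = 32    */
    f32 d4 = pxl8_simd_dot4_f32(a, b);                 /* also 32, since w = 0    */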