remove simd, scalar math + compiler optimizations are good enough

asrael 2025-11-11 12:26:22 -06:00
parent e2c7998663
commit 4d84122ef3
8 changed files with 41 additions and 509 deletions
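
The bet being made: at -O3 modern compilers auto-vectorize straightforward loops on their own, so the hand-rolled intrinsics below were mostly maintenance overhead. One way to check that bet (a sketch of mine, not part of the commit) is to compile a representative loop and read the vectorizer report via GCC's -fopt-info-vec or Clang's -Rpass=loop-vectorize. Note that floating-point reductions like the dot products removed here only vectorize because the build keeps -ffast-math:

#include <stddef.h>

/* Hypothetical probe, not project code. Build with:
 *   cc -O3 -ffast-math -c dot.c -fopt-info-vec          (GCC)
 *   cc -O3 -ffast-math -c dot.c -Rpass=loop-vectorize   (Clang)
 * Without -ffast-math (or -fassociative-math) the compiler must keep
 * the serial summation order and leaves this reduction scalar. */
float dot(const float* a, const float* b, size_t n) {
    float sum = 0.0f;
    for (size_t i = 0; i < n; i++) {
        sum += a[i] * b[i];
    }
    return sum;
}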

pxl8.sh

@@ -80,26 +80,6 @@ compile_source_file() {
     fi
 }
-detect_simd() {
-    local simd_flags=""
-    case "$(uname -m)" in
-        x86_64|amd64)
-            if $CC -mavx2 -c -x c /dev/null -o /dev/null 2>/dev/null; then
-                simd_flags="-mavx2 -msse2"
-            elif $CC -msse2 -c -x c /dev/null -o /dev/null 2>/dev/null; then
-                simd_flags="-msse2"
-            fi
-            ;;
-        arm64|aarch64)
-            if $CC -march=armv8-a+simd -c -x c /dev/null -o /dev/null 2>/dev/null; then
-                simd_flags="-march=armv8-a+simd"
-            fi
-            ;;
-    esac
-    echo "$simd_flags"
-}
 make_lib_dirs() {
     mkdir -p lib/linenoise lib/fennel lib/microui/src lib/miniz
@@ -294,15 +274,12 @@ for arg in "$@"; do
     esac
 done
-SIMD_FLAGS=$(detect_simd)
 if [ "$MODE" = "release" ]; then
-    CFLAGS="$CFLAGS -O3 -march=native -mtune=native -ffast-math -funroll-loops -fno-unwind-tables -fno-asynchronous-unwind-tables $SIMD_FLAGS"
+    CFLAGS="$CFLAGS -O3 -ffast-math -funroll-loops -fno-unwind-tables -fno-asynchronous-unwind-tables"
     BUILDDIR="$BUILDDIR/release"
     BINDIR="$BINDIR/release"
 else
-    CFLAGS="$CFLAGS -g -O1 -DDEBUG $SIMD_FLAGS"
+    CFLAGS="$CFLAGS -g -O1 -DDEBUG"
     BUILDDIR="$BUILDDIR/debug"
     BINDIR="$BINDIR/debug"
 fi
@@ -354,23 +331,6 @@ case "$COMMAND" in
             print_info "Compiler cache: ccache enabled"
         fi
-        if [[ -n "$SIMD_FLAGS" ]]; then
-            case "$(uname -m)" in
-                x86_64|amd64)
-                    if [[ "$SIMD_FLAGS" == *"mavx2"* ]]; then
-                        print_info "SIMD: AVX2 + SSE2 enabled"
-                    elif [[ "$SIMD_FLAGS" == *"msse2"* ]]; then
-                        print_info "SIMD: SSE2 enabled"
-                    fi
-                    ;;
-                arm64|aarch64)
-                    print_info "SIMD: ARM NEON enabled"
-                    ;;
-            esac
-        else
-            print_info "SIMD: Scalar fallback"
-        fi
         INCLUDES="-Isrc -Ilib -Ilib/microui/src -Ilib/luajit/src -Ilib/linenoise -Ilib/miniz"
         COMPILE_FLAGS="$CFLAGS $INCLUDES"
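
With detect_simd() and -march=native gone, release builds target whatever ISA the compiler assumes by default (SSE2 for x86-64, NEON for aarch64); the deleted pxl8_simd.h at the bottom of this commit dispatched on exactly the predefined macros those flags used to set. A throwaway probe (my sketch, standard compiler macros, not project code) makes the effective baseline visible for any flag set:

#include <stdio.h>

int main(void) {
    /* These macros are predefined by the compiler from -m.../-march
     * flags; with no explicit flags you see the target's default. */
#if defined(__AVX2__)
    puts("AVX2");
#elif defined(__SSE2__)
    puts("SSE2");
#elif defined(__ARM_NEON)
    puts("NEON");
#else
    puts("scalar");
#endif
    return 0;
}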


@@ -1,7 +1,6 @@
 #include "pxl8_blit.h"
-#include "pxl8_simd.h"
-void pxl8_blit_simd_hicolor(u32* fb, u32 fb_width, const u32* sprite, u32 atlas_width,
+void pxl8_blit_hicolor(u32* fb, u32 fb_width, const u32* sprite, u32 atlas_width,
                     i32 x, i32 y, u32 w, u32 h) {
     u32* dest_base = fb + y * fb_width + x;
     const u32* src_base = sprite;
@@ -10,19 +9,7 @@ void pxl8_blit_simd_hicolor(u32* fb, u32 fb_width, const u32* sprite, u32 atlas_width,
         u32* dest_row = dest_base + row * fb_width;
         const u32* src_row = src_base + row * atlas_width;
-        u32 col = 0;
-        for (; col + PXL8_SIMD_WIDTH_U32 <= w; col += PXL8_SIMD_WIDTH_U32) {
-            pxl8_simd_vec src_vec = pxl8_simd_load_u32(src_row + col);
-            pxl8_simd_vec dest_vec = pxl8_simd_load_u32(dest_row + col);
-            pxl8_simd_vec alpha_mask = pxl8_simd_alpha_mask_u32();
-            pxl8_simd_vec has_alpha = pxl8_simd_and(src_vec, alpha_mask);
-            pxl8_simd_vec zero = pxl8_simd_zero_u8();
-            pxl8_simd_vec mask = pxl8_simd_cmpeq_u32(has_alpha, zero);
-            pxl8_simd_vec result = pxl8_simd_blendv_u32(src_vec, dest_vec, mask);
-            pxl8_simd_store_u32(dest_row + col, result);
-        }
-        for (; col < w; col++) {
+        for (u32 col = 0; col < w; col++) {
             if (src_row[col] & 0xFF000000) {
                 dest_row[col] = src_row[col];
             }
@@ -30,7 +17,7 @@ void pxl8_blit_simd_hicolor(u32* fb, u32 fb_width, const u32* sprite, u32 atlas_width,
     }
 }
-void pxl8_blit_simd_indexed(u8* fb, u32 fb_width, const u8* sprite, u32 atlas_width,
+void pxl8_blit_indexed(u8* fb, u32 fb_width, const u8* sprite, u32 atlas_width,
                     i32 x, i32 y, u32 w, u32 h) {
     u8* dest_base = fb + y * fb_width + x;
     const u8* src_base = sprite;
@@ -39,17 +26,7 @@ void pxl8_blit_simd_indexed(u8* fb, u32 fb_width, const u8* sprite, u32 atlas_width,
         u8* dest_row = dest_base + row * fb_width;
         const u8* src_row = src_base + row * atlas_width;
-        u32 col = 0;
-        for (; col + PXL8_SIMD_WIDTH_U8 <= w; col += PXL8_SIMD_WIDTH_U8) {
-            pxl8_simd_vec src_vec = pxl8_simd_load_u8(src_row + col);
-            pxl8_simd_vec dest_vec = pxl8_simd_load_u8(dest_row + col);
-            pxl8_simd_vec zero = pxl8_simd_zero_u8();
-            pxl8_simd_vec mask = pxl8_simd_cmpeq_u8(src_vec, zero);
-            pxl8_simd_vec result = pxl8_simd_blendv_u8(src_vec, dest_vec, mask);
-            pxl8_simd_store_u8(dest_row + col, result);
-        }
-        for (; col < w; col++) {
+        for (u32 col = 0; col < w; col++) {
             if (src_row[col] != 0) {
                 dest_row[col] = src_row[col];
             }
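
One caveat worth flagging (my note, not the author's): the surviving loops use a conditional store, and baseline SSE2 has no masked store, so the vectorizer may keep them scalar under default x86-64 flags. If profiling shows that, the usual rewrite is a branchless select, at the cost of always writing the destination:

#include <stdint.h>

/* Hypothetical variant of the hicolor inner loop, not in this commit.
 * The ternary if-converts to a vector blend, which plain SSE2 handles,
 * but it unconditionally stores to dest; that is a behavior change if
 * the framebuffer is write-sensitive. */
static void blit_row_hicolor_select(uint32_t* dest, const uint32_t* src, uint32_t w) {
    for (uint32_t col = 0; col < w; col++) {
        dest[col] = (src[col] & 0xFF000000u) ? src[col] : dest[col];
    }
}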


@@ -1,22 +1,17 @@
 #pragma once
-#include "pxl8_simd.h"
 #include "pxl8_types.h"
-static inline bool pxl8_is_simd_aligned(u32 w) {
-    return w >= PXL8_SIMD_WIDTH_U8 && (w % PXL8_SIMD_WIDTH_U8 == 0);
-}
 #ifdef __cplusplus
 extern "C" {
 #endif
-void pxl8_blit_simd_hicolor(
+void pxl8_blit_hicolor(
     u32* fb, u32 fb_width,
     const u32* sprite, u32 atlas_width,
     i32 x, i32 y, u32 w, u32 h
 );
-void pxl8_blit_simd_indexed(
+void pxl8_blit_indexed(
     u8* fb, u32 fb_width,
     const u8* sprite, u32 atlas_width,
     i32 x, i32 y, u32 w, u32 h


@@ -635,11 +635,11 @@ void pxl8_sprite(pxl8_gfx* gfx, u32 sprite_id, i32 x, i32 y, i32 w, i32 h) {
     u32 atlas_width = pxl8_atlas_get_width(gfx->atlas);
     const u8* atlas_pixels = pxl8_atlas_get_pixels(gfx->atlas);
-    if (is_1to1_scale && is_unclipped && pxl8_is_simd_aligned(w)) {
+    if (is_1to1_scale && is_unclipped) {
         const u8* sprite_data = atlas_pixels + entry->y * atlas_width + entry->x;
         if (gfx->color_mode == PXL8_COLOR_MODE_HICOLOR) {
-            pxl8_blit_simd_hicolor(
+            pxl8_blit_hicolor(
                 (u32*)gfx->framebuffer,
                 gfx->framebuffer_width,
                 (const u32*)sprite_data,
@@ -647,7 +647,7 @@ void pxl8_sprite(pxl8_gfx* gfx, u32 sprite_id, i32 x, i32 y, i32 w, i32 h) {
                 x, y, w, h
             );
         } else {
-            pxl8_blit_simd_indexed(
+            pxl8_blit_indexed(
                 gfx->framebuffer,
                 gfx->framebuffer_width,
                 sprite_data,
@@ -810,20 +810,9 @@ void pxl8_sprite(pxl8_gfx* gfx, u32 sprite_id, i32 x, i32 y, i32 w, i32 h) {
     i32 count = gfx->zbuffer_width * gfx->zbuffer_height;
     const f32 far_z = 1e30f;
-#if !defined(PXL8_SIMD_SCALAR)
-    pxl8_simd_vec_f32 far_vec = pxl8_simd_set1_f32(far_z);
-    i32 i = 0;
-    for (; i + PXL8_SIMD_WIDTH_F32 <= count; i += PXL8_SIMD_WIDTH_F32) {
-        pxl8_simd_store_f32(&gfx->zbuffer[i], far_vec);
-    }
-    for (; i < count; i++) {
-        gfx->zbuffer[i] = far_z;
-    }
-#else
     for (i32 i = 0; i < count; i++) {
         gfx->zbuffer[i] = far_z;
     }
-#endif
 }
 void pxl8_3d_set_backface_culling(pxl8_gfx* gfx, bool culling) {
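
The scalar path that survives here is the canonical fill loop, which GCC and Clang recognize and vectorize with broadcast stores at -O3; memset() is not an option because the four bytes of 1e30f are not identical. A standalone sketch of the same pattern (mine, for illustration):

#include <stddef.h>

/* Fill pattern equivalent to the zbuffer clear: the compiler broadcasts
 * `value` into a vector register and issues wide stores. memset() only
 * covers repeated-byte values such as 0.0f. */
void fill_f32(float* dst, float value, size_t count) {
    for (size_t i = 0; i < count; i++) {
        dst[i] = value;
    }
}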


@@ -21,11 +21,8 @@ typedef enum {
 } pxl8_log_level;
 #ifndef PXL8_LOG_LEVEL
-#ifdef DEBUG
+// Temporary: Always use DEBUG level for benchmarking
 #define PXL8_LOG_LEVEL PXL8_LOG_LEVEL_DEBUG
-#else
-#define PXL8_LOG_LEVEL PXL8_LOG_LEVEL_INFO
-#endif
 #endif
 static pxl8_log_level pxl8_current_log_level = PXL8_LOG_LEVEL;


@@ -1,7 +1,6 @@
 #include <math.h>
 #include "pxl8_math.h"
-#include "pxl8_simd.h"
 pxl8_vec2 pxl8_vec2_add(pxl8_vec2 a, pxl8_vec2 b) {
     return (pxl8_vec2){
@@ -41,45 +40,31 @@ pxl8_vec2 pxl8_vec2_normalize(pxl8_vec2 v) {
 }
 pxl8_vec3 pxl8_vec3_add(pxl8_vec3 a, pxl8_vec3 b) {
-    pxl8_simd_vec_f32 va = pxl8_simd_set_f32(a.x, a.y, a.z, 0);
-    pxl8_simd_vec_f32 vb = pxl8_simd_set_f32(b.x, b.y, b.z, 0);
-    pxl8_simd_vec_f32 result = pxl8_simd_add_f32(va, vb);
     return (pxl8_vec3){
-        .x = result.f32_array[0],
-        .y = result.f32_array[1],
-        .z = result.f32_array[2],
+        .x = a.x + b.x,
+        .y = a.y + b.y,
+        .z = a.z + b.z,
     };
 }
 pxl8_vec3 pxl8_vec3_sub(pxl8_vec3 a, pxl8_vec3 b) {
-    pxl8_simd_vec_f32 va = pxl8_simd_set_f32(a.x, a.y, a.z, 0);
-    pxl8_simd_vec_f32 vb = pxl8_simd_set_f32(b.x, b.y, b.z, 0);
-    pxl8_simd_vec_f32 result = pxl8_simd_sub_f32(va, vb);
     return (pxl8_vec3){
-        .x = result.f32_array[0],
-        .y = result.f32_array[1],
-        .z = result.f32_array[2],
+        .x = a.x - b.x,
+        .y = a.y - b.y,
+        .z = a.z - b.z,
     };
 }
 pxl8_vec3 pxl8_vec3_scale(pxl8_vec3 v, f32 s) {
-    pxl8_simd_vec_f32 vv = pxl8_simd_set_f32(v.x, v.y, v.z, 0);
-    pxl8_simd_vec_f32 result = pxl8_simd_scale_f32(vv, s);
     return (pxl8_vec3){
-        .x = result.f32_array[0],
-        .y = result.f32_array[1],
-        .z = result.f32_array[2],
+        .x = v.x * s,
+        .y = v.y * s,
+        .z = v.z * s,
     };
 }
 f32 pxl8_vec3_dot(pxl8_vec3 a, pxl8_vec3 b) {
-    pxl8_simd_vec_f32 va = pxl8_simd_set_f32(a.x, a.y, a.z, 0);
-    pxl8_simd_vec_f32 vb = pxl8_simd_set_f32(b.x, b.y, b.z, 0);
-    return pxl8_simd_dot3_f32(va, vb);
+    return a.x * b.x + a.y * b.y + a.z * b.z;
 }
 pxl8_vec3 pxl8_vec3_cross(pxl8_vec3 a, pxl8_vec3 b) {
@@ -115,13 +100,11 @@ pxl8_mat4 pxl8_mat4_multiply(pxl8_mat4 a, pxl8_mat4 b) {
     for (i32 i = 0; i < 4; i++) {
         for (i32 j = 0; j < 4; j++) {
-            pxl8_simd_vec_f32 row = pxl8_simd_set_f32(
-                a.m[i * 4 + 0], a.m[i * 4 + 1], a.m[i * 4 + 2], a.m[i * 4 + 3]
-            );
-            pxl8_simd_vec_f32 col = pxl8_simd_set_f32(
-                b.m[0 * 4 + j], b.m[1 * 4 + j], b.m[2 * 4 + j], b.m[3 * 4 + j]
-            );
-            mat.m[i * 4 + j] = pxl8_simd_dot4_f32(row, col);
+            mat.m[i * 4 + j] =
+                a.m[i * 4 + 0] * b.m[0 * 4 + j] +
+                a.m[i * 4 + 1] * b.m[1 * 4 + j] +
+                a.m[i * 4 + 2] * b.m[2 * 4 + j] +
+                a.m[i * 4 + 3] * b.m[3 * 4 + j];
         }
     }
@@ -129,17 +112,11 @@ pxl8_mat4 pxl8_mat4_multiply(pxl8_mat4 a, pxl8_mat4 b) {
 }
 pxl8_vec4 pxl8_mat4_multiply_vec4(pxl8_mat4 m, pxl8_vec4 v) {
-    pxl8_simd_vec_f32 row0 = pxl8_simd_set_f32(m.m[0], m.m[1], m.m[2], m.m[3]);
-    pxl8_simd_vec_f32 row1 = pxl8_simd_set_f32(m.m[4], m.m[5], m.m[6], m.m[7]);
-    pxl8_simd_vec_f32 row2 = pxl8_simd_set_f32(m.m[8], m.m[9], m.m[10], m.m[11]);
-    pxl8_simd_vec_f32 row3 = pxl8_simd_set_f32(m.m[12], m.m[13], m.m[14], m.m[15]);
-    pxl8_simd_vec_f32 vec = pxl8_simd_set_f32(v.x, v.y, v.z, v.w);
     return (pxl8_vec4){
-        .x = pxl8_simd_dot4_f32(row0, vec),
-        .y = pxl8_simd_dot4_f32(row1, vec),
-        .z = pxl8_simd_dot4_f32(row2, vec),
-        .w = pxl8_simd_dot4_f32(row3, vec),
+        .x = m.m[0] * v.x + m.m[1] * v.y + m.m[2] * v.z + m.m[3] * v.w,
+        .y = m.m[4] * v.x + m.m[5] * v.y + m.m[6] * v.z + m.m[7] * v.w,
+        .z = m.m[8] * v.x + m.m[9] * v.y + m.m[10] * v.z + m.m[11] * v.w,
+        .w = m.m[12] * v.x + m.m[13] * v.y + m.m[14] * v.z + m.m[15] * v.w,
    };
 }
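
A side benefit of the scalar rewrite (my observation): the old code round-tripped every operand through pxl8_simd_set_f32 and a union's f32_array, which tends to force values through memory, while the plain expressions let the optimizer keep everything in registers. If mat4 multiply ever shows up in a profile, hoisting the loop-invariant row of `a` is the classic next step, sketched below on the project's types (i32, f32, pxl8_mat4), though -O3 generally performs this hoist by itself:

/* Hypothetical variant of pxl8_mat4_multiply; same math, with the
 * loop-invariant row of `a` made explicit. */
pxl8_mat4 mat4_multiply_hoisted(pxl8_mat4 a, pxl8_mat4 b) {
    pxl8_mat4 mat;
    for (i32 i = 0; i < 4; i++) {
        const f32 a0 = a.m[i * 4 + 0], a1 = a.m[i * 4 + 1];
        const f32 a2 = a.m[i * 4 + 2], a3 = a.m[i * 4 + 3];
        for (i32 j = 0; j < 4; j++) {
            mat.m[i * 4 + j] = a0 * b.m[0 * 4 + j] + a1 * b.m[1 * 4 + j]
                             + a2 * b.m[2 * 4 + j] + a3 * b.m[3 * 4 + j];
        }
    }
    return mat;
}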


@@ -57,8 +57,9 @@ static void* sdl3_create(pxl8_color_mode mode, pxl8_resolution resolution,
         return NULL;
     }
-    if (!SDL_SetRenderVSync(ctx->renderer, 1)) {
-        pxl8_error("Failed to enable vsync: %s", SDL_GetError());
+    // Disable vsync for benchmarking
+    if (!SDL_SetRenderVSync(ctx->renderer, 0)) {
+        pxl8_error("Failed to set vsync: %s", SDL_GetError());
     }
     SDL_SetRenderLogicalPresentation(ctx->renderer, fb_w, fb_h,


@@ -1,364 +0,0 @@
-#pragma once
-#include "pxl8_types.h"
-#if defined(__AVX2__)
-#include <immintrin.h>
-#define PXL8_SIMD_AVX2 1
-#define PXL8_SIMD_WIDTH_U8 32
-#define PXL8_SIMD_WIDTH_U32 8
-#define PXL8_SIMD_WIDTH_F32 8
-#elif defined(__SSE2__)
-#include <emmintrin.h>
-#define PXL8_SIMD_SSE2 1
-#define PXL8_SIMD_WIDTH_U8 16
-#define PXL8_SIMD_WIDTH_U32 4
-#define PXL8_SIMD_WIDTH_F32 4
-#elif defined(__ARM_NEON)
-#include <arm_neon.h>
-#define PXL8_SIMD_NEON 1
-#define PXL8_SIMD_WIDTH_U8 16
-#define PXL8_SIMD_WIDTH_U32 4
-#define PXL8_SIMD_WIDTH_F32 4
-#else
-#define PXL8_SIMD_SCALAR 1
-#define PXL8_SIMD_WIDTH_U8 1
-#define PXL8_SIMD_WIDTH_U32 1
-#define PXL8_SIMD_WIDTH_F32 1
-#endif
-typedef union {
-#if defined(PXL8_SIMD_AVX2)
-    __m256i avx2;
-#elif defined(PXL8_SIMD_SSE2)
-    __m128i sse2;
-#elif defined(PXL8_SIMD_NEON)
-    uint8x16_t neon_u8;
-    uint32x4_t neon_u32;
-#endif
-    u8 u8_array[32];
-    u32 u32_array[8];
-} pxl8_simd_vec;
-static inline pxl8_simd_vec pxl8_simd_load_u8(const u8* src) {
-    pxl8_simd_vec result;
-#if defined(PXL8_SIMD_AVX2)
-    result.avx2 = _mm256_loadu_si256((__m256i*)src);
-#elif defined(PXL8_SIMD_SSE2)
-    result.sse2 = _mm_loadu_si128((__m128i*)src);
-#elif defined(PXL8_SIMD_NEON)
-    result.neon_u8 = vld1q_u8(src);
-#else
-    result.u8_array[0] = src[0];
-#endif
-    return result;
-}
-static inline pxl8_simd_vec pxl8_simd_load_u32(const u32* src) {
-    pxl8_simd_vec result;
-#if defined(PXL8_SIMD_AVX2)
-    result.avx2 = _mm256_loadu_si256((__m256i*)src);
-#elif defined(PXL8_SIMD_SSE2)
-    result.sse2 = _mm_loadu_si128((__m128i*)src);
-#elif defined(PXL8_SIMD_NEON)
-    result.neon_u32 = vld1q_u32(src);
-#else
-    result.u32_array[0] = src[0];
-#endif
-    return result;
-}
-static inline void pxl8_simd_store_u8(u8* dest, pxl8_simd_vec vec) {
-#if defined(PXL8_SIMD_AVX2)
-    _mm256_storeu_si256((__m256i*)dest, vec.avx2);
-#elif defined(PXL8_SIMD_SSE2)
-    _mm_storeu_si128((__m128i*)dest, vec.sse2);
-#elif defined(PXL8_SIMD_NEON)
-    vst1q_u8(dest, vec.neon_u8);
-#else
-    dest[0] = vec.u8_array[0];
-#endif
-}
-static inline void pxl8_simd_store_u32(u32* dest, pxl8_simd_vec vec) {
-#if defined(PXL8_SIMD_AVX2)
-    _mm256_storeu_si256((__m256i*)dest, vec.avx2);
-#elif defined(PXL8_SIMD_SSE2)
-    _mm_storeu_si128((__m128i*)dest, vec.sse2);
-#elif defined(PXL8_SIMD_NEON)
-    vst1q_u32(dest, vec.neon_u32);
-#else
-    dest[0] = vec.u32_array[0];
-#endif
-}
-static inline pxl8_simd_vec pxl8_simd_zero_u8(void) {
-    pxl8_simd_vec result;
-#if defined(PXL8_SIMD_AVX2)
-    result.avx2 = _mm256_setzero_si256();
-#elif defined(PXL8_SIMD_SSE2)
-    result.sse2 = _mm_setzero_si128();
-#elif defined(PXL8_SIMD_NEON)
-    result.neon_u8 = vdupq_n_u8(0);
-#else
-    result.u8_array[0] = 0;
-#endif
-    return result;
-}
-static inline pxl8_simd_vec pxl8_simd_alpha_mask_u32(void) {
-    pxl8_simd_vec result;
-#if defined(PXL8_SIMD_AVX2)
-    result.avx2 = _mm256_set1_epi32(0xFF000000);
-#elif defined(PXL8_SIMD_SSE2)
-    result.sse2 = _mm_set1_epi32(0xFF000000);
-#elif defined(PXL8_SIMD_NEON)
-    result.neon_u32 = vdupq_n_u32(0xFF000000);
-#else
-    result.u32_array[0] = 0xFF000000;
-#endif
-    return result;
-}
-static inline pxl8_simd_vec pxl8_simd_and(pxl8_simd_vec a, pxl8_simd_vec b) {
-    pxl8_simd_vec result;
-#if defined(PXL8_SIMD_AVX2)
-    result.avx2 = _mm256_and_si256(a.avx2, b.avx2);
-#elif defined(PXL8_SIMD_SSE2)
-    result.sse2 = _mm_and_si128(a.sse2, b.sse2);
-#elif defined(PXL8_SIMD_NEON)
-    result.neon_u8 = vandq_u8(a.neon_u8, b.neon_u8);
-#else
-    result.u8_array[0] = a.u8_array[0] & b.u8_array[0];
-#endif
-    return result;
-}
-static inline pxl8_simd_vec pxl8_simd_cmpeq_u8(pxl8_simd_vec a, pxl8_simd_vec b) {
-    pxl8_simd_vec result;
-#if defined(PXL8_SIMD_AVX2)
-    result.avx2 = _mm256_cmpeq_epi8(a.avx2, b.avx2);
-#elif defined(PXL8_SIMD_SSE2)
-    result.sse2 = _mm_cmpeq_epi8(a.sse2, b.sse2);
-#elif defined(PXL8_SIMD_NEON)
-    result.neon_u8 = vceqq_u8(a.neon_u8, b.neon_u8);
-#else
-    result.u8_array[0] = (a.u8_array[0] == b.u8_array[0]) ? 0xFF : 0x00;
-#endif
-    return result;
-}
-static inline pxl8_simd_vec pxl8_simd_cmpeq_u32(pxl8_simd_vec a, pxl8_simd_vec b) {
-    pxl8_simd_vec result;
-#if defined(PXL8_SIMD_AVX2)
-    result.avx2 = _mm256_cmpeq_epi32(a.avx2, b.avx2);
-#elif defined(PXL8_SIMD_SSE2)
-    result.sse2 = _mm_cmpeq_epi32(a.sse2, b.sse2);
-#elif defined(PXL8_SIMD_NEON)
-    result.neon_u32 = vceqq_u32(a.neon_u32, b.neon_u32);
-#else
-    result.u32_array[0] = (a.u32_array[0] == b.u32_array[0]) ? 0xFFFFFFFF : 0x00000000;
-#endif
-    return result;
-}
-static inline pxl8_simd_vec pxl8_simd_blendv_u8(pxl8_simd_vec src, pxl8_simd_vec dest, pxl8_simd_vec mask) {
-    pxl8_simd_vec result;
-#if defined(PXL8_SIMD_AVX2)
-    result.avx2 = _mm256_blendv_epi8(src.avx2, dest.avx2, mask.avx2);
-#elif defined(PXL8_SIMD_SSE2)
-    pxl8_simd_vec not_mask; not_mask.sse2 = _mm_xor_si128(mask.sse2, _mm_set1_epi8(-1));
-    result.sse2 = _mm_or_si128(_mm_and_si128(mask.sse2, dest.sse2), _mm_and_si128(not_mask.sse2, src.sse2));
-#elif defined(PXL8_SIMD_NEON)
-    result.neon_u8 = vbslq_u8(mask.neon_u8, dest.neon_u8, src.neon_u8);
-#else
-    result.u8_array[0] = mask.u8_array[0] ? dest.u8_array[0] : src.u8_array[0];
-#endif
-    return result;
-}
-static inline pxl8_simd_vec pxl8_simd_blendv_u32(pxl8_simd_vec src, pxl8_simd_vec dest, pxl8_simd_vec mask) {
-    pxl8_simd_vec result;
-#if defined(PXL8_SIMD_AVX2)
-    result.avx2 = _mm256_blendv_epi8(src.avx2, dest.avx2, mask.avx2);
-#elif defined(PXL8_SIMD_SSE2)
-    pxl8_simd_vec not_mask; not_mask.sse2 = _mm_xor_si128(mask.sse2, _mm_set1_epi32(-1));
-    result.sse2 = _mm_or_si128(_mm_and_si128(mask.sse2, dest.sse2), _mm_and_si128(not_mask.sse2, src.sse2));
-#elif defined(PXL8_SIMD_NEON)
-    result.neon_u32 = vbslq_u32(mask.neon_u32, dest.neon_u32, src.neon_u32);
-#else
-    result.u32_array[0] = mask.u32_array[0] ? dest.u32_array[0] : src.u32_array[0];
-#endif
-    return result;
-}
-typedef union {
-#if defined(PXL8_SIMD_AVX2)
-    __m256 avx2;
-    __m128 sse;
-#elif defined(PXL8_SIMD_SSE2)
-    __m128 sse;
-#elif defined(PXL8_SIMD_NEON)
-    float32x4_t neon;
-#endif
-    f32 f32_array[8];
-} pxl8_simd_vec_f32;
-static inline pxl8_simd_vec_f32 pxl8_simd_set_f32(f32 x, f32 y, f32 z, f32 w) {
-    pxl8_simd_vec_f32 result;
-#if defined(PXL8_SIMD_AVX2)
-    result.avx2 = _mm256_set_ps(0, 0, 0, 0, w, z, y, x);
-#elif defined(PXL8_SIMD_SSE2)
-    result.sse = _mm_set_ps(w, z, y, x);
-#elif defined(PXL8_SIMD_NEON)
-    f32 data[4] = {x, y, z, w};
-    result.neon = vld1q_f32(data);
-#else
-    result.f32_array[0] = x;
-    result.f32_array[1] = y;
-    result.f32_array[2] = z;
-    result.f32_array[3] = w;
-#endif
-    return result;
-}
-static inline pxl8_simd_vec_f32 pxl8_simd_set1_f32(f32 value) {
-    pxl8_simd_vec_f32 result;
-#if defined(PXL8_SIMD_AVX2)
-    result.avx2 = _mm256_set1_ps(value);
-#elif defined(PXL8_SIMD_SSE2)
-    result.sse = _mm_set1_ps(value);
-#elif defined(PXL8_SIMD_NEON)
-    result.neon = vdupq_n_f32(value);
-#else
-    result.f32_array[0] = value;
-#endif
-    return result;
-}
-static inline void pxl8_simd_store_f32(f32* dest, pxl8_simd_vec_f32 vec) {
-#if defined(PXL8_SIMD_AVX2)
-    _mm256_storeu_ps(dest, vec.avx2);
-#elif defined(PXL8_SIMD_SSE2)
-    _mm_storeu_ps(dest, vec.sse);
-#elif defined(PXL8_SIMD_NEON)
-    vst1q_f32(dest, vec.neon);
-#else
-    dest[0] = vec.f32_array[0];
-#endif
-}
-static inline pxl8_simd_vec_f32 pxl8_simd_add_f32(pxl8_simd_vec_f32 a, pxl8_simd_vec_f32 b) {
-    pxl8_simd_vec_f32 result;
-#if defined(PXL8_SIMD_AVX2)
-    result.avx2 = _mm256_add_ps(a.avx2, b.avx2);
-#elif defined(PXL8_SIMD_SSE2)
-    result.sse = _mm_add_ps(a.sse, b.sse);
-#elif defined(PXL8_SIMD_NEON)
-    result.neon = vaddq_f32(a.neon, b.neon);
-#else
-    for (i32 i = 0; i < 4; i++) result.f32_array[i] = a.f32_array[i] + b.f32_array[i];
-#endif
-    return result;
-}
-static inline pxl8_simd_vec_f32 pxl8_simd_sub_f32(pxl8_simd_vec_f32 a, pxl8_simd_vec_f32 b) {
-    pxl8_simd_vec_f32 result;
-#if defined(PXL8_SIMD_AVX2)
-    result.avx2 = _mm256_sub_ps(a.avx2, b.avx2);
-#elif defined(PXL8_SIMD_SSE2)
-    result.sse = _mm_sub_ps(a.sse, b.sse);
-#elif defined(PXL8_SIMD_NEON)
-    result.neon = vsubq_f32(a.neon, b.neon);
-#else
-    for (i32 i = 0; i < 4; i++) result.f32_array[i] = a.f32_array[i] - b.f32_array[i];
-#endif
-    return result;
-}
-static inline pxl8_simd_vec_f32 pxl8_simd_mul_f32(pxl8_simd_vec_f32 a, pxl8_simd_vec_f32 b) {
-    pxl8_simd_vec_f32 result;
-#if defined(PXL8_SIMD_AVX2)
-    result.avx2 = _mm256_mul_ps(a.avx2, b.avx2);
-#elif defined(PXL8_SIMD_SSE2)
-    result.sse = _mm_mul_ps(a.sse, b.sse);
-#elif defined(PXL8_SIMD_NEON)
-    result.neon = vmulq_f32(a.neon, b.neon);
-#else
-    for (i32 i = 0; i < 4; i++) result.f32_array[i] = a.f32_array[i] * b.f32_array[i];
-#endif
-    return result;
-}
-static inline pxl8_simd_vec_f32 pxl8_simd_scale_f32(pxl8_simd_vec_f32 v, f32 s) {
-    pxl8_simd_vec_f32 result;
-#if defined(PXL8_SIMD_AVX2)
-    result.avx2 = _mm256_mul_ps(v.avx2, _mm256_set1_ps(s));
-#elif defined(PXL8_SIMD_SSE2)
-    result.sse = _mm_mul_ps(v.sse, _mm_set1_ps(s));
-#elif defined(PXL8_SIMD_NEON)
-    result.neon = vmulq_n_f32(v.neon, s);
-#else
-    for (i32 i = 0; i < 4; i++) result.f32_array[i] = v.f32_array[i] * s;
-#endif
-    return result;
-}
-static inline f32 pxl8_simd_dot3_f32(pxl8_simd_vec_f32 a, pxl8_simd_vec_f32 b) {
-#if defined(PXL8_SIMD_AVX2)
-    __m128 a_low = _mm256_castps256_ps128(a.avx2);
-    __m128 b_low = _mm256_castps256_ps128(b.avx2);
-    __m128 mul = _mm_mul_ps(a_low, b_low);
-    __m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 1, 0, 3));
-    __m128 sums = _mm_add_ps(mul, shuf);
-    shuf = _mm_movehl_ps(shuf, sums);
-    sums = _mm_add_ss(sums, shuf);
-    return _mm_cvtss_f32(sums);
-#elif defined(PXL8_SIMD_SSE2)
-    __m128 mul = _mm_mul_ps(a.sse, b.sse);
-    __m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 1, 0, 3));
-    __m128 sums = _mm_add_ps(mul, shuf);
-    shuf = _mm_movehl_ps(shuf, sums);
-    sums = _mm_add_ss(sums, shuf);
-    return _mm_cvtss_f32(sums);
-#elif defined(PXL8_SIMD_NEON)
-    float32x4_t mul = vmulq_f32(a.neon, b.neon);
-    float32x2_t sum = vpadd_f32(vget_low_f32(mul), vget_high_f32(mul));
-    sum = vpadd_f32(sum, sum);
-    return vget_lane_f32(sum, 0);
-#else
-    return a.f32_array[0] * b.f32_array[0] +
-           a.f32_array[1] * b.f32_array[1] +
-           a.f32_array[2] * b.f32_array[2];
-#endif
-}
-static inline f32 pxl8_simd_dot4_f32(pxl8_simd_vec_f32 a, pxl8_simd_vec_f32 b) {
-#if defined(PXL8_SIMD_AVX2)
-    __m128 a_low = _mm256_castps256_ps128(a.avx2);
-    __m128 b_low = _mm256_castps256_ps128(b.avx2);
-    __m128 mul = _mm_mul_ps(a_low, b_low);
-    __m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
-    __m128 sums = _mm_add_ps(mul, shuf);
-    shuf = _mm_movehl_ps(shuf, sums);
-    sums = _mm_add_ss(sums, shuf);
-    return _mm_cvtss_f32(sums);
-#elif defined(PXL8_SIMD_SSE2)
-    __m128 mul = _mm_mul_ps(a.sse, b.sse);
-    __m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
-    __m128 sums = _mm_add_ps(mul, shuf);
-    shuf = _mm_movehl_ps(shuf, sums);
-    sums = _mm_add_ss(sums, shuf);
-    return _mm_cvtss_f32(sums);
-#elif defined(PXL8_SIMD_NEON)
-    float32x4_t mul = vmulq_f32(a.neon, b.neon);
-    float32x2_t sum = vpadd_f32(vget_low_f32(mul), vget_high_f32(mul));
-    sum = vpadd_f32(sum, sum);
-    return vget_lane_f32(sum, 0);
-#else
-    return a.f32_array[0] * b.f32_array[0] +
-           a.f32_array[1] * b.f32_array[1] +
-           a.f32_array[2] * b.f32_array[2] +
-           a.f32_array[3] * b.f32_array[3];
-#endif
-}