remove simd, scalar math + compiler optimizations are good enough
This commit is contained in:
parent
e2c7998663
commit
4d84122ef3
8 changed files with 41 additions and 509 deletions
44
pxl8.sh
44
pxl8.sh
|
|
@ -80,26 +80,6 @@ compile_source_file() {
|
|||
fi
|
||||
}
|
||||
|
||||
detect_simd() {
|
||||
local simd_flags=""
|
||||
|
||||
case "$(uname -m)" in
|
||||
x86_64|amd64)
|
||||
if $CC -mavx2 -c -x c /dev/null -o /dev/null 2>/dev/null; then
|
||||
simd_flags="-mavx2 -msse2"
|
||||
elif $CC -msse2 -c -x c /dev/null -o /dev/null 2>/dev/null; then
|
||||
simd_flags="-msse2"
|
||||
fi
|
||||
;;
|
||||
arm64|aarch64)
|
||||
if $CC -march=armv8-a+simd -c -x c /dev/null -o /dev/null 2>/dev/null; then
|
||||
simd_flags="-march=armv8-a+simd"
|
||||
fi
|
||||
;;
|
||||
esac
|
||||
|
||||
echo "$simd_flags"
|
||||
}
|
||||
|
||||
make_lib_dirs() {
|
||||
mkdir -p lib/linenoise lib/fennel lib/microui/src lib/miniz
|
||||
|
|
@ -294,15 +274,12 @@ for arg in "$@"; do
|
|||
esac
|
||||
done
|
||||
|
||||
SIMD_FLAGS=$(detect_simd)
|
||||
|
||||
if [ "$MODE" = "release" ]; then
|
||||
CFLAGS="$CFLAGS -O3 -march=native -mtune=native -ffast-math -funroll-loops -fno-unwind-tables -fno-asynchronous-unwind-tables $SIMD_FLAGS"
|
||||
|
||||
CFLAGS="$CFLAGS -O3 -ffast-math -funroll-loops -fno-unwind-tables -fno-asynchronous-unwind-tables"
|
||||
BUILDDIR="$BUILDDIR/release"
|
||||
BINDIR="$BINDIR/release"
|
||||
else
|
||||
CFLAGS="$CFLAGS -g -O1 -DDEBUG $SIMD_FLAGS"
|
||||
CFLAGS="$CFLAGS -g -O1 -DDEBUG"
|
||||
BUILDDIR="$BUILDDIR/debug"
|
||||
BINDIR="$BINDIR/debug"
|
||||
fi
|
||||
|
|
@ -354,23 +331,6 @@ case "$COMMAND" in
|
|||
print_info "Compiler cache: ccache enabled"
|
||||
fi
|
||||
|
||||
if [[ -n "$SIMD_FLAGS" ]]; then
|
||||
case "$(uname -m)" in
|
||||
x86_64|amd64)
|
||||
if [[ "$SIMD_FLAGS" == *"mavx2"* ]]; then
|
||||
print_info "SIMD: AVX2 + SSE2 enabled"
|
||||
elif [[ "$SIMD_FLAGS" == *"msse2"* ]]; then
|
||||
print_info "SIMD: SSE2 enabled"
|
||||
fi
|
||||
;;
|
||||
arm64|aarch64)
|
||||
print_info "SIMD: ARM NEON enabled"
|
||||
;;
|
||||
esac
|
||||
else
|
||||
print_info "SIMD: Scalar fallback"
|
||||
fi
|
||||
|
||||
INCLUDES="-Isrc -Ilib -Ilib/microui/src -Ilib/luajit/src -Ilib/linenoise -Ilib/miniz"
|
||||
COMPILE_FLAGS="$CFLAGS $INCLUDES"
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
#include "pxl8_blit.h"
|
||||
#include "pxl8_simd.h"
|
||||
|
||||
void pxl8_blit_simd_hicolor(u32* fb, u32 fb_width, const u32* sprite, u32 atlas_width,
|
||||
void pxl8_blit_hicolor(u32* fb, u32 fb_width, const u32* sprite, u32 atlas_width,
|
||||
i32 x, i32 y, u32 w, u32 h) {
|
||||
u32* dest_base = fb + y * fb_width + x;
|
||||
const u32* src_base = sprite;
|
||||
|
|
@ -10,19 +9,7 @@ void pxl8_blit_simd_hicolor(u32* fb, u32 fb_width, const u32* sprite, u32 atlas_
|
|||
u32* dest_row = dest_base + row * fb_width;
|
||||
const u32* src_row = src_base + row * atlas_width;
|
||||
|
||||
u32 col = 0;
|
||||
for (; col + PXL8_SIMD_WIDTH_U32 <= w; col += PXL8_SIMD_WIDTH_U32) {
|
||||
pxl8_simd_vec src_vec = pxl8_simd_load_u32(src_row + col);
|
||||
pxl8_simd_vec dest_vec = pxl8_simd_load_u32(dest_row + col);
|
||||
pxl8_simd_vec alpha_mask = pxl8_simd_alpha_mask_u32();
|
||||
pxl8_simd_vec has_alpha = pxl8_simd_and(src_vec, alpha_mask);
|
||||
pxl8_simd_vec zero = pxl8_simd_zero_u8();
|
||||
pxl8_simd_vec mask = pxl8_simd_cmpeq_u32(has_alpha, zero);
|
||||
pxl8_simd_vec result = pxl8_simd_blendv_u32(src_vec, dest_vec, mask);
|
||||
pxl8_simd_store_u32(dest_row + col, result);
|
||||
}
|
||||
|
||||
for (; col < w; col++) {
|
||||
for (u32 col = 0; col < w; col++) {
|
||||
if (src_row[col] & 0xFF000000) {
|
||||
dest_row[col] = src_row[col];
|
||||
}
|
||||
|
|
@ -30,7 +17,7 @@ void pxl8_blit_simd_hicolor(u32* fb, u32 fb_width, const u32* sprite, u32 atlas_
|
|||
}
|
||||
}
|
||||
|
||||
void pxl8_blit_simd_indexed(u8* fb, u32 fb_width, const u8* sprite, u32 atlas_width,
|
||||
void pxl8_blit_indexed(u8* fb, u32 fb_width, const u8* sprite, u32 atlas_width,
|
||||
i32 x, i32 y, u32 w, u32 h) {
|
||||
u8* dest_base = fb + y * fb_width + x;
|
||||
const u8* src_base = sprite;
|
||||
|
|
@ -39,17 +26,7 @@ void pxl8_blit_simd_indexed(u8* fb, u32 fb_width, const u8* sprite, u32 atlas_wi
|
|||
u8* dest_row = dest_base + row * fb_width;
|
||||
const u8* src_row = src_base + row * atlas_width;
|
||||
|
||||
u32 col = 0;
|
||||
for (; col + PXL8_SIMD_WIDTH_U8 <= w; col += PXL8_SIMD_WIDTH_U8) {
|
||||
pxl8_simd_vec src_vec = pxl8_simd_load_u8(src_row + col);
|
||||
pxl8_simd_vec dest_vec = pxl8_simd_load_u8(dest_row + col);
|
||||
pxl8_simd_vec zero = pxl8_simd_zero_u8();
|
||||
pxl8_simd_vec mask = pxl8_simd_cmpeq_u8(src_vec, zero);
|
||||
pxl8_simd_vec result = pxl8_simd_blendv_u8(src_vec, dest_vec, mask);
|
||||
pxl8_simd_store_u8(dest_row + col, result);
|
||||
}
|
||||
|
||||
for (; col < w; col++) {
|
||||
for (u32 col = 0; col < w; col++) {
|
||||
if (src_row[col] != 0) {
|
||||
dest_row[col] = src_row[col];
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,22 +1,17 @@
|
|||
#pragma once
|
||||
|
||||
#include "pxl8_simd.h"
|
||||
#include "pxl8_types.h"
|
||||
|
||||
static inline bool pxl8_is_simd_aligned(u32 w) {
|
||||
return w >= PXL8_SIMD_WIDTH_U8 && (w % PXL8_SIMD_WIDTH_U8 == 0);
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void pxl8_blit_simd_hicolor(
|
||||
void pxl8_blit_hicolor(
|
||||
u32* fb, u32 fb_width,
|
||||
const u32* sprite, u32 atlas_width,
|
||||
i32 x, i32 y, u32 w, u32 h
|
||||
);
|
||||
void pxl8_blit_simd_indexed(
|
||||
void pxl8_blit_indexed(
|
||||
u8* fb, u32 fb_width,
|
||||
const u8* sprite, u32 atlas_width,
|
||||
i32 x, i32 y, u32 w, u32 h
|
||||
|
|
|
|||
|
|
@ -635,11 +635,11 @@ void pxl8_sprite(pxl8_gfx* gfx, u32 sprite_id, i32 x, i32 y, i32 w, i32 h) {
|
|||
u32 atlas_width = pxl8_atlas_get_width(gfx->atlas);
|
||||
const u8* atlas_pixels = pxl8_atlas_get_pixels(gfx->atlas);
|
||||
|
||||
if (is_1to1_scale && is_unclipped && pxl8_is_simd_aligned(w)) {
|
||||
if (is_1to1_scale && is_unclipped) {
|
||||
const u8* sprite_data = atlas_pixels + entry->y * atlas_width + entry->x;
|
||||
|
||||
if (gfx->color_mode == PXL8_COLOR_MODE_HICOLOR) {
|
||||
pxl8_blit_simd_hicolor(
|
||||
pxl8_blit_hicolor(
|
||||
(u32*)gfx->framebuffer,
|
||||
gfx->framebuffer_width,
|
||||
(const u32*)sprite_data,
|
||||
|
|
@ -647,7 +647,7 @@ void pxl8_sprite(pxl8_gfx* gfx, u32 sprite_id, i32 x, i32 y, i32 w, i32 h) {
|
|||
x, y, w, h
|
||||
);
|
||||
} else {
|
||||
pxl8_blit_simd_indexed(
|
||||
pxl8_blit_indexed(
|
||||
gfx->framebuffer,
|
||||
gfx->framebuffer_width,
|
||||
sprite_data,
|
||||
|
|
@ -810,20 +810,9 @@ void pxl8_3d_clear_zbuffer(pxl8_gfx* gfx) {
|
|||
i32 count = gfx->zbuffer_width * gfx->zbuffer_height;
|
||||
const f32 far_z = 1e30f;
|
||||
|
||||
#if !defined(PXL8_SIMD_SCALAR)
|
||||
pxl8_simd_vec_f32 far_vec = pxl8_simd_set1_f32(far_z);
|
||||
i32 i = 0;
|
||||
for (; i + PXL8_SIMD_WIDTH_F32 <= count; i += PXL8_SIMD_WIDTH_F32) {
|
||||
pxl8_simd_store_f32(&gfx->zbuffer[i], far_vec);
|
||||
}
|
||||
for (; i < count; i++) {
|
||||
gfx->zbuffer[i] = far_z;
|
||||
}
|
||||
#else
|
||||
for (i32 i = 0; i < count; i++) {
|
||||
gfx->zbuffer[i] = far_z;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void pxl8_3d_set_backface_culling(pxl8_gfx* gfx, bool culling) {
|
||||
|
|
|
|||
|
|
@ -21,11 +21,8 @@ typedef enum {
|
|||
} pxl8_log_level;
|
||||
|
||||
#ifndef PXL8_LOG_LEVEL
|
||||
#ifdef DEBUG
|
||||
// Temporary: Always use DEBUG level for benchmarking
|
||||
#define PXL8_LOG_LEVEL PXL8_LOG_LEVEL_DEBUG
|
||||
#else
|
||||
#define PXL8_LOG_LEVEL PXL8_LOG_LEVEL_INFO
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static pxl8_log_level pxl8_current_log_level = PXL8_LOG_LEVEL;
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
#include <math.h>
|
||||
|
||||
#include "pxl8_math.h"
|
||||
#include "pxl8_simd.h"
|
||||
|
||||
pxl8_vec2 pxl8_vec2_add(pxl8_vec2 a, pxl8_vec2 b) {
|
||||
return (pxl8_vec2){
|
||||
|
|
@ -41,45 +40,31 @@ pxl8_vec2 pxl8_vec2_normalize(pxl8_vec2 v) {
|
|||
}
|
||||
|
||||
pxl8_vec3 pxl8_vec3_add(pxl8_vec3 a, pxl8_vec3 b) {
|
||||
pxl8_simd_vec_f32 va = pxl8_simd_set_f32(a.x, a.y, a.z, 0);
|
||||
pxl8_simd_vec_f32 vb = pxl8_simd_set_f32(b.x, b.y, b.z, 0);
|
||||
pxl8_simd_vec_f32 result = pxl8_simd_add_f32(va, vb);
|
||||
|
||||
return (pxl8_vec3){
|
||||
.x = result.f32_array[0],
|
||||
.y = result.f32_array[1],
|
||||
.z = result.f32_array[2],
|
||||
.x = a.x + b.x,
|
||||
.y = a.y + b.y,
|
||||
.z = a.z + b.z,
|
||||
};
|
||||
}
|
||||
|
||||
pxl8_vec3 pxl8_vec3_sub(pxl8_vec3 a, pxl8_vec3 b) {
|
||||
pxl8_simd_vec_f32 va = pxl8_simd_set_f32(a.x, a.y, a.z, 0);
|
||||
pxl8_simd_vec_f32 vb = pxl8_simd_set_f32(b.x, b.y, b.z, 0);
|
||||
pxl8_simd_vec_f32 result = pxl8_simd_sub_f32(va, vb);
|
||||
|
||||
return (pxl8_vec3){
|
||||
.x = result.f32_array[0],
|
||||
.y = result.f32_array[1],
|
||||
.z = result.f32_array[2],
|
||||
.x = a.x - b.x,
|
||||
.y = a.y - b.y,
|
||||
.z = a.z - b.z,
|
||||
};
|
||||
}
|
||||
|
||||
pxl8_vec3 pxl8_vec3_scale(pxl8_vec3 v, f32 s) {
|
||||
pxl8_simd_vec_f32 vv = pxl8_simd_set_f32(v.x, v.y, v.z, 0);
|
||||
pxl8_simd_vec_f32 result = pxl8_simd_scale_f32(vv, s);
|
||||
|
||||
return (pxl8_vec3){
|
||||
.x = result.f32_array[0],
|
||||
.y = result.f32_array[1],
|
||||
.z = result.f32_array[2],
|
||||
.x = v.x * s,
|
||||
.y = v.y * s,
|
||||
.z = v.z * s,
|
||||
};
|
||||
}
|
||||
|
||||
f32 pxl8_vec3_dot(pxl8_vec3 a, pxl8_vec3 b) {
|
||||
pxl8_simd_vec_f32 va = pxl8_simd_set_f32(a.x, a.y, a.z, 0);
|
||||
pxl8_simd_vec_f32 vb = pxl8_simd_set_f32(b.x, b.y, b.z, 0);
|
||||
|
||||
return pxl8_simd_dot3_f32(va, vb);
|
||||
return a.x * b.x + a.y * b.y + a.z * b.z;
|
||||
}
|
||||
|
||||
pxl8_vec3 pxl8_vec3_cross(pxl8_vec3 a, pxl8_vec3 b) {
|
||||
|
|
@ -115,13 +100,11 @@ pxl8_mat4 pxl8_mat4_multiply(pxl8_mat4 a, pxl8_mat4 b) {
|
|||
|
||||
for (i32 i = 0; i < 4; i++) {
|
||||
for (i32 j = 0; j < 4; j++) {
|
||||
pxl8_simd_vec_f32 row = pxl8_simd_set_f32(
|
||||
a.m[i * 4 + 0], a.m[i * 4 + 1], a.m[i * 4 + 2], a.m[i * 4 + 3]
|
||||
);
|
||||
pxl8_simd_vec_f32 col = pxl8_simd_set_f32(
|
||||
b.m[0 * 4 + j], b.m[1 * 4 + j], b.m[2 * 4 + j], b.m[3 * 4 + j]
|
||||
);
|
||||
mat.m[i * 4 + j] = pxl8_simd_dot4_f32(row, col);
|
||||
mat.m[i * 4 + j] =
|
||||
a.m[i * 4 + 0] * b.m[0 * 4 + j] +
|
||||
a.m[i * 4 + 1] * b.m[1 * 4 + j] +
|
||||
a.m[i * 4 + 2] * b.m[2 * 4 + j] +
|
||||
a.m[i * 4 + 3] * b.m[3 * 4 + j];
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -129,17 +112,11 @@ pxl8_mat4 pxl8_mat4_multiply(pxl8_mat4 a, pxl8_mat4 b) {
|
|||
}
|
||||
|
||||
pxl8_vec4 pxl8_mat4_multiply_vec4(pxl8_mat4 m, pxl8_vec4 v) {
|
||||
pxl8_simd_vec_f32 row0 = pxl8_simd_set_f32(m.m[0], m.m[1], m.m[2], m.m[3]);
|
||||
pxl8_simd_vec_f32 row1 = pxl8_simd_set_f32(m.m[4], m.m[5], m.m[6], m.m[7]);
|
||||
pxl8_simd_vec_f32 row2 = pxl8_simd_set_f32(m.m[8], m.m[9], m.m[10], m.m[11]);
|
||||
pxl8_simd_vec_f32 row3 = pxl8_simd_set_f32(m.m[12], m.m[13], m.m[14], m.m[15]);
|
||||
pxl8_simd_vec_f32 vec = pxl8_simd_set_f32(v.x, v.y, v.z, v.w);
|
||||
|
||||
return (pxl8_vec4){
|
||||
.x = pxl8_simd_dot4_f32(row0, vec),
|
||||
.y = pxl8_simd_dot4_f32(row1, vec),
|
||||
.z = pxl8_simd_dot4_f32(row2, vec),
|
||||
.w = pxl8_simd_dot4_f32(row3, vec),
|
||||
.x = m.m[0] * v.x + m.m[1] * v.y + m.m[2] * v.z + m.m[3] * v.w,
|
||||
.y = m.m[4] * v.x + m.m[5] * v.y + m.m[6] * v.z + m.m[7] * v.w,
|
||||
.z = m.m[8] * v.x + m.m[9] * v.y + m.m[10] * v.z + m.m[11] * v.w,
|
||||
.w = m.m[12] * v.x + m.m[13] * v.y + m.m[14] * v.z + m.m[15] * v.w,
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -57,8 +57,9 @@ static void* sdl3_create(pxl8_color_mode mode, pxl8_resolution resolution,
|
|||
return NULL;
|
||||
}
|
||||
|
||||
if (!SDL_SetRenderVSync(ctx->renderer, 1)) {
|
||||
pxl8_error("Failed to enable vsync: %s", SDL_GetError());
|
||||
// Disable vsync for benchmarking
|
||||
if (!SDL_SetRenderVSync(ctx->renderer, 0)) {
|
||||
pxl8_error("Failed to set vsync: %s", SDL_GetError());
|
||||
}
|
||||
|
||||
SDL_SetRenderLogicalPresentation(ctx->renderer, fb_w, fb_h,
|
||||
|
|
|
|||
364
src/pxl8_simd.h
364
src/pxl8_simd.h
|
|
@ -1,364 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include "pxl8_types.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#include <immintrin.h>
|
||||
#define PXL8_SIMD_AVX2 1
|
||||
#define PXL8_SIMD_WIDTH_U8 32
|
||||
#define PXL8_SIMD_WIDTH_U32 8
|
||||
#define PXL8_SIMD_WIDTH_F32 8
|
||||
#elif defined(__SSE2__)
|
||||
#include <emmintrin.h>
|
||||
#define PXL8_SIMD_SSE2 1
|
||||
#define PXL8_SIMD_WIDTH_U8 16
|
||||
#define PXL8_SIMD_WIDTH_U32 4
|
||||
#define PXL8_SIMD_WIDTH_F32 4
|
||||
#elif defined(__ARM_NEON)
|
||||
#include <arm_neon.h>
|
||||
#define PXL8_SIMD_NEON 1
|
||||
#define PXL8_SIMD_WIDTH_U8 16
|
||||
#define PXL8_SIMD_WIDTH_U32 4
|
||||
#define PXL8_SIMD_WIDTH_F32 4
|
||||
#else
|
||||
#define PXL8_SIMD_SCALAR 1
|
||||
#define PXL8_SIMD_WIDTH_U8 1
|
||||
#define PXL8_SIMD_WIDTH_U32 1
|
||||
#define PXL8_SIMD_WIDTH_F32 1
|
||||
#endif
|
||||
|
||||
typedef union {
|
||||
#if defined(PXL8_SIMD_AVX2)
|
||||
__m256i avx2;
|
||||
#elif defined(PXL8_SIMD_SSE2)
|
||||
__m128i sse2;
|
||||
#elif defined(PXL8_SIMD_NEON)
|
||||
uint8x16_t neon_u8;
|
||||
uint32x4_t neon_u32;
|
||||
#endif
|
||||
u8 u8_array[32];
|
||||
u32 u32_array[8];
|
||||
} pxl8_simd_vec;
|
||||
|
||||
static inline pxl8_simd_vec pxl8_simd_load_u8(const u8* src) {
|
||||
pxl8_simd_vec result;
|
||||
#if defined(PXL8_SIMD_AVX2)
|
||||
result.avx2 = _mm256_loadu_si256((__m256i*)src);
|
||||
#elif defined(PXL8_SIMD_SSE2)
|
||||
result.sse2 = _mm_loadu_si128((__m128i*)src);
|
||||
#elif defined(PXL8_SIMD_NEON)
|
||||
result.neon_u8 = vld1q_u8(src);
|
||||
#else
|
||||
result.u8_array[0] = src[0];
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline pxl8_simd_vec pxl8_simd_load_u32(const u32* src) {
|
||||
pxl8_simd_vec result;
|
||||
#if defined(PXL8_SIMD_AVX2)
|
||||
result.avx2 = _mm256_loadu_si256((__m256i*)src);
|
||||
#elif defined(PXL8_SIMD_SSE2)
|
||||
result.sse2 = _mm_loadu_si128((__m128i*)src);
|
||||
#elif defined(PXL8_SIMD_NEON)
|
||||
result.neon_u32 = vld1q_u32(src);
|
||||
#else
|
||||
result.u32_array[0] = src[0];
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline void pxl8_simd_store_u8(u8* dest, pxl8_simd_vec vec) {
|
||||
#if defined(PXL8_SIMD_AVX2)
|
||||
_mm256_storeu_si256((__m256i*)dest, vec.avx2);
|
||||
#elif defined(PXL8_SIMD_SSE2)
|
||||
_mm_storeu_si128((__m128i*)dest, vec.sse2);
|
||||
#elif defined(PXL8_SIMD_NEON)
|
||||
vst1q_u8(dest, vec.neon_u8);
|
||||
#else
|
||||
dest[0] = vec.u8_array[0];
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void pxl8_simd_store_u32(u32* dest, pxl8_simd_vec vec) {
|
||||
#if defined(PXL8_SIMD_AVX2)
|
||||
_mm256_storeu_si256((__m256i*)dest, vec.avx2);
|
||||
#elif defined(PXL8_SIMD_SSE2)
|
||||
_mm_storeu_si128((__m128i*)dest, vec.sse2);
|
||||
#elif defined(PXL8_SIMD_NEON)
|
||||
vst1q_u32(dest, vec.neon_u32);
|
||||
#else
|
||||
dest[0] = vec.u32_array[0];
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline pxl8_simd_vec pxl8_simd_zero_u8(void) {
|
||||
pxl8_simd_vec result;
|
||||
#if defined(PXL8_SIMD_AVX2)
|
||||
result.avx2 = _mm256_setzero_si256();
|
||||
#elif defined(PXL8_SIMD_SSE2)
|
||||
result.sse2 = _mm_setzero_si128();
|
||||
#elif defined(PXL8_SIMD_NEON)
|
||||
result.neon_u8 = vdupq_n_u8(0);
|
||||
#else
|
||||
result.u8_array[0] = 0;
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline pxl8_simd_vec pxl8_simd_alpha_mask_u32(void) {
|
||||
pxl8_simd_vec result;
|
||||
#if defined(PXL8_SIMD_AVX2)
|
||||
result.avx2 = _mm256_set1_epi32(0xFF000000);
|
||||
#elif defined(PXL8_SIMD_SSE2)
|
||||
result.sse2 = _mm_set1_epi32(0xFF000000);
|
||||
#elif defined(PXL8_SIMD_NEON)
|
||||
result.neon_u32 = vdupq_n_u32(0xFF000000);
|
||||
#else
|
||||
result.u32_array[0] = 0xFF000000;
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline pxl8_simd_vec pxl8_simd_and(pxl8_simd_vec a, pxl8_simd_vec b) {
|
||||
pxl8_simd_vec result;
|
||||
#if defined(PXL8_SIMD_AVX2)
|
||||
result.avx2 = _mm256_and_si256(a.avx2, b.avx2);
|
||||
#elif defined(PXL8_SIMD_SSE2)
|
||||
result.sse2 = _mm_and_si128(a.sse2, b.sse2);
|
||||
#elif defined(PXL8_SIMD_NEON)
|
||||
result.neon_u8 = vandq_u8(a.neon_u8, b.neon_u8);
|
||||
#else
|
||||
result.u8_array[0] = a.u8_array[0] & b.u8_array[0];
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline pxl8_simd_vec pxl8_simd_cmpeq_u8(pxl8_simd_vec a, pxl8_simd_vec b) {
|
||||
pxl8_simd_vec result;
|
||||
#if defined(PXL8_SIMD_AVX2)
|
||||
result.avx2 = _mm256_cmpeq_epi8(a.avx2, b.avx2);
|
||||
#elif defined(PXL8_SIMD_SSE2)
|
||||
result.sse2 = _mm_cmpeq_epi8(a.sse2, b.sse2);
|
||||
#elif defined(PXL8_SIMD_NEON)
|
||||
result.neon_u8 = vceqq_u8(a.neon_u8, b.neon_u8);
|
||||
#else
|
||||
result.u8_array[0] = (a.u8_array[0] == b.u8_array[0]) ? 0xFF : 0x00;
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline pxl8_simd_vec pxl8_simd_cmpeq_u32(pxl8_simd_vec a, pxl8_simd_vec b) {
|
||||
pxl8_simd_vec result;
|
||||
#if defined(PXL8_SIMD_AVX2)
|
||||
result.avx2 = _mm256_cmpeq_epi32(a.avx2, b.avx2);
|
||||
#elif defined(PXL8_SIMD_SSE2)
|
||||
result.sse2 = _mm_cmpeq_epi32(a.sse2, b.sse2);
|
||||
#elif defined(PXL8_SIMD_NEON)
|
||||
result.neon_u32 = vceqq_u32(a.neon_u32, b.neon_u32);
|
||||
#else
|
||||
result.u32_array[0] = (a.u32_array[0] == b.u32_array[0]) ? 0xFFFFFFFF : 0x00000000;
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline pxl8_simd_vec pxl8_simd_blendv_u8(pxl8_simd_vec src, pxl8_simd_vec dest, pxl8_simd_vec mask) {
|
||||
pxl8_simd_vec result;
|
||||
#if defined(PXL8_SIMD_AVX2)
|
||||
result.avx2 = _mm256_blendv_epi8(src.avx2, dest.avx2, mask.avx2);
|
||||
#elif defined(PXL8_SIMD_SSE2)
|
||||
pxl8_simd_vec not_mask; not_mask.sse2 = _mm_xor_si128(mask.sse2, _mm_set1_epi8(-1));
|
||||
result.sse2 = _mm_or_si128(_mm_and_si128(mask.sse2, dest.sse2), _mm_and_si128(not_mask.sse2, src.sse2));
|
||||
#elif defined(PXL8_SIMD_NEON)
|
||||
result.neon_u8 = vbslq_u8(mask.neon_u8, dest.neon_u8, src.neon_u8);
|
||||
#else
|
||||
result.u8_array[0] = mask.u8_array[0] ? dest.u8_array[0] : src.u8_array[0];
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline pxl8_simd_vec pxl8_simd_blendv_u32(pxl8_simd_vec src, pxl8_simd_vec dest, pxl8_simd_vec mask) {
|
||||
pxl8_simd_vec result;
|
||||
#if defined(PXL8_SIMD_AVX2)
|
||||
result.avx2 = _mm256_blendv_epi8(src.avx2, dest.avx2, mask.avx2);
|
||||
#elif defined(PXL8_SIMD_SSE2)
|
||||
pxl8_simd_vec not_mask; not_mask.sse2 = _mm_xor_si128(mask.sse2, _mm_set1_epi32(-1));
|
||||
result.sse2 = _mm_or_si128(_mm_and_si128(mask.sse2, dest.sse2), _mm_and_si128(not_mask.sse2, src.sse2));
|
||||
#elif defined(PXL8_SIMD_NEON)
|
||||
result.neon_u32 = vbslq_u32(mask.neon_u32, dest.neon_u32, src.neon_u32);
|
||||
#else
|
||||
result.u32_array[0] = mask.u32_array[0] ? dest.u32_array[0] : src.u32_array[0];
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
typedef union {
|
||||
#if defined(PXL8_SIMD_AVX2)
|
||||
__m256 avx2;
|
||||
__m128 sse;
|
||||
#elif defined(PXL8_SIMD_SSE2)
|
||||
__m128 sse;
|
||||
#elif defined(PXL8_SIMD_NEON)
|
||||
float32x4_t neon;
|
||||
#endif
|
||||
f32 f32_array[8];
|
||||
} pxl8_simd_vec_f32;
|
||||
|
||||
static inline pxl8_simd_vec_f32 pxl8_simd_set_f32(f32 x, f32 y, f32 z, f32 w) {
|
||||
pxl8_simd_vec_f32 result;
|
||||
#if defined(PXL8_SIMD_AVX2)
|
||||
result.avx2 = _mm256_set_ps(0, 0, 0, 0, w, z, y, x);
|
||||
#elif defined(PXL8_SIMD_SSE2)
|
||||
result.sse = _mm_set_ps(w, z, y, x);
|
||||
#elif defined(PXL8_SIMD_NEON)
|
||||
f32 data[4] = {x, y, z, w};
|
||||
result.neon = vld1q_f32(data);
|
||||
#else
|
||||
result.f32_array[0] = x;
|
||||
result.f32_array[1] = y;
|
||||
result.f32_array[2] = z;
|
||||
result.f32_array[3] = w;
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline pxl8_simd_vec_f32 pxl8_simd_set1_f32(f32 value) {
|
||||
pxl8_simd_vec_f32 result;
|
||||
#if defined(PXL8_SIMD_AVX2)
|
||||
result.avx2 = _mm256_set1_ps(value);
|
||||
#elif defined(PXL8_SIMD_SSE2)
|
||||
result.sse = _mm_set1_ps(value);
|
||||
#elif defined(PXL8_SIMD_NEON)
|
||||
result.neon = vdupq_n_f32(value);
|
||||
#else
|
||||
result.f32_array[0] = value;
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline void pxl8_simd_store_f32(f32* dest, pxl8_simd_vec_f32 vec) {
|
||||
#if defined(PXL8_SIMD_AVX2)
|
||||
_mm256_storeu_ps(dest, vec.avx2);
|
||||
#elif defined(PXL8_SIMD_SSE2)
|
||||
_mm_storeu_ps(dest, vec.sse);
|
||||
#elif defined(PXL8_SIMD_NEON)
|
||||
vst1q_f32(dest, vec.neon);
|
||||
#else
|
||||
dest[0] = vec.f32_array[0];
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline pxl8_simd_vec_f32 pxl8_simd_add_f32(pxl8_simd_vec_f32 a, pxl8_simd_vec_f32 b) {
|
||||
pxl8_simd_vec_f32 result;
|
||||
#if defined(PXL8_SIMD_AVX2)
|
||||
result.avx2 = _mm256_add_ps(a.avx2, b.avx2);
|
||||
#elif defined(PXL8_SIMD_SSE2)
|
||||
result.sse = _mm_add_ps(a.sse, b.sse);
|
||||
#elif defined(PXL8_SIMD_NEON)
|
||||
result.neon = vaddq_f32(a.neon, b.neon);
|
||||
#else
|
||||
for (i32 i = 0; i < 4; i++) result.f32_array[i] = a.f32_array[i] + b.f32_array[i];
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline pxl8_simd_vec_f32 pxl8_simd_sub_f32(pxl8_simd_vec_f32 a, pxl8_simd_vec_f32 b) {
|
||||
pxl8_simd_vec_f32 result;
|
||||
#if defined(PXL8_SIMD_AVX2)
|
||||
result.avx2 = _mm256_sub_ps(a.avx2, b.avx2);
|
||||
#elif defined(PXL8_SIMD_SSE2)
|
||||
result.sse = _mm_sub_ps(a.sse, b.sse);
|
||||
#elif defined(PXL8_SIMD_NEON)
|
||||
result.neon = vsubq_f32(a.neon, b.neon);
|
||||
#else
|
||||
for (i32 i = 0; i < 4; i++) result.f32_array[i] = a.f32_array[i] - b.f32_array[i];
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline pxl8_simd_vec_f32 pxl8_simd_mul_f32(pxl8_simd_vec_f32 a, pxl8_simd_vec_f32 b) {
|
||||
pxl8_simd_vec_f32 result;
|
||||
#if defined(PXL8_SIMD_AVX2)
|
||||
result.avx2 = _mm256_mul_ps(a.avx2, b.avx2);
|
||||
#elif defined(PXL8_SIMD_SSE2)
|
||||
result.sse = _mm_mul_ps(a.sse, b.sse);
|
||||
#elif defined(PXL8_SIMD_NEON)
|
||||
result.neon = vmulq_f32(a.neon, b.neon);
|
||||
#else
|
||||
for (i32 i = 0; i < 4; i++) result.f32_array[i] = a.f32_array[i] * b.f32_array[i];
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline pxl8_simd_vec_f32 pxl8_simd_scale_f32(pxl8_simd_vec_f32 v, f32 s) {
|
||||
pxl8_simd_vec_f32 result;
|
||||
#if defined(PXL8_SIMD_AVX2)
|
||||
result.avx2 = _mm256_mul_ps(v.avx2, _mm256_set1_ps(s));
|
||||
#elif defined(PXL8_SIMD_SSE2)
|
||||
result.sse = _mm_mul_ps(v.sse, _mm_set1_ps(s));
|
||||
#elif defined(PXL8_SIMD_NEON)
|
||||
result.neon = vmulq_n_f32(v.neon, s);
|
||||
#else
|
||||
for (i32 i = 0; i < 4; i++) result.f32_array[i] = v.f32_array[i] * s;
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline f32 pxl8_simd_dot3_f32(pxl8_simd_vec_f32 a, pxl8_simd_vec_f32 b) {
|
||||
#if defined(PXL8_SIMD_AVX2)
|
||||
__m128 a_low = _mm256_castps256_ps128(a.avx2);
|
||||
__m128 b_low = _mm256_castps256_ps128(b.avx2);
|
||||
__m128 mul = _mm_mul_ps(a_low, b_low);
|
||||
__m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 1, 0, 3));
|
||||
__m128 sums = _mm_add_ps(mul, shuf);
|
||||
shuf = _mm_movehl_ps(shuf, sums);
|
||||
sums = _mm_add_ss(sums, shuf);
|
||||
return _mm_cvtss_f32(sums);
|
||||
#elif defined(PXL8_SIMD_SSE2)
|
||||
__m128 mul = _mm_mul_ps(a.sse, b.sse);
|
||||
__m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 1, 0, 3));
|
||||
__m128 sums = _mm_add_ps(mul, shuf);
|
||||
shuf = _mm_movehl_ps(shuf, sums);
|
||||
sums = _mm_add_ss(sums, shuf);
|
||||
return _mm_cvtss_f32(sums);
|
||||
#elif defined(PXL8_SIMD_NEON)
|
||||
float32x4_t mul = vmulq_f32(a.neon, b.neon);
|
||||
float32x2_t sum = vpadd_f32(vget_low_f32(mul), vget_high_f32(mul));
|
||||
sum = vpadd_f32(sum, sum);
|
||||
return vget_lane_f32(sum, 0);
|
||||
#else
|
||||
return a.f32_array[0] * b.f32_array[0] +
|
||||
a.f32_array[1] * b.f32_array[1] +
|
||||
a.f32_array[2] * b.f32_array[2];
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline f32 pxl8_simd_dot4_f32(pxl8_simd_vec_f32 a, pxl8_simd_vec_f32 b) {
|
||||
#if defined(PXL8_SIMD_AVX2)
|
||||
__m128 a_low = _mm256_castps256_ps128(a.avx2);
|
||||
__m128 b_low = _mm256_castps256_ps128(b.avx2);
|
||||
__m128 mul = _mm_mul_ps(a_low, b_low);
|
||||
__m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
__m128 sums = _mm_add_ps(mul, shuf);
|
||||
shuf = _mm_movehl_ps(shuf, sums);
|
||||
sums = _mm_add_ss(sums, shuf);
|
||||
return _mm_cvtss_f32(sums);
|
||||
#elif defined(PXL8_SIMD_SSE2)
|
||||
__m128 mul = _mm_mul_ps(a.sse, b.sse);
|
||||
__m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
__m128 sums = _mm_add_ps(mul, shuf);
|
||||
shuf = _mm_movehl_ps(shuf, sums);
|
||||
sums = _mm_add_ss(sums, shuf);
|
||||
return _mm_cvtss_f32(sums);
|
||||
#elif defined(PXL8_SIMD_NEON)
|
||||
float32x4_t mul = vmulq_f32(a.neon, b.neon);
|
||||
float32x2_t sum = vpadd_f32(vget_low_f32(mul), vget_high_f32(mul));
|
||||
sum = vpadd_f32(sum, sum);
|
||||
return vget_lane_f32(sum, 0);
|
||||
#else
|
||||
return a.f32_array[0] * b.f32_array[0] +
|
||||
a.f32_array[1] * b.f32_array[1] +
|
||||
a.f32_array[2] * b.f32_array[2] +
|
||||
a.f32_array[3] * b.f32_array[3];
|
||||
#endif
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue