#pragma once

#include "pxl8_types.h"

#if defined(__x86_64__) || defined(_M_X64)
  #define PXL8_SIMD_SSE2 1
  #include <emmintrin.h>
#elif defined(__aarch64__) || defined(_M_ARM64)
  #define PXL8_SIMD_NEON 1
  #include <arm_neon.h>
#else
  #define PXL8_SIMD_SCALAR 1
#endif

#ifdef __cplusplus
extern "C" {
#endif

#if defined(PXL8_SIMD_SSE2)

typedef struct { __m128  v; } pxl8_f32x4;
typedef struct { __m128i v; } pxl8_i32x4;
typedef struct { __m128i v; } pxl8_u16x8;

static inline pxl8_f32x4 pxl8_f32x4_splat(f32 x) { return (pxl8_f32x4){ _mm_set1_ps(x) }; }
/* _mm_set_ps takes its arguments high-to-low, hence the reversed order. */
static inline pxl8_f32x4 pxl8_f32x4_new(f32 a, f32 b, f32 c, f32 d) { return (pxl8_f32x4){ _mm_set_ps(d, c, b, a) }; }
static inline pxl8_f32x4 pxl8_f32x4_add(pxl8_f32x4 a, pxl8_f32x4 b) { return (pxl8_f32x4){ _mm_add_ps(a.v, b.v) }; }
static inline pxl8_f32x4 pxl8_f32x4_sub(pxl8_f32x4 a, pxl8_f32x4 b) { return (pxl8_f32x4){ _mm_sub_ps(a.v, b.v) }; }
static inline pxl8_f32x4 pxl8_f32x4_mul(pxl8_f32x4 a, pxl8_f32x4 b) { return (pxl8_f32x4){ _mm_mul_ps(a.v, b.v) }; }
static inline pxl8_f32x4 pxl8_f32x4_div(pxl8_f32x4 a, pxl8_f32x4 b) { return (pxl8_f32x4){ _mm_div_ps(a.v, b.v) }; }
static inline pxl8_f32x4 pxl8_f32x4_min(pxl8_f32x4 a, pxl8_f32x4 b) { return (pxl8_f32x4){ _mm_min_ps(a.v, b.v) }; }
static inline pxl8_f32x4 pxl8_f32x4_max(pxl8_f32x4 a, pxl8_f32x4 b) { return (pxl8_f32x4){ _mm_max_ps(a.v, b.v) }; }
static inline pxl8_f32x4 pxl8_f32x4_cmplt(pxl8_f32x4 a, pxl8_f32x4 b) { return (pxl8_f32x4){ _mm_cmplt_ps(a.v, b.v) }; }
static inline i32 pxl8_f32x4_movemask(pxl8_f32x4 a) { return _mm_movemask_ps(a.v); }
/* Truncating float -> int conversion (rounds toward zero). */
static inline pxl8_i32x4 pxl8_f32x4_to_i32x4(pxl8_f32x4 a) { return (pxl8_i32x4){ _mm_cvttps_epi32(a.v) }; }
static inline void pxl8_f32x4_store(pxl8_f32x4 a, f32* out) { _mm_storeu_ps(out, a.v); }

static inline pxl8_i32x4 pxl8_i32x4_splat(i32 x) { return (pxl8_i32x4){ _mm_set1_epi32(x) }; }
static inline pxl8_i32x4 pxl8_i32x4_slli(pxl8_i32x4 a, i32 n) { return (pxl8_i32x4){ _mm_slli_epi32(a.v, n) }; }
static inline pxl8_i32x4 pxl8_i32x4_srai(pxl8_i32x4 a, i32 n) { return (pxl8_i32x4){ _mm_srai_epi32(a.v, n) }; }
static inline pxl8_i32x4 pxl8_i32x4_and(pxl8_i32x4 a, pxl8_i32x4 b) { return (pxl8_i32x4){ _mm_and_si128(a.v, b.v) }; }
static inline pxl8_i32x4 pxl8_i32x4_or(pxl8_i32x4 a, pxl8_i32x4 b) { return (pxl8_i32x4){ _mm_or_si128(a.v, b.v) }; }
static inline void pxl8_i32x4_store(pxl8_i32x4 a, i32* out) { _mm_storeu_si128((__m128i*)out, a.v); }

/* SSE2 has no unsigned 16-bit compare; _mm_cmplt_epi16 alone would misorder
   lanes >= 0x8000. Biasing both operands by 0x8000 makes the signed compare
   produce the unsigned ordering. */
static inline pxl8_u16x8 pxl8_u16x8_cmplt(pxl8_u16x8 a, pxl8_u16x8 b) {
    const __m128i bias = _mm_set1_epi16((short)0x8000);
    return (pxl8_u16x8){ _mm_cmplt_epi16(_mm_xor_si128(a.v, bias), _mm_xor_si128(b.v, bias)) };
}
/* Bitwise select: lanes of b where mask bits are set, lanes of a elsewhere. */
static inline pxl8_u16x8 pxl8_u16x8_blend(pxl8_u16x8 a, pxl8_u16x8 b, pxl8_u16x8 mask) {
    __m128i not_mask = _mm_andnot_si128(mask.v, a.v);
    __m128i and_mask = _mm_and_si128(mask.v, b.v);
    return (pxl8_u16x8){ _mm_or_si128(not_mask, and_mask) };
}
/* One bit per byte (16 bits total), i.e. two identical bits per u16 lane
   when the lanes are all-ones/all-zeros compare results. */
static inline i32 pxl8_u16x8_movemask(pxl8_u16x8 a) { return _mm_movemask_epi8(a.v); }

#elif defined(PXL8_SIMD_NEON)

typedef struct { float32x4_t v; } pxl8_f32x4;
typedef struct { int32x4_t   v; } pxl8_i32x4;
typedef struct { uint16x8_t  v; } pxl8_u16x8;

static inline pxl8_f32x4 pxl8_f32x4_splat(f32 x) { return (pxl8_f32x4){ vdupq_n_f32(x) }; }
static inline pxl8_f32x4 pxl8_f32x4_new(f32 a, f32 b, f32 c, f32 d) {
    f32 arr[4] = { a, b, c, d };
    return (pxl8_f32x4){ vld1q_f32(arr) };
}
static inline pxl8_f32x4 pxl8_f32x4_add(pxl8_f32x4 a, pxl8_f32x4 b) { return (pxl8_f32x4){ vaddq_f32(a.v, b.v) }; }
static inline pxl8_f32x4 pxl8_f32x4_sub(pxl8_f32x4 a, pxl8_f32x4 b) { return (pxl8_f32x4){ vsubq_f32(a.v, b.v) }; }
static inline pxl8_f32x4 pxl8_f32x4_mul(pxl8_f32x4 a, pxl8_f32x4 b) { return (pxl8_f32x4){ vmulq_f32(a.v, b.v) }; }
static inline pxl8_f32x4 pxl8_f32x4_div(pxl8_f32x4 a, pxl8_f32x4 b) { return (pxl8_f32x4){ vdivq_f32(a.v, b.v) }; }
static inline pxl8_f32x4 pxl8_f32x4_min(pxl8_f32x4 a, pxl8_f32x4 b) { return (pxl8_f32x4){ vminq_f32(a.v, b.v) }; }
static inline pxl8_f32x4 pxl8_f32x4_max(pxl8_f32x4 a, pxl8_f32x4 b) { return (pxl8_f32x4){ vmaxq_f32(a.v, b.v) }; }
static inline pxl8_f32x4 pxl8_f32x4_cmplt(pxl8_f32x4 a, pxl8_f32x4 b) {
    uint32x4_t cmp = vcltq_f32(a.v, b.v);
    return (pxl8_f32x4){ vreinterpretq_f32_u32(cmp) };
}
/* Collect the sign bit of each lane into bits 0..3, matching _mm_movemask_ps. */
static inline i32 pxl8_f32x4_movemask(pxl8_f32x4 a) {
    uint32x4_t shifted = vshrq_n_u32(vreinterpretq_u32_f32(a.v), 31);
    return (i32)(vgetq_lane_u32(shifted, 0)
               | (vgetq_lane_u32(shifted, 1) << 1)
               | (vgetq_lane_u32(shifted, 2) << 2)
               | (vgetq_lane_u32(shifted, 3) << 3));
}
/* vcvtq_s32_f32 truncates toward zero, matching _mm_cvttps_epi32. */
static inline pxl8_i32x4 pxl8_f32x4_to_i32x4(pxl8_f32x4 a) { return (pxl8_i32x4){ vcvtq_s32_f32(a.v) }; }
static inline void pxl8_f32x4_store(pxl8_f32x4 a, f32* out) { vst1q_f32(out, a.v); }

static inline pxl8_i32x4 pxl8_i32x4_splat(i32 x) { return (pxl8_i32x4){ vdupq_n_s32(x) }; }
static inline pxl8_i32x4 pxl8_i32x4_slli(pxl8_i32x4 a, i32 n) { return (pxl8_i32x4){ vshlq_s32(a.v, vdupq_n_s32(n)) }; }
/* NEON has no variable right-shift instruction; a signed vshl by a negative
   count performs an arithmetic right shift. */
static inline pxl8_i32x4 pxl8_i32x4_srai(pxl8_i32x4 a, i32 n) { return (pxl8_i32x4){ vshlq_s32(a.v, vdupq_n_s32(-n)) }; }
static inline pxl8_i32x4 pxl8_i32x4_and(pxl8_i32x4 a, pxl8_i32x4 b) { return (pxl8_i32x4){ vandq_s32(a.v, b.v) }; }
static inline pxl8_i32x4 pxl8_i32x4_or(pxl8_i32x4 a, pxl8_i32x4 b) { return (pxl8_i32x4){ vorrq_s32(a.v, b.v) }; }
static inline void pxl8_i32x4_store(pxl8_i32x4 a, i32* out) { vst1q_s32(out, a.v); }
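
/* The SSE2 branch exposes pxl8_u16x8_cmplt/_blend/_movemask, but the NEON
 * branch stops at the i32x4 ops, so u16x8 code cannot build here. A minimal
 * sketch of the missing trio follows, assuming the same semantics as the SSE2
 * versions (all-ones lanes for true; movemask yields one bit per byte, as
 * _mm_movemask_epi8 does). */
static inline pxl8_u16x8 pxl8_u16x8_cmplt(pxl8_u16x8 a, pxl8_u16x8 b) { return (pxl8_u16x8){ vcltq_u16(a.v, b.v) }; }
static inline pxl8_u16x8 pxl8_u16x8_blend(pxl8_u16x8 a, pxl8_u16x8 b, pxl8_u16x8 mask) {
    /* vbslq_u16 takes bits from b where mask is set, from a elsewhere. */
    return (pxl8_u16x8){ vbslq_u16(mask.v, b.v, a.v) };
}
static inline i32 pxl8_u16x8_movemask(pxl8_u16x8 a) {
    /* Move each byte's MSB down to bit 0, shift it to its byte index within
       each 64-bit half, then horizontally add the halves (AArch64 vaddv). */
    static const int8_t shifts[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 };
    uint8x16_t msb = vshrq_n_u8(vreinterpretq_u8_u16(a.v), 7);
    uint8x16_t sh  = vshlq_u8(msb, vld1q_s8(shifts));
    return (i32)vaddv_u8(vget_low_u8(sh)) | ((i32)vaddv_u8(vget_high_u8(sh)) << 8);
}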
#else /* PXL8_SIMD_SCALAR */

typedef struct { f32 v[4]; } pxl8_f32x4;
typedef struct { i32 v[4]; } pxl8_i32x4;
typedef struct { u16 v[8]; } pxl8_u16x8;

static inline pxl8_f32x4 pxl8_f32x4_splat(f32 x) { return (pxl8_f32x4){{ x, x, x, x }}; }
static inline pxl8_f32x4 pxl8_f32x4_new(f32 a, f32 b, f32 c, f32 d) { return (pxl8_f32x4){{ a, b, c, d }}; }
static inline pxl8_f32x4 pxl8_f32x4_add(pxl8_f32x4 a, pxl8_f32x4 b) { return (pxl8_f32x4){{ a.v[0]+b.v[0], a.v[1]+b.v[1], a.v[2]+b.v[2], a.v[3]+b.v[3] }}; }
static inline pxl8_f32x4 pxl8_f32x4_sub(pxl8_f32x4 a, pxl8_f32x4 b) { return (pxl8_f32x4){{ a.v[0]-b.v[0], a.v[1]-b.v[1], a.v[2]-b.v[2], a.v[3]-b.v[3] }}; }
static inline pxl8_f32x4 pxl8_f32x4_mul(pxl8_f32x4 a, pxl8_f32x4 b) { return (pxl8_f32x4){{ a.v[0]*b.v[0], a.v[1]*b.v[1], a.v[2]*b.v[2], a.v[3]*b.v[3] }}; }
static inline pxl8_f32x4 pxl8_f32x4_div(pxl8_f32x4 a, pxl8_f32x4 b) { return (pxl8_f32x4){{ a.v[0]/b.v[0], a.v[1]/b.v[1], a.v[2]/b.v[2], a.v[3]/b.v[3] }}; }
static inline pxl8_f32x4 pxl8_f32x4_min(pxl8_f32x4 a, pxl8_f32x4 b) { return (pxl8_f32x4){{ a.v[0]<b.v[0]?a.v[0]:b.v[0], a.v[1]<b.v[1]?a.v[1]:b.v[1], a.v[2]<b.v[2]?a.v[2]:b.v[2], a.v[3]<b.v[3]?a.v[3]:b.v[3] }}; }
static inline pxl8_f32x4 pxl8_f32x4_max(pxl8_f32x4 a, pxl8_f32x4 b) { return (pxl8_f32x4){{ a.v[0]>b.v[0]?a.v[0]:b.v[0], a.v[1]>b.v[1]?a.v[1]:b.v[1], a.v[2]>b.v[2]?a.v[2]:b.v[2], a.v[3]>b.v[3]?a.v[3]:b.v[3] }}; }
/* Compare results are all-ones/all-zeros bit patterns; go through a union so
   the u32 patterns land in the f32 lanes without aliasing violations. */
static inline pxl8_f32x4 pxl8_f32x4_cmplt(pxl8_f32x4 a, pxl8_f32x4 b) {
    union { pxl8_f32x4 f; u32 u[4]; } r;
    for (int i = 0; i < 4; ++i) r.u[i] = a.v[i] < b.v[i] ? 0xFFFFFFFFu : 0u;
    return r.f;
}
static inline i32 pxl8_f32x4_movemask(pxl8_f32x4 a) {
    union { pxl8_f32x4 f; u32 u[4]; } t;
    t.f = a;
    return (i32)(((t.u[0] >> 31) & 1u)
               | ((t.u[1] >> 31) & 1u) << 1
               | ((t.u[2] >> 31) & 1u) << 2
               | ((t.u[3] >> 31) & 1u) << 3);
}
static inline pxl8_i32x4 pxl8_f32x4_to_i32x4(pxl8_f32x4 a) { return (pxl8_i32x4){{ (i32)a.v[0], (i32)a.v[1], (i32)a.v[2], (i32)a.v[3] }}; }
static inline void pxl8_f32x4_store(pxl8_f32x4 a, f32* out) { out[0] = a.v[0]; out[1] = a.v[1]; out[2] = a.v[2]; out[3] = a.v[3]; }

static inline pxl8_i32x4 pxl8_i32x4_splat(i32 x) { return (pxl8_i32x4){{ x, x, x, x }}; }
static inline pxl8_i32x4 pxl8_i32x4_slli(pxl8_i32x4 a, i32 n) { return (pxl8_i32x4){{ a.v[0]<<n, a.v[1]<<n, a.v[2]<<n, a.v[3]<<n }}; }
/* >> on a negative i32 is an arithmetic shift on the compilers this targets. */
static inline pxl8_i32x4 pxl8_i32x4_srai(pxl8_i32x4 a, i32 n) { return (pxl8_i32x4){{ a.v[0]>>n, a.v[1]>>n, a.v[2]>>n, a.v[3]>>n }}; }
static inline pxl8_i32x4 pxl8_i32x4_and(pxl8_i32x4 a, pxl8_i32x4 b) { return (pxl8_i32x4){{ a.v[0]&b.v[0], a.v[1]&b.v[1], a.v[2]&b.v[2], a.v[3]&b.v[3] }}; }
static inline pxl8_i32x4 pxl8_i32x4_or(pxl8_i32x4 a, pxl8_i32x4 b) { return (pxl8_i32x4){{ a.v[0]|b.v[0], a.v[1]|b.v[1], a.v[2]|b.v[2], a.v[3]|b.v[3] }}; }
static inline void pxl8_i32x4_store(pxl8_i32x4 a, i32* out) { out[0] = a.v[0]; out[1] = a.v[1]; out[2] = a.v[2]; out[3] = a.v[3]; }
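
/* Scalar sketches of the u16x8 ops, added for parity with the SSE2 branch
 * (assumed semantics: all-ones lanes for true; movemask emits one bit per
 * byte as _mm_movemask_epi8 does, assuming little-endian lane storage). */
static inline pxl8_u16x8 pxl8_u16x8_cmplt(pxl8_u16x8 a, pxl8_u16x8 b) {
    pxl8_u16x8 r;
    for (int i = 0; i < 8; ++i) r.v[i] = (u16)(a.v[i] < b.v[i] ? 0xFFFF : 0);
    return r;
}
static inline pxl8_u16x8 pxl8_u16x8_blend(pxl8_u16x8 a, pxl8_u16x8 b, pxl8_u16x8 mask) {
    pxl8_u16x8 r;
    for (int i = 0; i < 8; ++i) r.v[i] = (u16)((mask.v[i] & b.v[i]) | (~mask.v[i] & a.v[i]));
    return r;
}
static inline i32 pxl8_u16x8_movemask(pxl8_u16x8 a) {
    i32 m = 0;
    for (int i = 0; i < 8; ++i) {
        m |= ((a.v[i] >> 7)  & 1) << (2 * i);      /* MSB of the low byte  */
        m |= ((a.v[i] >> 15) & 1) << (2 * i + 1);  /* MSB of the high byte */
    }
    return m;
}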
#endif

#ifdef __cplusplus
} /* extern "C" */
#endif
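
/* Usage sketch (illustrative only; lerp4 is a hypothetical helper, not part
 * of the API): a four-lane linear interpolation written against the portable
 * wrappers, so it compiles unchanged on SSE2, NEON, and the scalar fallback.
 *
 *   static inline void lerp4(const f32* a, const f32* b, f32 t, f32* out) {
 *       pxl8_f32x4 va = pxl8_f32x4_new(a[0], a[1], a[2], a[3]);
 *       pxl8_f32x4 vb = pxl8_f32x4_new(b[0], b[1], b[2], b[3]);
 *       pxl8_f32x4 vt = pxl8_f32x4_splat(t);
 *       pxl8_f32x4 r  = pxl8_f32x4_add(va, pxl8_f32x4_mul(pxl8_f32x4_sub(vb, va), vt));
 *       pxl8_f32x4_store(r, out);
 *   }
 */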