#pragma once
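/* Portable SIMD wrappers for pxl8: 4-wide f32, 4-wide i32, and 8-wide u16 lane
 * types behind a common interface, with SSE2, NEON, and scalar fallback paths
 * selected at compile time.
 *
 * Illustrative usage (clamp four floats to [0, 255], then truncate to i32):
 *
 *   pxl8_f32x4 v   = pxl8_f32x4_new(-1.0f, 10.5f, 300.0f, 128.0f);
 *   pxl8_f32x4 lo  = pxl8_f32x4_splat(0.0f);
 *   pxl8_f32x4 hi  = pxl8_f32x4_splat(255.0f);
 *   pxl8_i32x4 out = pxl8_f32x4_to_i32x4(pxl8_f32x4_min(pxl8_f32x4_max(v, lo), hi));
 */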
#include "pxl8_types.h"

#if defined(__x86_64__) || defined(_M_X64)
#define PXL8_SIMD_SSE2 1
#include <emmintrin.h>
#elif defined(__aarch64__) || defined(_M_ARM64)
#define PXL8_SIMD_NEON 1
#include <arm_neon.h>
#else
#define PXL8_SIMD_SCALAR 1
#endif

#ifdef __cplusplus
extern "C" {
#endif

#if defined(PXL8_SIMD_SSE2)
typedef struct { __m128 v; } pxl8_f32x4;
typedef struct { __m128i v; } pxl8_i32x4;
typedef struct { __m128i v; } pxl8_u16x8;

static inline pxl8_f32x4 pxl8_f32x4_splat(f32 x) {
    return (pxl8_f32x4){ _mm_set1_ps(x) };
}

/* _mm_set_ps lists arguments from the highest lane down, so pass them reversed
   to keep `a` in lane 0. */
static inline pxl8_f32x4 pxl8_f32x4_new(f32 a, f32 b, f32 c, f32 d) {
    return (pxl8_f32x4){ _mm_set_ps(d, c, b, a) };
}

static inline pxl8_f32x4 pxl8_f32x4_add(pxl8_f32x4 a, pxl8_f32x4 b) {
    return (pxl8_f32x4){ _mm_add_ps(a.v, b.v) };
}

static inline pxl8_f32x4 pxl8_f32x4_sub(pxl8_f32x4 a, pxl8_f32x4 b) {
    return (pxl8_f32x4){ _mm_sub_ps(a.v, b.v) };
}

static inline pxl8_f32x4 pxl8_f32x4_mul(pxl8_f32x4 a, pxl8_f32x4 b) {
    return (pxl8_f32x4){ _mm_mul_ps(a.v, b.v) };
}

static inline pxl8_f32x4 pxl8_f32x4_div(pxl8_f32x4 a, pxl8_f32x4 b) {
    return (pxl8_f32x4){ _mm_div_ps(a.v, b.v) };
}

static inline pxl8_f32x4 pxl8_f32x4_min(pxl8_f32x4 a, pxl8_f32x4 b) {
    return (pxl8_f32x4){ _mm_min_ps(a.v, b.v) };
}

static inline pxl8_f32x4 pxl8_f32x4_max(pxl8_f32x4 a, pxl8_f32x4 b) {
    return (pxl8_f32x4){ _mm_max_ps(a.v, b.v) };
}

static inline pxl8_f32x4 pxl8_f32x4_cmplt(pxl8_f32x4 a, pxl8_f32x4 b) {
    return (pxl8_f32x4){ _mm_cmplt_ps(a.v, b.v) };
}

static inline i32 pxl8_f32x4_movemask(pxl8_f32x4 a) {
    return _mm_movemask_ps(a.v);
}

/* Truncating (round-toward-zero) conversion. */
static inline pxl8_i32x4 pxl8_f32x4_to_i32x4(pxl8_f32x4 a) {
    return (pxl8_i32x4){ _mm_cvttps_epi32(a.v) };
}

static inline void pxl8_f32x4_store(pxl8_f32x4 a, f32* out) {
    _mm_storeu_ps(out, a.v);
}

static inline pxl8_i32x4 pxl8_i32x4_splat(i32 x) {
    return (pxl8_i32x4){ _mm_set1_epi32(x) };
}

static inline pxl8_i32x4 pxl8_i32x4_slli(pxl8_i32x4 a, i32 n) {
    return (pxl8_i32x4){ _mm_slli_epi32(a.v, n) };
}

static inline pxl8_i32x4 pxl8_i32x4_srai(pxl8_i32x4 a, i32 n) {
    return (pxl8_i32x4){ _mm_srai_epi32(a.v, n) };
}

static inline pxl8_i32x4 pxl8_i32x4_and(pxl8_i32x4 a, pxl8_i32x4 b) {
    return (pxl8_i32x4){ _mm_and_si128(a.v, b.v) };
}

static inline pxl8_i32x4 pxl8_i32x4_or(pxl8_i32x4 a, pxl8_i32x4 b) {
    return (pxl8_i32x4){ _mm_or_si128(a.v, b.v) };
}

static inline void pxl8_i32x4_store(pxl8_i32x4 a, i32* out) {
    _mm_storeu_si128((__m128i*)out, a.v);
}

/* SSE2 has no unsigned 16-bit compare; bias both operands by 0x8000 so the
   signed compare produces the correct unsigned ordering. */
static inline pxl8_u16x8 pxl8_u16x8_cmplt(pxl8_u16x8 a, pxl8_u16x8 b) {
    const __m128i bias = _mm_set1_epi16((short)0x8000);
    return (pxl8_u16x8){ _mm_cmplt_epi16(_mm_xor_si128(a.v, bias),
                                         _mm_xor_si128(b.v, bias)) };
}

/* Select lanes of b where the mask is all-ones, lanes of a where it is all-zeros. */
static inline pxl8_u16x8 pxl8_u16x8_blend(pxl8_u16x8 a, pxl8_u16x8 b, pxl8_u16x8 mask) {
    __m128i not_mask = _mm_andnot_si128(mask.v, a.v);
    __m128i and_mask = _mm_and_si128(mask.v, b.v);
    return (pxl8_u16x8){ _mm_or_si128(not_mask, and_mask) };
}

/* Byte-granularity movemask: each all-ones u16 lane contributes two adjacent bits. */
static inline i32 pxl8_u16x8_movemask(pxl8_u16x8 a) {
    return _mm_movemask_epi8(a.v);
}

#elif defined(PXL8_SIMD_NEON)
typedef struct { float32x4_t v; } pxl8_f32x4;
typedef struct { int32x4_t v; } pxl8_i32x4;
typedef struct { uint16x8_t v; } pxl8_u16x8;

static inline pxl8_f32x4 pxl8_f32x4_splat(f32 x) {
    return (pxl8_f32x4){ vdupq_n_f32(x) };
}

static inline pxl8_f32x4 pxl8_f32x4_new(f32 a, f32 b, f32 c, f32 d) {
    f32 arr[4] = {a, b, c, d};
    return (pxl8_f32x4){ vld1q_f32(arr) };
}

static inline pxl8_f32x4 pxl8_f32x4_add(pxl8_f32x4 a, pxl8_f32x4 b) {
    return (pxl8_f32x4){ vaddq_f32(a.v, b.v) };
}

static inline pxl8_f32x4 pxl8_f32x4_sub(pxl8_f32x4 a, pxl8_f32x4 b) {
    return (pxl8_f32x4){ vsubq_f32(a.v, b.v) };
}

static inline pxl8_f32x4 pxl8_f32x4_mul(pxl8_f32x4 a, pxl8_f32x4 b) {
    return (pxl8_f32x4){ vmulq_f32(a.v, b.v) };
}

static inline pxl8_f32x4 pxl8_f32x4_div(pxl8_f32x4 a, pxl8_f32x4 b) {
    return (pxl8_f32x4){ vdivq_f32(a.v, b.v) };
}

static inline pxl8_f32x4 pxl8_f32x4_min(pxl8_f32x4 a, pxl8_f32x4 b) {
    return (pxl8_f32x4){ vminq_f32(a.v, b.v) };
}

static inline pxl8_f32x4 pxl8_f32x4_max(pxl8_f32x4 a, pxl8_f32x4 b) {
    return (pxl8_f32x4){ vmaxq_f32(a.v, b.v) };
}

static inline pxl8_f32x4 pxl8_f32x4_cmplt(pxl8_f32x4 a, pxl8_f32x4 b) {
    uint32x4_t cmp = vcltq_f32(a.v, b.v);
    return (pxl8_f32x4){ vreinterpretq_f32_u32(cmp) };
}

static inline i32 pxl8_f32x4_movemask(pxl8_f32x4 a) {
    /* Collect each lane's sign bit into bits 0..3, matching _mm_movemask_ps. */
    uint32x4_t input = vreinterpretq_u32_f32(a.v);
    uint32x4_t shifted = vshrq_n_u32(input, 31);
    return vgetq_lane_u32(shifted, 0) | (vgetq_lane_u32(shifted, 1) << 1) |
           (vgetq_lane_u32(shifted, 2) << 2) | (vgetq_lane_u32(shifted, 3) << 3);
}

/* Truncating (round-toward-zero) conversion. */
static inline pxl8_i32x4 pxl8_f32x4_to_i32x4(pxl8_f32x4 a) {
    return (pxl8_i32x4){ vcvtq_s32_f32(a.v) };
}

static inline void pxl8_f32x4_store(pxl8_f32x4 a, f32* out) {
    vst1q_f32(out, a.v);
}

static inline pxl8_i32x4 pxl8_i32x4_splat(i32 x) {
    return (pxl8_i32x4){ vdupq_n_s32(x) };
}

static inline pxl8_i32x4 pxl8_i32x4_slli(pxl8_i32x4 a, i32 n) {
    return (pxl8_i32x4){ vshlq_s32(a.v, vdupq_n_s32(n)) };
}

/* NEON variable shifts are left shifts only; a negative count shifts right
   (arithmetic for signed element types). */
static inline pxl8_i32x4 pxl8_i32x4_srai(pxl8_i32x4 a, i32 n) {
    return (pxl8_i32x4){ vshlq_s32(a.v, vdupq_n_s32(-n)) };
}

static inline pxl8_i32x4 pxl8_i32x4_and(pxl8_i32x4 a, pxl8_i32x4 b) {
    return (pxl8_i32x4){ vandq_s32(a.v, b.v) };
}

static inline pxl8_i32x4 pxl8_i32x4_or(pxl8_i32x4 a, pxl8_i32x4 b) {
    return (pxl8_i32x4){ vorrq_s32(a.v, b.v) };
}

static inline void pxl8_i32x4_store(pxl8_i32x4 a, i32* out) {
    vst1q_s32(out, a.v);
}
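
/* The SSE2 path above also exposes u16x8 compare/blend/movemask helpers that
   have no counterparts in this branch. The versions below are a minimal sketch
   of NEON equivalents, assuming the same semantics: comparison results are
   all-ones/all-zeros per lane, and the movemask mirrors the SSE2 layout in
   which each set u16 lane contributes two adjacent bits. */

static inline pxl8_u16x8 pxl8_u16x8_cmplt(pxl8_u16x8 a, pxl8_u16x8 b) {
    return (pxl8_u16x8){ vcltq_u16(a.v, b.v) };
}

static inline pxl8_u16x8 pxl8_u16x8_blend(pxl8_u16x8 a, pxl8_u16x8 b, pxl8_u16x8 mask) {
    /* vbslq selects bits of b where the mask is set, bits of a otherwise. */
    return (pxl8_u16x8){ vbslq_u16(mask.v, b.v, a.v) };
}

static inline i32 pxl8_u16x8_movemask(pxl8_u16x8 a) {
    u16 msb[8];
    vst1q_u16(msb, vshrq_n_u16(a.v, 15));      /* 1 per lane whose top bit is set */
    i32 mask = 0;
    for (i32 i = 0; i < 8; ++i) {
        mask |= (i32)(msb[i] * 3) << (2 * i);  /* duplicate into two bits per lane */
    }
    return mask;
}
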
#else
typedef struct { f32 v[4]; } pxl8_f32x4;
typedef struct { i32 v[4]; } pxl8_i32x4;
typedef struct { u16 v[8]; } pxl8_u16x8;

static inline pxl8_f32x4 pxl8_f32x4_splat(f32 x) {
    return (pxl8_f32x4){{ x, x, x, x }};
}

static inline pxl8_f32x4 pxl8_f32x4_new(f32 a, f32 b, f32 c, f32 d) {
    return (pxl8_f32x4){{ a, b, c, d }};
}

static inline pxl8_f32x4 pxl8_f32x4_add(pxl8_f32x4 a, pxl8_f32x4 b) {
    return (pxl8_f32x4){{ a.v[0]+b.v[0], a.v[1]+b.v[1], a.v[2]+b.v[2], a.v[3]+b.v[3] }};
}

static inline pxl8_f32x4 pxl8_f32x4_sub(pxl8_f32x4 a, pxl8_f32x4 b) {
    return (pxl8_f32x4){{ a.v[0]-b.v[0], a.v[1]-b.v[1], a.v[2]-b.v[2], a.v[3]-b.v[3] }};
}

static inline pxl8_f32x4 pxl8_f32x4_mul(pxl8_f32x4 a, pxl8_f32x4 b) {
    return (pxl8_f32x4){{ a.v[0]*b.v[0], a.v[1]*b.v[1], a.v[2]*b.v[2], a.v[3]*b.v[3] }};
}

static inline pxl8_f32x4 pxl8_f32x4_div(pxl8_f32x4 a, pxl8_f32x4 b) {
    return (pxl8_f32x4){{ a.v[0]/b.v[0], a.v[1]/b.v[1], a.v[2]/b.v[2], a.v[3]/b.v[3] }};
}

static inline pxl8_f32x4 pxl8_f32x4_min(pxl8_f32x4 a, pxl8_f32x4 b) {
    return (pxl8_f32x4){{
        a.v[0]<b.v[0]?a.v[0]:b.v[0], a.v[1]<b.v[1]?a.v[1]:b.v[1],
        a.v[2]<b.v[2]?a.v[2]:b.v[2], a.v[3]<b.v[3]?a.v[3]:b.v[3]
    }};
}

static inline pxl8_f32x4 pxl8_f32x4_max(pxl8_f32x4 a, pxl8_f32x4 b) {
    return (pxl8_f32x4){{
        a.v[0]>b.v[0]?a.v[0]:b.v[0], a.v[1]>b.v[1]?a.v[1]:b.v[1],
        a.v[2]>b.v[2]?a.v[2]:b.v[2], a.v[3]>b.v[3]?a.v[3]:b.v[3]
    }};
}

static inline pxl8_f32x4 pxl8_f32x4_cmplt(pxl8_f32x4 a, pxl8_f32x4 b) {
    /* Build all-ones / all-zeros lane masks; the union performs the u32 -> f32
       bit reinterpretation without going through incompatible pointer casts. */
    union { u32 u[4]; pxl8_f32x4 f; } r;
    r.u[0] = a.v[0] < b.v[0] ? 0xFFFFFFFFu : 0u;
    r.u[1] = a.v[1] < b.v[1] ? 0xFFFFFFFFu : 0u;
    r.u[2] = a.v[2] < b.v[2] ? 0xFFFFFFFFu : 0u;
    r.u[3] = a.v[3] < b.v[3] ? 0xFFFFFFFFu : 0u;
    return r.f;
}

static inline i32 pxl8_f32x4_movemask(pxl8_f32x4 a) {
    /* Collect each lane's sign bit into bits 0..3, matching _mm_movemask_ps. */
    union { pxl8_f32x4 f; u32 u[4]; } in = { a };
    return (i32)(((in.u[0] >> 31) & 1u)      | ((in.u[1] >> 31) & 1u) << 1 |
                 ((in.u[2] >> 31) & 1u) << 2 | ((in.u[3] >> 31) & 1u) << 3);
}

/* Truncating (round-toward-zero) conversion. */
static inline pxl8_i32x4 pxl8_f32x4_to_i32x4(pxl8_f32x4 a) {
    return (pxl8_i32x4){{ (i32)a.v[0], (i32)a.v[1], (i32)a.v[2], (i32)a.v[3] }};
}

static inline void pxl8_f32x4_store(pxl8_f32x4 a, f32* out) {
    out[0] = a.v[0]; out[1] = a.v[1]; out[2] = a.v[2]; out[3] = a.v[3];
}

static inline pxl8_i32x4 pxl8_i32x4_splat(i32 x) {
    return (pxl8_i32x4){{ x, x, x, x }};
}

static inline pxl8_i32x4 pxl8_i32x4_slli(pxl8_i32x4 a, i32 n) {
    return (pxl8_i32x4){{ a.v[0]<<n, a.v[1]<<n, a.v[2]<<n, a.v[3]<<n }};
}

/* Relies on >> performing an arithmetic shift for signed operands, which is
   implementation-defined in C but holds on all mainstream compilers. */
static inline pxl8_i32x4 pxl8_i32x4_srai(pxl8_i32x4 a, i32 n) {
    return (pxl8_i32x4){{ a.v[0]>>n, a.v[1]>>n, a.v[2]>>n, a.v[3]>>n }};
}

static inline pxl8_i32x4 pxl8_i32x4_and(pxl8_i32x4 a, pxl8_i32x4 b) {
    return (pxl8_i32x4){{ a.v[0]&b.v[0], a.v[1]&b.v[1], a.v[2]&b.v[2], a.v[3]&b.v[3] }};
}

static inline pxl8_i32x4 pxl8_i32x4_or(pxl8_i32x4 a, pxl8_i32x4 b) {
    return (pxl8_i32x4){{ a.v[0]|b.v[0], a.v[1]|b.v[1], a.v[2]|b.v[2], a.v[3]|b.v[3] }};
}

static inline void pxl8_i32x4_store(pxl8_i32x4 a, i32* out) {
    out[0] = a.v[0]; out[1] = a.v[1]; out[2] = a.v[2]; out[3] = a.v[3];
}
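
/* Scalar sketch of the u16x8 helpers from the SSE2 path, assuming the same
   semantics described there: all-ones/all-zeros lane masks, blend selecting b
   where the mask is set, and two movemask bits per lane. */

static inline pxl8_u16x8 pxl8_u16x8_cmplt(pxl8_u16x8 a, pxl8_u16x8 b) {
    pxl8_u16x8 r;
    for (i32 i = 0; i < 8; ++i) r.v[i] = (u16)(a.v[i] < b.v[i] ? 0xFFFF : 0);
    return r;
}

static inline pxl8_u16x8 pxl8_u16x8_blend(pxl8_u16x8 a, pxl8_u16x8 b, pxl8_u16x8 mask) {
    pxl8_u16x8 r;
    for (i32 i = 0; i < 8; ++i) r.v[i] = (u16)((a.v[i] & ~mask.v[i]) | (b.v[i] & mask.v[i]));
    return r;
}

static inline i32 pxl8_u16x8_movemask(pxl8_u16x8 a) {
    i32 mask = 0;
    for (i32 i = 0; i < 8; ++i) mask |= ((a.v[i] >> 15) & 1) * 3 << (2 * i);
    return mask;
}
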
#endif

#ifdef __cplusplus
}
#endif