Remove SIMD: scalar math plus compiler optimizations are good enough

This commit is contained in:
asrael 2025-11-11 12:26:22 -06:00
parent e2c7998663
commit 4d84122ef3
8 changed files with 41 additions and 509 deletions

View file

@@ -1,7 +1,6 @@
#include <math.h>
#include "pxl8_math.h"
#include "pxl8_simd.h"
pxl8_vec2 pxl8_vec2_add(pxl8_vec2 a, pxl8_vec2 b) {
return (pxl8_vec2){
@@ -41,45 +40,31 @@ pxl8_vec2 pxl8_vec2_normalize(pxl8_vec2 v) {
}
/* Component-wise addition of two 3D vectors: returns a + b. */
pxl8_vec3 pxl8_vec3_add(pxl8_vec3 a, pxl8_vec3 b) {
    return (pxl8_vec3){
        .x = a.x + b.x,
        .y = a.y + b.y,
        .z = a.z + b.z,
    };
}
/* Component-wise subtraction of two 3D vectors: returns a - b. */
pxl8_vec3 pxl8_vec3_sub(pxl8_vec3 a, pxl8_vec3 b) {
    return (pxl8_vec3){
        .x = a.x - b.x,
        .y = a.y - b.y,
        .z = a.z - b.z,
    };
}
/* Uniform scaling of a 3D vector: returns v * s. */
pxl8_vec3 pxl8_vec3_scale(pxl8_vec3 v, f32 s) {
    return (pxl8_vec3){
        .x = v.x * s,
        .y = v.y * s,
        .z = v.z * s,
    };
}
/* Dot (inner) product of two 3D vectors. */
f32 pxl8_vec3_dot(pxl8_vec3 a, pxl8_vec3 b) {
    return a.x * b.x + a.y * b.y + a.z * b.z;
}
pxl8_vec3 pxl8_vec3_cross(pxl8_vec3 a, pxl8_vec3 b) {
@@ -115,13 +100,11 @@ pxl8_mat4 pxl8_mat4_multiply(pxl8_mat4 a, pxl8_mat4 b) {
for (i32 i = 0; i < 4; i++) {
for (i32 j = 0; j < 4; j++) {
pxl8_simd_vec_f32 row = pxl8_simd_set_f32(
a.m[i * 4 + 0], a.m[i * 4 + 1], a.m[i * 4 + 2], a.m[i * 4 + 3]
);
pxl8_simd_vec_f32 col = pxl8_simd_set_f32(
b.m[0 * 4 + j], b.m[1 * 4 + j], b.m[2 * 4 + j], b.m[3 * 4 + j]
);
mat.m[i * 4 + j] = pxl8_simd_dot4_f32(row, col);
mat.m[i * 4 + j] =
a.m[i * 4 + 0] * b.m[0 * 4 + j] +
a.m[i * 4 + 1] * b.m[1 * 4 + j] +
a.m[i * 4 + 2] * b.m[2 * 4 + j] +
a.m[i * 4 + 3] * b.m[3 * 4 + j];
}
}
@@ -129,17 +112,11 @@ pxl8_mat4 pxl8_mat4_multiply(pxl8_mat4 a, pxl8_mat4 b) {
}
/*
 * Multiply a 4x4 matrix by a 4D column vector: returns m * v.
 * m.m is indexed row-major (row i occupies m.m[i*4 + 0..3]), so each
 * result component is the dot product of one matrix row with v.
 */
pxl8_vec4 pxl8_mat4_multiply_vec4(pxl8_mat4 m, pxl8_vec4 v) {
    return (pxl8_vec4){
        .x = m.m[0] * v.x + m.m[1] * v.y + m.m[2] * v.z + m.m[3] * v.w,
        .y = m.m[4] * v.x + m.m[5] * v.y + m.m[6] * v.z + m.m[7] * v.w,
        .z = m.m[8] * v.x + m.m[9] * v.y + m.m[10] * v.z + m.m[11] * v.w,
        .w = m.m[12] * v.x + m.m[13] * v.y + m.m[14] * v.z + m.m[15] * v.w,
    };
}