diff --git a/demo/mod/first_person3d.fnl b/demo/mod/first_person3d.fnl index c0051dd..fefa7ec 100644 --- a/demo/mod/first_person3d.fnl +++ b/demo/mod/first_person3d.fnl @@ -349,12 +349,11 @@ r2 (* 0.04 (math.sin (+ (* real-time 3.2) phase))) light-radius (* 150 (+ 0.95 r1 r2))] (lights:clear) - (lights:add light-x light-y light-z 2 light-intensity light-radius) + (lights:add light-x light-y light-z 0xFFB888 light-intensity light-radius) (pxl8.push_target) (pxl8.begin_frame_3d camera lights { :ambient 25 - :dither true :fog_density 0.0 :celestial_dir [0.5 -0.8 0.3] :celestial_intensity 0.3}) diff --git a/pxl8d/build.rs b/pxl8d/build.rs index d3875d8..2ff503d 100644 --- a/pxl8d/build.rs +++ b/pxl8d/build.rs @@ -53,11 +53,8 @@ fn main() { .blocklist_type("pxl8_vec3") .blocklist_type("pxl8_vec4") .blocklist_type("pxl8_mat4") - .blocklist_item(".*_simd.*") - .blocklist_item("PXL8_SIMD.*") - .blocklist_type("__m128.*") - .blocklist_type(".*32x4_t|.*16x8_t") .raw_line("pub use crate::math::{pxl8_vec2, pxl8_vec3, pxl8_vec4, pxl8_mat4};") + .clang_arg("-DPXL8_NO_SIMD") .use_core() .rustified_enum(".*") .generate() diff --git a/pxl8d/src/bsp.rs b/pxl8d/src/bsp.rs index c414f3b..8d36245 100644 --- a/pxl8d/src/bsp.rs +++ b/pxl8d/src/bsp.rs @@ -3,7 +3,7 @@ extern crate alloc; use alloc::boxed::Box; use alloc::vec::Vec; -use crate::math::Vec3; +use crate::math::{Vec3, VEC3_ZERO}; use crate::pxl8::*; pub type Vertex = pxl8_bsp_vertex; @@ -31,8 +31,8 @@ impl Default for Face { side: 0, styles: [0; 4], material_id: 0, - aabb_min: Vec3::ZERO, - aabb_max: Vec3::ZERO, + aabb_min: VEC3_ZERO, + aabb_max: VEC3_ZERO, } } } @@ -68,7 +68,7 @@ impl Default for Plane { fn default() -> Self { Self { dist: 0.0, - normal: Vec3::ZERO, + normal: VEC3_ZERO, type_: 0, } } @@ -76,7 +76,7 @@ impl Default for Plane { impl Default for Vertex { fn default() -> Self { - Self { position: Vec3::ZERO } + Self { position: VEC3_ZERO } } } diff --git a/pxl8d/src/math.rs b/pxl8d/src/math.rs index d1f871c..250ec0f 100644 --- a/pxl8d/src/math.rs +++ b/pxl8d/src/math.rs @@ -1,14 +1,14 @@ use core::ops::{Add, Mul, Sub}; #[repr(C)] -#[derive(Debug, Copy, Clone, Default)] +#[derive(Debug, Copy, Clone)] pub struct Vec2 { pub x: f32, pub y: f32, } #[repr(C)] -#[derive(Debug, Copy, Clone, Default)] +#[derive(Debug, Copy, Clone)] pub struct Vec3 { pub x: f32, pub y: f32, @@ -16,7 +16,7 @@ pub struct Vec3 { } #[repr(C)] -#[derive(Debug, Copy, Clone, Default)] +#[derive(Debug, Copy, Clone)] pub struct Vec4 { pub x: f32, pub y: f32, @@ -25,7 +25,7 @@ pub struct Vec4 { } #[repr(C)] -#[derive(Debug, Copy, Clone, Default)] +#[derive(Debug, Copy, Clone)] pub struct Mat4 { pub m: [f32; 16], } @@ -39,40 +39,31 @@ pub type pxl8_vec4 = Vec4; #[allow(non_camel_case_types)] pub type pxl8_mat4 = Mat4; -impl Vec3 { - pub const ZERO: Vec3 = Vec3 { x: 0.0, y: 0.0, z: 0.0 }; - pub const Y: Vec3 = Vec3 { x: 0.0, y: 1.0, z: 0.0 }; +pub const VEC3_ZERO: Vec3 = Vec3 { x: 0.0, y: 0.0, z: 0.0 }; +pub const VEC3_Y: Vec3 = Vec3 { x: 0.0, y: 1.0, z: 0.0 }; - pub fn new(x: f32, y: f32, z: f32) -> Self { +pub trait Vec3Ext { + fn new(x: f32, y: f32, z: f32) -> Self; + fn dot(self, rhs: Self) -> f32; +} + +impl Vec3Ext for pxl8_vec3 { + fn new(x: f32, y: f32, z: f32) -> Self { Self { x, y, z } } - pub fn dot(self, rhs: Self) -> f32 { + fn dot(self, rhs: Self) -> f32 { self.x * rhs.x + self.y * rhs.y + self.z * rhs.z } - - pub fn cross(self, rhs: Self) -> Self { - Self { - x: self.y * rhs.z - self.z * rhs.y, - y: self.z * rhs.x - self.x * rhs.z, - z: self.x * rhs.y - self.y * rhs.x, - } - } - - pub fn length(self) -> f32 { - libm::sqrtf(self.dot(self)) - } - - pub fn normalize(self) -> Self { - let len_sq = self.dot(self); - if len_sq < 1e-12 { - return Self::ZERO; - } - self * (1.0 / libm::sqrtf(len_sq)) - } } -impl Add for Vec3 { +impl Default for pxl8_vec3 { + fn default() -> Self { + VEC3_ZERO + } +} + +impl Add for pxl8_vec3 { type Output = Self; fn add(self, rhs: Self) -> Self { Self { @@ -83,7 +74,7 @@ impl Add for Vec3 { } } -impl Sub for Vec3 { +impl Sub for pxl8_vec3 { type Output = Self; fn sub(self, rhs: Self) -> Self { Self { @@ -94,7 +85,7 @@ impl Sub for Vec3 { } } -impl Mul for Vec3 { +impl Mul for pxl8_vec3 { type Output = Self; fn mul(self, rhs: f32) -> Self { Self { diff --git a/pxl8d/src/procgen.rs b/pxl8d/src/procgen.rs index f6fb97e..1df3441 100644 --- a/pxl8d/src/procgen.rs +++ b/pxl8d/src/procgen.rs @@ -5,7 +5,8 @@ use alloc::vec::Vec; use libm::sqrtf; use crate::bsp::{Bsp, BspBuilder, CellPortals, Edge, Face, Leaf, Node, Plane, Portal, Vertex}; -use crate::math::Vec3; +use crate::math::{Vec3, Vec3Ext}; +use crate::pxl8::{pxl8_vec3_cross, pxl8_vec3_dot, pxl8_vec3_normalize, pxl8_vec3_scale, pxl8_vec3_add, pxl8_vec3_sub}; pub const CELL_SIZE: f32 = 64.0; pub const WALL_HEIGHT: f32 = 128.0; @@ -407,11 +408,11 @@ const AO_RAY_LENGTH: f32 = 48.0; fn generate_hemisphere_samples(normal: Vec3) -> [Vec3; AO_NUM_SAMPLES] { let tangent = if normal.y.abs() < 0.9 { - normal.cross(Vec3::new(0.0, 1.0, 0.0)).normalize() + unsafe { pxl8_vec3_normalize(pxl8_vec3_cross(normal, Vec3::new(0.0, 1.0, 0.0))) } } else { - normal.cross(Vec3::new(1.0, 0.0, 0.0)).normalize() + unsafe { pxl8_vec3_normalize(pxl8_vec3_cross(normal, Vec3::new(1.0, 0.0, 0.0))) } }; - let bitangent = normal.cross(tangent); + let bitangent = unsafe { pxl8_vec3_cross(normal, tangent) }; let mut samples = [Vec3::new(0.0, 0.0, 0.0); AO_NUM_SAMPLES]; for i in 0..AO_NUM_SAMPLES { @@ -424,44 +425,46 @@ fn generate_hemisphere_samples(normal: Vec3) -> [Vec3; AO_NUM_SAMPLES] { let local_y = cos_theta; let local_z = sin_theta * sin_phi; - let t_contrib = tangent * local_x; - let n_contrib = normal * local_y; - let b_contrib = bitangent * local_z; - samples[i] = (t_contrib + n_contrib + b_contrib).normalize(); + unsafe { + let t_contrib = pxl8_vec3_scale(tangent, local_x); + let n_contrib = pxl8_vec3_scale(normal, local_y); + let b_contrib = pxl8_vec3_scale(bitangent, local_z); + samples[i] = pxl8_vec3_normalize(pxl8_vec3_add(pxl8_vec3_add(t_contrib, n_contrib), b_contrib)); + } } samples } fn ray_triangle_intersect(origin: Vec3, dir: Vec3, v0: Vec3, v1: Vec3, v2: Vec3, max_dist: f32) -> bool { - let edge1 = v1 - v0; - let edge2 = v2 - v0; - let h = dir.cross(edge2); - let a = edge1.dot(h); + let edge1 = unsafe { pxl8_vec3_sub(v1, v0) }; + let edge2 = unsafe { pxl8_vec3_sub(v2, v0) }; + let h = unsafe { pxl8_vec3_cross(dir, edge2) }; + let a = unsafe { pxl8_vec3_dot(edge1, h) }; if a > -0.0001 && a < 0.0001 { return false; } let f = 1.0 / a; - let s = origin - v0; - let u = f * s.dot(h); + let s = unsafe { pxl8_vec3_sub(origin, v0) }; + let u = f * unsafe { pxl8_vec3_dot(s, h) }; if u < 0.0 || u > 1.0 { return false; } - let q = s.cross(edge1); - let v = f * dir.dot(q); + let q = unsafe { pxl8_vec3_cross(s, edge1) }; + let v = f * unsafe { pxl8_vec3_dot(dir, q) }; if v < 0.0 || u + v > 1.0 { return false; } - let t = f * edge2.dot(q); + let t = f * unsafe { pxl8_vec3_dot(edge2, q) }; t > 0.001 && t < max_dist } fn compute_vertex_ao(bsp: &BspBuilder, pos: Vec3, normal: Vec3) -> f32 { let samples = generate_hemisphere_samples(normal); - let offset_pos = pos + normal * 0.5; + let offset_pos = unsafe { pxl8_vec3_add(pos, pxl8_vec3_scale(normal, 0.5)) }; let mut occluded = 0; @@ -523,15 +526,16 @@ fn compute_vertex_light( let mut total = 0.0; for light in lights { - let to_light = light.position - pos; - let dist = sqrtf(to_light.dot(to_light)).max(1.0); + let to_light = unsafe { pxl8_vec3_sub(light.position, pos) }; + let dist = unsafe { pxl8_vec3_dot(to_light, to_light) }; + let dist = sqrtf(dist).max(1.0); if dist > light.radius { continue; } - let light_dir = to_light.normalize(); - let ndotl = normal.dot(light_dir).max(0.0); + let light_dir = unsafe { pxl8_vec3_normalize(to_light) }; + let ndotl = unsafe { pxl8_vec3_dot(normal, light_dir) }.max(0.0); let attenuation = (1.0 - dist / light.radius).max(0.0); let attenuation = attenuation * attenuation; diff --git a/pxl8d/src/sim.rs b/pxl8d/src/sim.rs index 4494a90..e916861 100644 --- a/pxl8d/src/sim.rs +++ b/pxl8d/src/sim.rs @@ -2,7 +2,7 @@ extern crate alloc; use alloc::vec::Vec; -use crate::math::Vec3; +use crate::math::{Vec3, Vec3Ext, VEC3_ZERO}; use crate::pxl8::*; use crate::voxel::VoxelWorld; use crate::world::World; @@ -17,8 +17,8 @@ const MAX_ENTITIES: usize = 1024; impl Default for Entity { fn default() -> Self { Self { - pos: Vec3::ZERO, - vel: Vec3::ZERO, + pos: VEC3_ZERO, + vel: VEC3_ZERO, yaw: 0.0, pitch: 0.0, flags: 0, diff --git a/src/core/pxl8.c b/src/core/pxl8.c index bf3b9a8..ac06e7d 100644 --- a/src/core/pxl8.c +++ b/src/core/pxl8.c @@ -340,7 +340,6 @@ pxl8_result pxl8_update(pxl8* sys) { if (game->fps_accumulator >= 1.0f) { game->fps = (f32)game->fps_frame_count / game->fps_accumulator; - pxl8_info("FPS: %.1f", game->fps); game->fps_accumulator = 0.0f; game->fps_frame_count = 0; } diff --git a/src/gfx/pxl8_colormap.h b/src/gfx/pxl8_colormap.h index 584e448..fb9f714 100644 --- a/src/gfx/pxl8_colormap.h +++ b/src/gfx/pxl8_colormap.h @@ -37,7 +37,7 @@ typedef struct { static const pxl8_rgb pxl8_light_colors[PXL8_LIGHT_COLORS] = { {255, 255, 255}, {255, 64, 64}, - {255, 192, 64}, + {255, 160, 64}, {255, 255, 64}, {64, 255, 64}, {64, 255, 255}, diff --git a/src/gfx/pxl8_lights.c b/src/gfx/pxl8_lights.c index de3d304..17bffc5 100644 --- a/src/gfx/pxl8_lights.c +++ b/src/gfx/pxl8_lights.c @@ -33,16 +33,51 @@ void pxl8_lights_destroy(pxl8_lights* lights) { pxl8_free(lights); } -void pxl8_lights_add(pxl8_lights* lights, f32 x, f32 y, f32 z, u8 color, u8 intensity, f32 radius) { +void pxl8_lights_add(pxl8_lights* lights, f32 x, f32 y, f32 z, u8 r, u8 g, u8 b, u8 intensity, f32 radius) { if (!lights || lights->count >= lights->capacity) return; f32 radius_sq = radius * radius; pxl8_light* l = &lights->data[lights->count++]; - l->color = color; - l->intensity = (f32)intensity / 255.0f; - l->inv_radius_sq = radius_sq > 0.0f ? 1.0f / radius_sq : 0.0f; - l->position = (pxl8_vec3){{x, y, z}}; + l->position.x = x; + l->position.y = y; + l->position.z = z; + l->r = r; + l->g = g; + l->b = b; + l->intensity = intensity; + l->radius = radius; l->radius_sq = radius_sq; + l->inv_radius_sq = radius_sq > 0.0f ? 1.0f / radius_sq : 0.0f; + + l->constant = 1.0f; + if (radius <= 7.0f) { + l->linear = 0.7f; + l->quadratic = 1.8f; + } else if (radius <= 13.0f) { + l->linear = 0.35f; + l->quadratic = 0.44f; + } else if (radius <= 20.0f) { + l->linear = 0.22f; + l->quadratic = 0.20f; + } else if (radius <= 32.0f) { + l->linear = 0.14f; + l->quadratic = 0.07f; + } else if (radius <= 50.0f) { + l->linear = 0.09f; + l->quadratic = 0.032f; + } else if (radius <= 65.0f) { + l->linear = 0.07f; + l->quadratic = 0.017f; + } else if (radius <= 100.0f) { + l->linear = 0.045f; + l->quadratic = 0.0075f; + } else if (radius <= 160.0f) { + l->linear = 0.027f; + l->quadratic = 0.0028f; + } else { + l->linear = 0.022f; + l->quadratic = 0.0019f; + } } void pxl8_lights_clear(pxl8_lights* lights) { diff --git a/src/gfx/pxl8_lights.h b/src/gfx/pxl8_lights.h index 15d861a..2be0c6b 100644 --- a/src/gfx/pxl8_lights.h +++ b/src/gfx/pxl8_lights.h @@ -6,11 +6,15 @@ #define PXL8_LIGHTS_MAX 256 typedef struct pxl8_light { - u8 color; - f32 intensity; - f32 inv_radius_sq; pxl8_vec3 position; + f32 inv_radius_sq; + u8 r, g, b; + u8 intensity; + f32 radius; f32 radius_sq; + f32 constant; + f32 linear; + f32 quadratic; } pxl8_light; typedef struct pxl8_lights pxl8_lights; @@ -22,7 +26,7 @@ extern "C" { pxl8_lights* pxl8_lights_create(u32 capacity); void pxl8_lights_destroy(pxl8_lights* lights); -void pxl8_lights_add(pxl8_lights* lights, f32 x, f32 y, f32 z, u8 color, u8 intensity, f32 radius); +void pxl8_lights_add(pxl8_lights* lights, f32 x, f32 y, f32 z, u8 r, u8 g, u8 b, u8 intensity, f32 radius); void pxl8_lights_clear(pxl8_lights* lights); u32 pxl8_lights_count(const pxl8_lights* lights); const pxl8_light* pxl8_lights_data(const pxl8_lights* lights); diff --git a/src/gfx/pxl8_render.c b/src/gfx/pxl8_render.c index 0135da8..3a5e355 100644 --- a/src/gfx/pxl8_render.c +++ b/src/gfx/pxl8_render.c @@ -296,6 +296,7 @@ static void rasterize_triangle( const tri_setup* setup, u8* fb, u16* zb, + u32* light_accum, u32 fb_width, pxl8_shader_fn shader, const pxl8_gfx_pipeline_desc* pipeline, @@ -424,6 +425,7 @@ static void rasterize_triangle( u32 row_start = (u32)y * fb_width; u8* prow = fb + row_start; u16* zrow = zb + row_start; + u32* lrow = light_accum ? light_accum + row_start : NULL; i32 x = x_start; while (x <= x_end) { @@ -431,8 +433,8 @@ static void rasterize_triangle( if (span_end > x_end) span_end = x_end; i32 span_len = span_end - x + 1; - f32 pw_start = 1.0f / wr; - f32 pw_end = 1.0f / (wr + dwr * (f32)span_len); + f32 pw_start = pxl8_fast_rcp(wr); + f32 pw_end = pxl8_fast_rcp(wr + dwr * (f32)span_len); f32 u_start = uw * pw_start; f32 v_start = vw * pw_start; @@ -469,80 +471,7 @@ static void rasterize_triangle( f32 wy_a = wy_start; f32 wz_a = wz_start; - i32 px = x; - -#if defined(PXL8_SIMD_SSE) || defined(PXL8_SIMD_NEON) - if (depth_test && depth_compare == PXL8_GFX_COMPARE_LESS && !blend_enabled) { - pxl8_f32_simd dz4_simd = pxl8_f32_simd_set(dz * 4.0f); - pxl8_f32_simd half = pxl8_f32_simd_set(0.5f); - pxl8_f32_simd one = pxl8_f32_simd_set(1.0f); - pxl8_f32_simd zero = pxl8_f32_simd_zero(); - pxl8_f32_simd scale65535 = pxl8_f32_simd_set(65535.0f); - pxl8_f32_simd z4 = pxl8_f32_simd_set4(z_a, z_a + dz, z_a + dz * 2.0f, z_a + dz * 3.0f); - - pxl8_f32_simd offsets = pxl8_f32_simd_set4(0.0f, 1.0f, 2.0f, 3.0f); - f32 du4 = du * 4.0f, dv4 = dv * 4.0f, dl4 = dl * 4.0f, dc4 = dc * 4.0f; - f32 dz4 = dz * 4.0f, dwx4 = dwx * 4.0f, dwy4 = dwy * 4.0f, dwz4 = dwz * 4.0f; - - for (; px + 3 <= span_end; px += 4) { - pxl8_f32_simd depth_norm = pxl8_f32_simd_clamp(pxl8_f32_simd_mul(pxl8_f32_simd_add(z4, one), half), zero, one); - pxl8_i32_simd z16_4 = pxl8_f32_simd_to_i32(pxl8_f32_simd_mul(depth_norm, scale65535)); - pxl8_i32_simd zbuf = pxl8_i32_simd_set4((i32)zrow[px], (i32)zrow[px+1], (i32)zrow[px+2], (i32)zrow[px+3]); - i32 mask = pxl8_i32_simd_movemask(pxl8_i32_simd_cmpgt(zbuf, z16_4)); - - STATS_INC(stats, depth_tests, 4); - - if (mask) { - pxl8_shader_ctx frag_ctx = { - .color_count = 4, - .x = pxl8_i32_simd_set4(px, px + 1, px + 2, px + 3), - .y = pxl8_i32_simd_set(y), - .v_uv = { - pxl8_f32_simd_add(pxl8_f32_simd_set(u_a), pxl8_f32_simd_mul(pxl8_f32_simd_set(du), offsets)), - pxl8_f32_simd_add(pxl8_f32_simd_set(v_a), pxl8_f32_simd_mul(pxl8_f32_simd_set(dv), offsets)) - }, - .v_world = { - pxl8_f32_simd_add(pxl8_f32_simd_set(wx_a), pxl8_f32_simd_mul(pxl8_f32_simd_set(dwx), offsets)), - pxl8_f32_simd_add(pxl8_f32_simd_set(wy_a), pxl8_f32_simd_mul(pxl8_f32_simd_set(dwy), offsets)), - pxl8_f32_simd_add(pxl8_f32_simd_set(wz_a), pxl8_f32_simd_mul(pxl8_f32_simd_set(dwz), offsets)) - }, - .v_normal = pxl8_vec3_simd_set(setup->normal), - .v_light = pxl8_f32_simd_mul( - pxl8_f32_simd_add(pxl8_f32_simd_set(l_a), pxl8_f32_simd_mul(pxl8_f32_simd_set(dl), offsets)), - pxl8_f32_simd_set(1.0f / 255.0f) - ), - .v_color = pxl8_f32_simd_add(pxl8_f32_simd_set(c_a), pxl8_f32_simd_mul(pxl8_f32_simd_set(dc), offsets)), - .v_depth = z4, - }; - - u8 colors[4]; - shader(&frag_ctx, bindings, uniforms, colors); - STATS_INC(stats, shader_calls, 1); - - i32 z16_arr[4]; - pxl8_i32_simd_store(z16_arr, z16_4); - - for (i32 i = 0; i < 4; i++) { - if (!(mask & (0x8 << (i * 4)))) continue; - STATS_INC(stats, depth_passes, 1); - - u8 color = colors[i]; - if (!(alpha_test && color <= alpha_ref) && color != 0) { - prow[px + i] = color; - if (depth_write) zrow[px + i] = (u16)z16_arr[i]; - STATS_INC(stats, pixels_written, 1); - } - } - } - - u_a += du4; v_a += dv4; l_a += dl4; c_a += dc4; - z_a += dz4; wx_a += dwx4; wy_a += dwy4; wz_a += dwz4; - z4 = pxl8_f32_simd_add(z4, dz4_simd); - } - } -#endif - - for (; px <= span_end; px++) { + for (i32 px = x; px <= span_end; px++) { f32 depth_norm = pxl8_clamp((z_a + 1.0f) * 0.5f, 0.0f, 1.0f); u16 z16 = (u16)(depth_norm * 65535.0f); @@ -551,19 +480,18 @@ static void rasterize_triangle( if (depth_pass) { STATS_INC(stats, depth_passes, 1); pxl8_shader_ctx frag_ctx = { - .color_count = 1, - .x = pxl8_i32_simd_set(px), - .y = pxl8_i32_simd_set(y), - .v_uv = { pxl8_f32_simd_set(u_a), pxl8_f32_simd_set(v_a) }, - .v_world = { pxl8_f32_simd_set(wx_a), pxl8_f32_simd_set(wy_a), pxl8_f32_simd_set(wz_a) }, - .v_normal = pxl8_vec3_simd_set(setup->normal), - .v_light = pxl8_f32_simd_set(l_a / 255.0f), - .v_color = pxl8_f32_simd_set(c_a), - .v_depth = pxl8_f32_simd_set(z_a), + .x = px, + .y = y, + .v_uv = { { u_a, v_a } }, + .v_world = { wx_a, wy_a, wz_a }, + .v_normal = setup->normal, + .v_light = l_a / 255.0f, + .v_color = c_a, + .v_depth = z_a, + .out_light_color = 0, }; - u8 color; - shader(&frag_ctx, bindings, uniforms, &color); + u8 color = shader(&frag_ctx, bindings, uniforms); STATS_INC(stats, shader_calls, 1); if (!(alpha_test && color <= alpha_ref)) { @@ -578,6 +506,10 @@ static void rasterize_triangle( zrow[px] = z16; } STATS_INC(stats, pixels_written, 1); + if (lrow && frag_ctx.out_light_color != 0) { + lrow[px] = frag_ctx.out_light_color; + STATS_INC(stats, light_writes, 1); + } } } } @@ -1148,6 +1080,12 @@ static void execute_draw( return; } + u32* light_accum = NULL; + if (VALID_TEX(r, pass->desc.light_accum.texture)) { + texture_slot* light_tex = &r->textures[SLOT_INDEX(pass->desc.light_accum.texture.id)]; + light_accum = light_tex->data; + } + const pxl8_vertex* vertices = vb->data; const u16* indices = use_indices ? ib->data : NULL; @@ -1301,7 +1239,7 @@ static void execute_draw( } u64 raster_start = STATS_START(); - rasterize_triangle(&setup, fb, zb, fb_w, shader, &pip->desc, + rasterize_triangle(&setup, fb, zb, light_accum, fb_w, shader, &pip->desc, &shader_bindings, &shader_uniforms, &r->stats); STATS_ADD(&r->stats, raster_ns, raster_start); } @@ -1524,27 +1462,41 @@ void pxl8_resolve_to_rgba(pxl8_renderer* r, pxl8_gfx_texture color, pxl8_gfx_tex u8* fb = cs->data; u32 w = cs->width; u32 h = cs->height; - u32 total = w * h; - (void)light_accum; + u32* light_data = NULL; + if (light_accum.id != PXL8_GFX_INVALID_ID && VALID_TEX(r, light_accum)) { + light_data = r->textures[SLOT_INDEX(light_accum.id)].data; + } -#if defined(PXL8_SIMD_SSE) || defined(PXL8_SIMD_NEON) - pxl8_i32_simd alpha_mask = pxl8_i32_simd_set((i32)0xFF000000); - u32 i = 0; - for (; i + 4 <= total; i += 4) { - pxl8_i32_simd base = pxl8_i32_simd_set4( - (i32)palette[fb[i + 0]], (i32)palette[fb[i + 1]], - (i32)palette[fb[i + 2]], (i32)palette[fb[i + 3]] - ); - base = pxl8_i32_simd_or(base, alpha_mask); - pxl8_i32_simd_store((i32*)&output[i], base); + for (u32 i = 0; i < w * h; i++) { + u8 idx = fb[i]; + u32 base = palette[idx]; + + if (light_data) { + u32 lv = light_data[i]; + u32 la = lv >> 24; + if (la > 0) { + i32 br = base & 0xFF; + i32 bg = (base >> 8) & 0xFF; + i32 bb = (base >> 16) & 0xFF; + + i32 lr = lv & 0xFF; + i32 lg = (lv >> 8) & 0xFF; + i32 lb = (lv >> 16) & 0xFF; + + f32 t = (f32)la / 255.0f; + br += (i32)((f32)(lr - 128) * t * 2.0f); + bg += (i32)((f32)(lg - 128) * t * 2.0f); + bb += (i32)((f32)(lb - 128) * t * 2.0f); + + br = pxl8_clamp_byte(br); + bg = pxl8_clamp_byte(bg); + bb = pxl8_clamp_byte(bb); + + base = (u32)br | ((u32)bg << 8) | ((u32)bb << 16) | 0xFF000000; + } + } + + output[i] = base | 0xFF000000; } - for (; i < total; i++) { - output[i] = palette[fb[i]] | 0xFF000000; - } -#else - for (u32 i = 0; i < total; i++) { - output[i] = palette[fb[i]] | 0xFF000000; - } -#endif } diff --git a/src/gfx/pxl8_shader.h b/src/gfx/pxl8_shader.h index 3c41386..3241a21 100644 --- a/src/gfx/pxl8_shader.h +++ b/src/gfx/pxl8_shader.h @@ -59,20 +59,7 @@ typedef struct pxl8_shader_bindings { }; } pxl8_shader_bindings; -#if defined(PXL8_SIMD_SSE) || defined(PXL8_SIMD_NEON) typedef struct pxl8_shader_ctx { - u32 color_count; - pxl8_i32_simd x, y; - pxl8_vec2_simd v_uv; - pxl8_vec3_simd v_world; - pxl8_vec3_simd v_normal; - pxl8_f32_simd v_color; - pxl8_f32_simd v_light; - pxl8_f32_simd v_depth; -} pxl8_shader_ctx; -#else -typedef struct pxl8_shader_ctx { - u32 color_count; i32 x, y; pxl8_vec2 v_uv; pxl8_vec3 v_world; @@ -80,14 +67,13 @@ typedef struct pxl8_shader_ctx { f32 v_color; f32 v_light; f32 v_depth; + u32 out_light_color; } pxl8_shader_ctx; -#endif -typedef void (*pxl8_shader_fn)( - const pxl8_shader_ctx* ctx, +typedef u8 (*pxl8_shader_fn)( + pxl8_shader_ctx* ctx, const pxl8_shader_bindings* bindings, - const pxl8_shader_uniforms* uniforms, - u8* colors_out + const pxl8_shader_uniforms* uniforms ); #ifdef __cplusplus diff --git a/src/gfx/pxl8_shader_builtins.h b/src/gfx/pxl8_shader_builtins.h index a4eaf5f..abde7a5 100644 --- a/src/gfx/pxl8_shader_builtins.h +++ b/src/gfx/pxl8_shader_builtins.h @@ -43,23 +43,36 @@ static inline u8 pxl8_sample_indexed(const pxl8_shader_bindings* b, pxl8_vec2 uv } } -static inline u8 pxl8_colormap_lookup(const pxl8_shader_bindings* b, u8 color, u8 light_color, u8 intensity) { +static inline u8 pxl8_colormap_lookup(const pxl8_shader_bindings* b, u8 color, u8 light) { if (!b) return color; const u8* cm = b->colormap; if (!cm) return color; - u32 row = ((u32)light_color << 3) + (intensity >> 5); + u32 row = light >> 5; return cm[(row << 8) | (u32)color]; } -static inline u8 pxl8_colormap_lookup_dithered(const pxl8_shader_bindings* b, u8 color, u8 light_color, u8 intensity, u32 x, u32 y) { - if (!b) return color; - const u8* cm = b->colormap; - if (!cm) return color; - u32 base_row = intensity >> 5; - u32 frac = intensity & 31; - u32 threshold = PXL8_BAYER_4X4[(y & 3) * 4 + (x & 3)] * 2; - u32 row = ((u32)light_color << 3) + (frac > threshold && base_row < 7 ? base_row + 1 : base_row); - return cm[(row << 8) | (u32)color]; +static inline f32 pxl8_light_falloff(const pxl8_shader_ctx* ctx, const pxl8_shader_uniforms* u, u32 light_idx) { + if (!u || light_idx >= u->lights_count) return 0.0f; + const pxl8_light* light = &u->lights[light_idx]; + f32 dx = light->position.x - ctx->v_world.x; + f32 dy = light->position.y - ctx->v_world.y; + f32 dz = light->position.z - ctx->v_world.z; + f32 dist_sq = dx * dx + dy * dy + dz * dz; + if (dist_sq >= light->radius_sq) return 0.0f; + return 1.0f - dist_sq * light->inv_radius_sq; +} + +static inline u32 pxl8_light_color(const pxl8_shader_uniforms* u, u32 light_idx) { + if (!u || light_idx >= u->lights_count) return 0; + const pxl8_light* light = &u->lights[light_idx]; + return (u32)light->r | ((u32)light->g << 8) | ((u32)light->b << 16); +} + +static inline void pxl8_set_light_tint(pxl8_shader_ctx* ctx, u32 color, f32 strength) { + if (strength <= 0.0f) return; + if (strength > 1.0f) strength = 1.0f; + u32 alpha = (u32)(strength * 255.0f); + ctx->out_light_color = (color & 0x00FFFFFF) | (alpha << 24); } #ifdef __cplusplus diff --git a/src/gfx/pxl8_shader_math.h b/src/gfx/pxl8_shader_math.h deleted file mode 100644 index 0e87c10..0000000 --- a/src/gfx/pxl8_shader_math.h +++ /dev/null @@ -1,56 +0,0 @@ -#pragma once - -#include "pxl8_math.h" - -#if defined(PXL8_SIMD_NEON) || defined(PXL8_SIMD_SSE) - -#undef f32 -#define f32 pxl8_f32_simd - -typedef pxl8_vec2_simd vec2; -typedef pxl8_vec3_simd vec3; -typedef pxl8_vec4_simd vec4; - -#define set pxl8_f32_simd_set -#define add pxl8_f32_simd_add -#define sub pxl8_f32_simd_sub -#define mul pxl8_f32_simd_mul -#define div pxl8_f32_simd_div -#define inversesqrt pxl8_f32_simd_rsqrt -#define rcp pxl8_f32_simd_rcp -#define max pxl8_f32_simd_max -#define min pxl8_f32_simd_min -#define clamp pxl8_f32_simd_clamp - -#define vec3_set pxl8_vec3_simd_set -#define vec3_add pxl8_vec3_simd_add -#define vec3_sub pxl8_vec3_simd_sub -#define vec3_scale pxl8_vec3_simd_scale -#define dot pxl8_vec3_simd_dot -#define normalize pxl8_vec3_simd_normalize - -#else - -typedef pxl8_vec2 vec2; -typedef pxl8_vec3 vec3; -typedef pxl8_vec4 vec4; - -#define set(x) (x) -#define add(a, b) ((a) + (b)) -#define sub(a, b) ((a) - (b)) -#define mul(a, b) ((a) * (b)) -#define div(a, b) ((a) / (b)) -#define inversesqrt pxl8_fast_inv_sqrt -#define rcp(x) (1.0f / (x)) -#define max pxl8_max -#define min pxl8_min -#define clamp pxl8_clamp - -#define vec3_set(v) (v) -#define vec3_add pxl8_vec3_add -#define vec3_sub pxl8_vec3_sub -#define vec3_scale pxl8_vec3_scale -#define dot pxl8_vec3_dot -#define normalize pxl8_vec3_normalize - -#endif diff --git a/src/gfx/pxl8_shader_registry.c b/src/gfx/pxl8_shader_registry.c index 0461b4e..370f183 100644 --- a/src/gfx/pxl8_shader_registry.c +++ b/src/gfx/pxl8_shader_registry.c @@ -2,14 +2,14 @@ #include -extern void pxl8_shader_lit(const pxl8_shader_ctx* ctx, const pxl8_shader_bindings* bindings, const pxl8_shader_uniforms* uniforms, u8* colors_out); -extern void pxl8_shader_unlit(const pxl8_shader_ctx* ctx, const pxl8_shader_bindings* bindings, const pxl8_shader_uniforms* uniforms, u8* colors_out); +extern u8 pxl8_shader_lit(pxl8_shader_ctx* ctx, const pxl8_shader_bindings* bindings, const pxl8_shader_uniforms* uniforms); +extern u8 pxl8_shader_unlit(pxl8_shader_ctx* ctx, const pxl8_shader_bindings* bindings, const pxl8_shader_uniforms* uniforms); void pxl8_shader_registry_init(void) {} void pxl8_shader_registry_reload(void) {} pxl8_shader_fn pxl8_shader_registry_get(const char* name) { - if (strcmp(name, "lit") == 0) return pxl8_shader_lit; - if (strcmp(name, "unlit") == 0) return pxl8_shader_unlit; + if (strcmp(name, "lit") == 0) return (pxl8_shader_fn)pxl8_shader_lit; + if (strcmp(name, "unlit") == 0) return (pxl8_shader_fn)pxl8_shader_unlit; return NULL; } diff --git a/src/gfx/shaders/cpu/lit.c b/src/gfx/shaders/cpu/lit.c index b776fef..b9d55c9 100644 --- a/src/gfx/shaders/cpu/lit.c +++ b/src/gfx/shaders/cpu/lit.c @@ -2,127 +2,15 @@ #include "pxl8_shader.h" #include "pxl8_shader_builtins.h" -void pxl8_shader_lit( - const pxl8_shader_ctx* ctx, +u8 pxl8_shader_lit( + pxl8_shader_ctx* ctx, const pxl8_shader_bindings* bindings, - const pxl8_shader_uniforms* uniforms, - u8* colors_out + const pxl8_shader_uniforms* uniforms ) { -#if defined(PXL8_SIMD_SSE) || defined(PXL8_SIMD_NEON) - f32 uv_x[4], uv_y[4], color_f[4]; - i32 px[4], py[4]; - pxl8_f32_simd_store(uv_x, ctx->v_uv.x); - pxl8_f32_simd_store(uv_y, ctx->v_uv.y); - pxl8_f32_simd_store(color_f, ctx->v_color); - pxl8_i32_simd_store(px, ctx->x); - pxl8_i32_simd_store(py, ctx->y); - - u8 tex_idx[4]; - for (u32 i = 0; i < ctx->color_count; i++) { - if (bindings && bindings->atlas) { - tex_idx[i] = pxl8_sample_indexed(bindings, (pxl8_vec2){{ uv_x[i], uv_y[i] }}); - } else { - if (uniforms && uniforms->dither) { - tex_idx[i] = pxl8_gfx_dither(color_f[i], (u32)px[i], (u32)py[i]); - } else { - f32 clamped = pxl8_clamp(color_f[i], 0.0f, 255.0f); - tex_idx[i] = (u8)(clamped); - } - } - } - - pxl8_f32_simd light = ctx->v_light; - f32 max_strength[4] = {0, 0, 0, 0}; - u8 dominant_color[4] = {0, 0, 0, 0}; - - if (uniforms) { - pxl8_f32_simd ambient = pxl8_f32_simd_set((f32)uniforms->ambient / 255.0f); - light = pxl8_f32_simd_max(light, ambient); - - if (uniforms->celestial_intensity > 0.0f) { - pxl8_vec3_simd cel_dir = pxl8_vec3_simd_set(uniforms->celestial_dir); - pxl8_f32_simd ndotl = pxl8_f32_simd_sub( - pxl8_f32_simd_zero(), - pxl8_vec3_simd_dot(ctx->v_normal, cel_dir) - ); - pxl8_f32_simd cel_contrib = pxl8_f32_simd_mul( - pxl8_f32_simd_max(ndotl, pxl8_f32_simd_zero()), - pxl8_f32_simd_set(uniforms->celestial_intensity) - ); - light = pxl8_f32_simd_add(light, cel_contrib); - } - - for (u32 i = 0; i < uniforms->lights_count; i++) { - const pxl8_light* l = &uniforms->lights[i]; - pxl8_vec3_simd light_pos = pxl8_vec3_simd_set(l->position); - pxl8_vec3_simd to_light = pxl8_vec3_simd_sub(light_pos, ctx->v_world); - pxl8_f32_simd dist_sq = pxl8_vec3_simd_dot(to_light, to_light); - - pxl8_f32_simd in_range = pxl8_f32_simd_cmpgt( - pxl8_f32_simd_set(l->radius_sq), dist_sq - ); - if (!pxl8_f32_simd_movemask(in_range)) continue; - - pxl8_f32_simd inv_dist = pxl8_f32_simd_rsqrt(dist_sq); - pxl8_vec3_simd light_dir = pxl8_vec3_simd_scale(to_light, inv_dist); - pxl8_f32_simd ndotl = pxl8_vec3_simd_dot(ctx->v_normal, light_dir); - ndotl = pxl8_f32_simd_max(ndotl, pxl8_f32_simd_zero()); - - pxl8_f32_simd falloff = pxl8_f32_simd_sub( - pxl8_f32_simd_set(1.0f), - pxl8_f32_simd_mul(dist_sq, pxl8_f32_simd_set(l->inv_radius_sq)) - ); - falloff = pxl8_f32_simd_max(falloff, pxl8_f32_simd_zero()); - - if (uniforms->dither) { - f32 falloff_arr[4]; - pxl8_f32_simd_store(falloff_arr, falloff); - for (u32 j = 0; j < 4; j++) { - if (falloff_arr[j] < 0.5f) { - f32 threshold = (PXL8_BAYER_4X4[((u32)py[j] & 3) * 4 + ((u32)px[j] & 3)] + 0.5f) * (1.0f / 16.0f); - if (falloff_arr[j] < threshold * 0.5f) falloff_arr[j] = 0.0f; - } - } - falloff = pxl8_f32_simd_load(falloff_arr); - } - - pxl8_f32_simd strength = pxl8_f32_simd_mul( - pxl8_f32_simd_mul(pxl8_f32_simd_set(l->intensity), falloff), - ndotl - ); - - f32 strength_arr[4]; - pxl8_f32_simd_store(strength_arr, strength); - for (u32 j = 0; j < 4; j++) { - if (strength_arr[j] > max_strength[j]) { - max_strength[j] = strength_arr[j]; - dominant_color[j] = l->color; - } - } - - light = pxl8_f32_simd_add(light, strength); - } - } - - light = pxl8_f32_simd_clamp(light, pxl8_f32_simd_zero(), pxl8_f32_simd_set(1.0f)); - pxl8_f32_simd light_f = pxl8_f32_simd_mul(light, pxl8_f32_simd_set(255.0f)); - - f32 light_arr[4]; - pxl8_f32_simd_store(light_arr, light_f); - - for (u32 i = 0; i < ctx->color_count; i++) { - u8 light_u8 = (u8)light_arr[i]; - if (uniforms && uniforms->dither) { - colors_out[i] = pxl8_colormap_lookup_dithered(bindings, tex_idx[i], dominant_color[i], light_u8, (u32)px[i], (u32)py[i]); - } else { - colors_out[i] = pxl8_colormap_lookup(bindings, tex_idx[i], dominant_color[i], light_u8); - } - } - -#else u8 tex_idx = 0; if (bindings && bindings->atlas) { tex_idx = pxl8_sample_indexed(bindings, ctx->v_uv); + if (pxl8_unlikely(tex_idx == 0)) return 0; } else { if (uniforms && uniforms->dither) { tex_idx = pxl8_gfx_dither(ctx->v_color, (u32)ctx->x, (u32)ctx->y); @@ -133,8 +21,6 @@ void pxl8_shader_lit( } f32 light = ctx->v_light; - f32 max_strength = 0; - u8 dominant_color = 0; if (uniforms) { f32 ambient = (f32)uniforms->ambient / 255.0f; @@ -149,6 +35,11 @@ void pxl8_shader_lit( } } + f32 dyn_strength = 0.0f; + f32 dyn_r = 0.0f; + f32 dyn_g = 0.0f; + f32 dyn_b = 0.0f; + for (u32 i = 0; i < uniforms->lights_count; i++) { const pxl8_light* l = &uniforms->lights[i]; f32 lx = l->position.x - ctx->v_world.x; @@ -167,17 +58,28 @@ void pxl8_shader_lit( f32 falloff = 1.0f - dist_sq * l->inv_radius_sq; if (falloff <= 0.0f) continue; - if (uniforms->dither && falloff < 0.5f) { + if (uniforms->dither && falloff < 0.33f) { f32 threshold = (PXL8_BAYER_4X4[((u32)ctx->y & 3) * 4 + ((u32)ctx->x & 3)] + 0.5f) * (1.0f / 16.0f); - if (falloff < threshold * 0.5f) continue; + if (falloff < threshold * 0.33f) continue; } - f32 strength = l->intensity * falloff * ndotl; - if (strength > max_strength) { - max_strength = strength; - dominant_color = l->color; - } - light += strength; + f32 strength = ((f32)l->intensity / 255.0f) * falloff * ndotl; + if (strength <= 0.0f) continue; + + dyn_strength += strength; + dyn_r += strength * (f32)l->r; + dyn_g += strength * (f32)l->g; + dyn_b += strength * (f32)l->b; + } + + if (dyn_strength > 0.0f) { + f32 inv = pxl8_fast_rcp(dyn_strength); + u8 r = (u8)pxl8_clamp(dyn_r * inv, 0.0f, 255.0f); + u8 g = (u8)pxl8_clamp(dyn_g * inv, 0.0f, 255.0f); + u8 b = (u8)pxl8_clamp(dyn_b * inv, 0.0f, 255.0f); + u8 a = (u8)pxl8_clamp(dyn_strength * 255.0f, 0.0f, 255.0f); + ctx->out_light_color = (u32)r | ((u32)g << 8) | ((u32)b << 16) | ((u32)a << 24); + light += dyn_strength; } } @@ -187,9 +89,18 @@ void pxl8_shader_lit( f32 light_f = light * 255.0f; u8 light_u8 = (u8)light_f; if (uniforms && uniforms->dither) { - colors_out[0] = pxl8_colormap_lookup_dithered(bindings, tex_idx, dominant_color, light_u8, (u32)ctx->x, (u32)ctx->y); - } else { - colors_out[0] = pxl8_colormap_lookup(bindings, tex_idx, dominant_color, light_u8); + light_u8 = pxl8_gfx_dither(light_f, (u32)ctx->x, (u32)ctx->y); } -#endif + + u8 shaded = pxl8_colormap_lookup(bindings, tex_idx, light_u8); + + if (uniforms && uniforms->emissive) { + u32 rgb = 0x00FFFFFF; + if (bindings && bindings->palette) { + rgb = bindings->palette[tex_idx] & 0x00FFFFFF; + } + pxl8_set_light_tint(ctx, rgb, 1.0f); + } + + return shaded; } diff --git a/src/gfx/shaders/cpu/unlit.c b/src/gfx/shaders/cpu/unlit.c index 865f09b..b3ff4c6 100644 --- a/src/gfx/shaders/cpu/unlit.c +++ b/src/gfx/shaders/cpu/unlit.c @@ -1,37 +1,15 @@ #include "pxl8_shader.h" #include "pxl8_shader_builtins.h" -void pxl8_shader_unlit( - const pxl8_shader_ctx* ctx, +u8 pxl8_shader_unlit( + pxl8_shader_ctx* ctx, const pxl8_shader_bindings* bindings, - const pxl8_shader_uniforms* uniforms, - u8* colors_out + const pxl8_shader_uniforms* uniforms ) { -#if defined(PXL8_SIMD_SSE) || defined(PXL8_SIMD_NEON) - f32 uv_x[4], uv_y[4], color_f[4]; - i32 px[4], py[4]; - pxl8_f32_simd_store(uv_x, ctx->v_uv.x); - pxl8_f32_simd_store(uv_y, ctx->v_uv.y); - pxl8_f32_simd_store(color_f, ctx->v_color); - pxl8_i32_simd_store(px, ctx->x); - pxl8_i32_simd_store(py, ctx->y); - - for (u32 i = 0; i < ctx->color_count; i++) { - if (bindings && bindings->atlas) { - colors_out[i] = pxl8_sample_indexed(bindings, (pxl8_vec2){{ uv_x[i], uv_y[i] }}); - } else { - if (uniforms && uniforms->dither) { - colors_out[i] = pxl8_gfx_dither(color_f[i], (u32)px[i], (u32)py[i]); - } else { - f32 clamped = pxl8_clamp(color_f[i], 0.0f, 255.0f); - colors_out[i] = (u8)(clamped); - } - } - } -#else u8 tex_idx = 0; if (bindings && bindings->atlas) { tex_idx = pxl8_sample_indexed(bindings, ctx->v_uv); + if (tex_idx == 0) return 0; } else { if (uniforms && uniforms->dither) { tex_idx = pxl8_gfx_dither(ctx->v_color, (u32)ctx->x, (u32)ctx->y); @@ -40,6 +18,14 @@ void pxl8_shader_unlit( tex_idx = (u8)(clamped); } } - colors_out[0] = tex_idx; -#endif + + if (uniforms && uniforms->emissive) { + u32 rgb = 0x00FFFFFF; + if (bindings && bindings->palette) { + rgb = bindings->palette[tex_idx] & 0x00FFFFFF; + } + pxl8_set_light_tint(ctx, rgb, 1.0f); + } + + return tex_idx; } diff --git a/src/lua/pxl8/effects.lua b/src/lua/pxl8/effects.lua index 567d64c..f38381d 100644 --- a/src/lua/pxl8/effects.lua +++ b/src/lua/pxl8/effects.lua @@ -14,8 +14,16 @@ function Lights.new(capacity) return setmetatable({ _ptr = ptr }, Lights) end -function Lights:add(x, y, z, color, intensity, radius) - C.pxl8_lights_add(self._ptr, x, y, z, color or 0, intensity or 255, radius or 10) +function Lights:add(x, y, z, r, g, b, intensity, radius) + if r and r > 255 then + local rgb = r + intensity = g + radius = b + r = bit.band(bit.rshift(rgb, 16), 0xFF) + g = bit.band(bit.rshift(rgb, 8), 0xFF) + b = bit.band(rgb, 0xFF) + end + C.pxl8_lights_add(self._ptr, x, y, z, r or 255, g or 255, b or 255, intensity or 255, radius or 10) end function Lights:clear() diff --git a/src/math/pxl8_math.h b/src/math/pxl8_math.h index 4288732..6cf314b 100644 --- a/src/math/pxl8_math.h +++ b/src/math/pxl8_math.h @@ -4,25 +4,12 @@ #include "pxl8_types.h" -#if defined(__x86_64__) || defined(_M_X64) - #define PXL8_SIMD_SSE - #include - #include - typedef __m128 pxl8_f32_simd; - typedef __m128i pxl8_i32_simd; - typedef __m128i pxl8_i16x8_simd; -#elif defined(__aarch64__) || defined(_M_ARM64) - #define PXL8_SIMD_NEON - #include - typedef float32x4_t pxl8_f32_simd; - typedef int32x4_t pxl8_i32_simd; - typedef int16x8_t pxl8_i16x8_simd; -#endif - -#if defined(PXL8_SIMD_SSE) || defined(PXL8_SIMD_NEON) -typedef struct pxl8_vec2_simd { pxl8_f32_simd x, y; } pxl8_vec2_simd; -typedef struct pxl8_vec3_simd { pxl8_f32_simd x, y, z; } pxl8_vec3_simd; -typedef struct pxl8_vec4_simd { pxl8_f32_simd x, y, z, w; } pxl8_vec4_simd; +#ifndef PXL8_NO_SIMD + #if defined(__x86_64__) || defined(_M_X64) + #include + #elif defined(__aarch64__) || defined(_M_ARM64) + #include + #endif #endif #define PXL8_PI 3.14159265358979323846f @@ -135,7 +122,3 @@ bool pxl8_frustum_test_aabb(const pxl8_frustum* frustum, pxl8_vec3 min, pxl8_vec #ifdef __cplusplus } #endif - -#if defined(PXL8_SIMD_SSE) || defined(PXL8_SIMD_NEON) -#include "pxl8_math_simd.h" -#endif diff --git a/src/math/pxl8_math_scalar.h b/src/math/pxl8_math_scalar.h deleted file mode 100644 index 27f9da1..0000000 --- a/src/math/pxl8_math_scalar.h +++ /dev/null @@ -1,265 +0,0 @@ -#pragma once - -static inline f32 pxl8_fast_inv_sqrt(f32 x) { - f32 half = 0.5f * x; - i32 i = *(i32*)&x; - i = 0x5f3759df - (i >> 1); - x = *(f32*)&i; - x = x * (1.5f - half * x * x); - return x; -} - -static inline u32 pxl8_hash32(u32 x) { - x ^= x >> 16; - x *= 0x85EBCA6Bu; - x ^= x >> 13; - x *= 0xC2B2AE35u; - x ^= x >> 16; - return x; -} - -static inline u64 pxl8_hash64(u64 x) { - x ^= x >> 33; - x *= 0xff51afd7ed558ccdULL; - x ^= x >> 33; - x *= 0xc4ceb9fe1a85ec53ULL; - x ^= x >> 33; - return x; -} - -static inline f32 pxl8_smoothstep(f32 t) { - return t * t * (3.0f - 2.0f * t); -} - -static inline pxl8_vec2 pxl8_vec2_add(pxl8_vec2 a, pxl8_vec2 b) { - return (pxl8_vec2){ .x = a.x + b.x, .y = a.y + b.y }; -} - -static inline pxl8_vec2 pxl8_vec2_sub(pxl8_vec2 a, pxl8_vec2 b) { - return (pxl8_vec2){ .x = a.x - b.x, .y = a.y - b.y }; -} - -static inline pxl8_vec2 pxl8_vec2_scale(pxl8_vec2 v, f32 s) { - return (pxl8_vec2){ .x = v.x * s, .y = v.y * s }; -} - -static inline f32 pxl8_vec2_cross(pxl8_vec2 a, pxl8_vec2 b) { - return a.x * b.y - a.y * b.x; -} - -static inline f32 pxl8_vec2_dot(pxl8_vec2 a, pxl8_vec2 b) { - return a.x * b.x + a.y * b.y; -} - -static inline f32 pxl8_vec2_length(pxl8_vec2 v) { - return sqrtf(v.x * v.x + v.y * v.y); -} - -static inline pxl8_vec2 pxl8_vec2_normalize(pxl8_vec2 v) { - f32 len_sq = pxl8_vec2_dot(v, v); - if (len_sq < 1e-12f) return (pxl8_vec2){0}; - return pxl8_vec2_scale(v, pxl8_fast_inv_sqrt(len_sq)); -} - -static inline pxl8_vec3 pxl8_vec3_add(pxl8_vec3 a, pxl8_vec3 b) { - return (pxl8_vec3){ .x = a.x + b.x, .y = a.y + b.y, .z = a.z + b.z }; -} - -static inline pxl8_vec3 pxl8_vec3_sub(pxl8_vec3 a, pxl8_vec3 b) { - return (pxl8_vec3){ .x = a.x - b.x, .y = a.y - b.y, .z = a.z - b.z }; -} - -static inline pxl8_vec3 pxl8_vec3_scale(pxl8_vec3 v, f32 s) { - return (pxl8_vec3){ .x = v.x * s, .y = v.y * s, .z = v.z * s }; -} - -static inline f32 pxl8_vec3_dot(pxl8_vec3 a, pxl8_vec3 b) { - return a.x * b.x + a.y * b.y + a.z * b.z; -} - -static inline pxl8_vec3 pxl8_vec3_cross(pxl8_vec3 a, pxl8_vec3 b) { - return (pxl8_vec3){ - .x = a.y * b.z - a.z * b.y, - .y = a.z * b.x - a.x * b.z, - .z = a.x * b.y - a.y * b.x, - }; -} - -static inline f32 pxl8_vec3_length(pxl8_vec3 v) { - return sqrtf(pxl8_vec3_dot(v, v)); -} - -static inline pxl8_vec3 pxl8_vec3_lerp(pxl8_vec3 a, pxl8_vec3 b, f32 t) { - return (pxl8_vec3){ - a.x + (b.x - a.x) * t, - a.y + (b.y - a.y) * t, - a.z + (b.z - a.z) * t - }; -} - -static inline pxl8_vec3 pxl8_vec3_normalize(pxl8_vec3 v) { - f32 len_sq = pxl8_vec3_dot(v, v); - if (len_sq < 1e-12f) return (pxl8_vec3){0}; - return pxl8_vec3_scale(v, pxl8_fast_inv_sqrt(len_sq)); -} - -static inline pxl8_mat4 pxl8_mat4_identity(void) { - pxl8_mat4 mat = {0}; - mat.m[0] = mat.m[5] = mat.m[10] = mat.m[15] = 1.0f; - return mat; -} - -static inline pxl8_mat4 pxl8_mat4_multiply(pxl8_mat4 a, pxl8_mat4 b) { - pxl8_mat4 mat = {0}; - for (i32 col = 0; col < 4; col++) { - for (i32 row = 0; row < 4; row++) { - mat.m[col * 4 + row] = - a.m[0 * 4 + row] * b.m[col * 4 + 0] + - a.m[1 * 4 + row] * b.m[col * 4 + 1] + - a.m[2 * 4 + row] * b.m[col * 4 + 2] + - a.m[3 * 4 + row] * b.m[col * 4 + 3]; - } - } - return mat; -} - -static inline pxl8_vec4 pxl8_mat4_multiply_vec4(pxl8_mat4 m, pxl8_vec4 v) { - return (pxl8_vec4){ - .x = m.m[0] * v.x + m.m[4] * v.y + m.m[8] * v.z + m.m[12] * v.w, - .y = m.m[1] * v.x + m.m[5] * v.y + m.m[9] * v.z + m.m[13] * v.w, - .z = m.m[2] * v.x + m.m[6] * v.y + m.m[10] * v.z + m.m[14] * v.w, - .w = m.m[3] * v.x + m.m[7] * v.y + m.m[11] * v.z + m.m[15] * v.w, - }; -} - -static inline pxl8_vec3 pxl8_mat4_multiply_vec3(pxl8_mat4 m, pxl8_vec3 v) { - return (pxl8_vec3){ - .x = m.m[0] * v.x + m.m[4] * v.y + m.m[8] * v.z, - .y = m.m[1] * v.x + m.m[5] * v.y + m.m[9] * v.z, - .z = m.m[2] * v.x + m.m[6] * v.y + m.m[10] * v.z, - }; -} - -static inline pxl8_mat4 pxl8_mat4_translate(f32 x, f32 y, f32 z) { - pxl8_mat4 mat = pxl8_mat4_identity(); - mat.m[12] = x; mat.m[13] = y; mat.m[14] = z; - return mat; -} - -static inline pxl8_mat4 pxl8_mat4_rotate_x(f32 angle) { - pxl8_mat4 mat = pxl8_mat4_identity(); - f32 c = cosf(angle), s = sinf(angle); - mat.m[5] = c; mat.m[9] = -s; mat.m[6] = s; mat.m[10] = c; - return mat; -} - -static inline pxl8_mat4 pxl8_mat4_rotate_y(f32 angle) { - pxl8_mat4 mat = pxl8_mat4_identity(); - f32 c = cosf(angle), s = sinf(angle); - mat.m[0] = c; mat.m[8] = s; mat.m[2] = -s; mat.m[10] = c; - return mat; -} - -static inline pxl8_mat4 pxl8_mat4_rotate_z(f32 angle) { - pxl8_mat4 mat = pxl8_mat4_identity(); - f32 c = cosf(angle), s = sinf(angle); - mat.m[0] = c; mat.m[4] = -s; mat.m[1] = s; mat.m[5] = c; - return mat; -} - -static inline pxl8_mat4 pxl8_mat4_scale(f32 x, f32 y, f32 z) { - pxl8_mat4 mat = pxl8_mat4_identity(); - mat.m[0] = x; mat.m[5] = y; mat.m[10] = z; - return mat; -} - -static inline pxl8_mat4 pxl8_mat4_orthographic(f32 left, f32 right, f32 bottom, f32 top, f32 near, f32 far) { - pxl8_mat4 mat = {0}; - mat.m[0] = 2.0f / (right - left); - mat.m[5] = 2.0f / (top - bottom); - mat.m[10] = -2.0f / (far - near); - mat.m[12] = -(right + left) / (right - left); - mat.m[13] = -(top + bottom) / (top - bottom); - mat.m[14] = -(far + near) / (far - near); - mat.m[15] = 1.0f; - return mat; -} - -static inline pxl8_mat4 pxl8_mat4_perspective(f32 fov, f32 aspect, f32 near, f32 far) { - pxl8_mat4 mat = {0}; - f32 tan_half_fov = tanf(fov / 2.0f); - mat.m[0] = 1.0f / (aspect * tan_half_fov); - mat.m[5] = 1.0f / tan_half_fov; - mat.m[10] = -(far + near) / (far - near); - mat.m[14] = -(2.0f * far * near) / (far - near); - mat.m[11] = -1.0f; - return mat; -} - -static inline pxl8_mat4 pxl8_mat4_lookat(pxl8_vec3 eye, pxl8_vec3 center, pxl8_vec3 up) { - pxl8_mat4 mat = pxl8_mat4_identity(); - pxl8_vec3 f = pxl8_vec3_normalize(pxl8_vec3_sub(center, eye)); - pxl8_vec3 s = pxl8_vec3_normalize(pxl8_vec3_cross(f, up)); - pxl8_vec3 u = pxl8_vec3_cross(s, f); - mat.m[0] = s.x; mat.m[4] = s.y; mat.m[8] = s.z; - mat.m[1] = u.x; mat.m[5] = u.y; mat.m[9] = u.z; - mat.m[2] = -f.x; mat.m[6] = -f.y; mat.m[10] = -f.z; - mat.m[12] = -pxl8_vec3_dot(s, eye); - mat.m[13] = -pxl8_vec3_dot(u, eye); - mat.m[14] = pxl8_vec3_dot(f, eye); - return mat; -} - -static inline pxl8_frustum pxl8_frustum_from_matrix(pxl8_mat4 vp) { - pxl8_frustum frustum; - const f32* m = vp.m; - frustum.planes[0].normal.x = m[3] + m[0]; - frustum.planes[0].normal.y = m[7] + m[4]; - frustum.planes[0].normal.z = m[11] + m[8]; - frustum.planes[0].distance = m[15] + m[12]; - frustum.planes[1].normal.x = m[3] - m[0]; - frustum.planes[1].normal.y = m[7] - m[4]; - frustum.planes[1].normal.z = m[11] - m[8]; - frustum.planes[1].distance = m[15] - m[12]; - frustum.planes[2].normal.x = m[3] + m[1]; - frustum.planes[2].normal.y = m[7] + m[5]; - frustum.planes[2].normal.z = m[11] + m[9]; - frustum.planes[2].distance = m[15] + m[13]; - frustum.planes[3].normal.x = m[3] - m[1]; - frustum.planes[3].normal.y = m[7] - m[5]; - frustum.planes[3].normal.z = m[11] - m[9]; - frustum.planes[3].distance = m[15] - m[13]; - frustum.planes[4].normal.x = m[3] + m[2]; - frustum.planes[4].normal.y = m[7] + m[6]; - frustum.planes[4].normal.z = m[11] + m[10]; - frustum.planes[4].distance = m[15] + m[14]; - frustum.planes[5].normal.x = m[3] - m[2]; - frustum.planes[5].normal.y = m[7] - m[6]; - frustum.planes[5].normal.z = m[11] - m[10]; - frustum.planes[5].distance = m[15] - m[14]; - for (i32 i = 0; i < 6; i++) { - f32 len = pxl8_vec3_length(frustum.planes[i].normal); - if (len > 1e-6f) { - f32 inv_len = 1.0f / len; - frustum.planes[i].normal = pxl8_vec3_scale(frustum.planes[i].normal, inv_len); - frustum.planes[i].distance *= inv_len; - } - } - return frustum; -} - -static inline bool pxl8_frustum_test_aabb(const pxl8_frustum* frustum, pxl8_vec3 min, pxl8_vec3 max) { - const f32 FRUSTUM_EPSILON = -75.0f; - for (i32 i = 0; i < 6; i++) { - pxl8_vec3 normal = frustum->planes[i].normal; - f32 d = frustum->planes[i].distance; - pxl8_vec3 p_vertex = { - (normal.x > 0.0f) ? max.x : min.x, - (normal.y > 0.0f) ? max.y : min.y, - (normal.z > 0.0f) ? max.z : min.z - }; - f32 p_dist = pxl8_vec3_dot(normal, p_vertex) + d; - if (p_dist < FRUSTUM_EPSILON) return false; - } - return true; -} diff --git a/src/math/pxl8_math_simd.h b/src/math/pxl8_math_simd.h deleted file mode 100644 index 1d900f7..0000000 --- a/src/math/pxl8_math_simd.h +++ /dev/null @@ -1,389 +0,0 @@ -#pragma once - -#if defined(PXL8_SIMD_NEON) || defined(PXL8_SIMD_SSE) - -static inline pxl8_f32_simd pxl8_f32_simd_set(f32 x) { -#if defined(PXL8_SIMD_NEON) - return vdupq_n_f32(x); -#else - return _mm_set1_ps(x); -#endif -} - -static inline pxl8_f32_simd pxl8_f32_simd_set4(f32 a, f32 b, f32 c, f32 d) { -#if defined(PXL8_SIMD_NEON) - f32 arr[4] = {a, b, c, d}; - return vld1q_f32(arr); -#else - return _mm_set_ps(d, c, b, a); -#endif -} - -static inline pxl8_f32_simd pxl8_f32_simd_zero(void) { -#if defined(PXL8_SIMD_NEON) - return vdupq_n_f32(0.0f); -#else - return _mm_setzero_ps(); -#endif -} - -static inline pxl8_f32_simd pxl8_f32_simd_load(const f32* ptr) { -#if defined(PXL8_SIMD_NEON) - return vld1q_f32(ptr); -#else - return _mm_loadu_ps(ptr); -#endif -} - -static inline void pxl8_f32_simd_store(f32* ptr, pxl8_f32_simd v) { -#if defined(PXL8_SIMD_NEON) - vst1q_f32(ptr, v); -#else - _mm_storeu_ps(ptr, v); -#endif -} - -static inline pxl8_f32_simd pxl8_f32_simd_add(pxl8_f32_simd a, pxl8_f32_simd b) { -#if defined(PXL8_SIMD_NEON) - return vaddq_f32(a, b); -#else - return _mm_add_ps(a, b); -#endif -} - -static inline pxl8_f32_simd pxl8_f32_simd_sub(pxl8_f32_simd a, pxl8_f32_simd b) { -#if defined(PXL8_SIMD_NEON) - return vsubq_f32(a, b); -#else - return _mm_sub_ps(a, b); -#endif -} - -static inline pxl8_f32_simd pxl8_f32_simd_mul(pxl8_f32_simd a, pxl8_f32_simd b) { -#if defined(PXL8_SIMD_NEON) - return vmulq_f32(a, b); -#else - return _mm_mul_ps(a, b); -#endif -} - -static inline pxl8_f32_simd pxl8_f32_simd_div(pxl8_f32_simd a, pxl8_f32_simd b) { -#if defined(PXL8_SIMD_NEON) - return vdivq_f32(a, b); -#else - return _mm_div_ps(a, b); -#endif -} - -static inline pxl8_f32_simd pxl8_f32_simd_rsqrt(pxl8_f32_simd x) { -#if defined(PXL8_SIMD_NEON) - float32x4_t est = vrsqrteq_f32(x); - est = vmulq_f32(est, vrsqrtsq_f32(vmulq_f32(x, est), est)); - return est; -#else - return _mm_rsqrt_ps(x); -#endif -} - -static inline pxl8_f32_simd pxl8_f32_simd_rcp(pxl8_f32_simd x) { -#if defined(PXL8_SIMD_NEON) - float32x4_t est = vrecpeq_f32(x); - est = vmulq_f32(est, vrecpsq_f32(x, est)); - return est; -#else - pxl8_f32_simd rcp = _mm_rcp_ps(x); - rcp = _mm_add_ps(rcp, _mm_mul_ps(rcp, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(x, rcp)))); - return rcp; -#endif -} - -static inline pxl8_f32_simd pxl8_f32_simd_max(pxl8_f32_simd a, pxl8_f32_simd b) { -#if defined(PXL8_SIMD_NEON) - return vmaxq_f32(a, b); -#else - return _mm_max_ps(a, b); -#endif -} - -static inline pxl8_f32_simd pxl8_f32_simd_min(pxl8_f32_simd a, pxl8_f32_simd b) { -#if defined(PXL8_SIMD_NEON) - return vminq_f32(a, b); -#else - return _mm_min_ps(a, b); -#endif -} - -static inline pxl8_f32_simd pxl8_f32_simd_clamp(pxl8_f32_simd x, pxl8_f32_simd lo, pxl8_f32_simd hi) { - return pxl8_f32_simd_min(pxl8_f32_simd_max(x, lo), hi); -} - -static inline pxl8_i32_simd pxl8_f32_simd_to_i32(pxl8_f32_simd v) { -#if defined(PXL8_SIMD_NEON) - return vcvtq_s32_f32(v); -#else - return _mm_cvtps_epi32(v); -#endif -} - -static inline pxl8_f32_simd pxl8_f32_simd_cmpgt(pxl8_f32_simd a, pxl8_f32_simd b) { -#if defined(PXL8_SIMD_NEON) - return vreinterpretq_f32_u32(vcgtq_f32(a, b)); -#else - return _mm_cmpgt_ps(a, b); -#endif -} - -static inline i32 pxl8_f32_simd_movemask(pxl8_f32_simd v) { -#if defined(PXL8_SIMD_NEON) - static const i32 shift_arr[4] = {0, 1, 2, 3}; - uint32x4_t mask = vreinterpretq_u32_f32(v); - int32x4_t shift = vld1q_s32(shift_arr); - mask = vshrq_n_u32(mask, 31); - mask = vshlq_u32(mask, shift); - return (i32)vaddvq_u32(mask); -#else - return _mm_movemask_ps(v); -#endif -} - -static inline pxl8_i32_simd pxl8_i32_simd_set(i32 x) { -#if defined(PXL8_SIMD_NEON) - return vdupq_n_s32(x); -#else - return _mm_set1_epi32(x); -#endif -} - -static inline pxl8_i32_simd pxl8_i32_simd_set4(i32 a, i32 b, i32 c, i32 d) { -#if defined(PXL8_SIMD_NEON) - i32 arr[4] = {a, b, c, d}; - return vld1q_s32(arr); -#else - return _mm_set_epi32(d, c, b, a); -#endif -} - -static inline pxl8_i32_simd pxl8_i32_simd_zero(void) { -#if defined(PXL8_SIMD_NEON) - return vdupq_n_s32(0); -#else - return _mm_setzero_si128(); -#endif -} - -static inline pxl8_i32_simd pxl8_i32_simd_load(const i32* ptr) { -#if defined(PXL8_SIMD_NEON) - return vld1q_s32(ptr); -#else - return _mm_loadu_si128((const __m128i*)ptr); -#endif -} - -static inline void pxl8_i32_simd_store(i32* ptr, pxl8_i32_simd v) { -#if defined(PXL8_SIMD_NEON) - vst1q_s32(ptr, v); -#else - _mm_storeu_si128((__m128i*)ptr, v); -#endif -} - -static inline pxl8_i32_simd pxl8_i32_simd_add(pxl8_i32_simd a, pxl8_i32_simd b) { -#if defined(PXL8_SIMD_NEON) - return vaddq_s32(a, b); -#else - return _mm_add_epi32(a, b); -#endif -} - -static inline pxl8_i32_simd pxl8_i32_simd_sub(pxl8_i32_simd a, pxl8_i32_simd b) { -#if defined(PXL8_SIMD_NEON) - return vsubq_s32(a, b); -#else - return _mm_sub_epi32(a, b); -#endif -} - -static inline pxl8_i32_simd pxl8_i32_simd_srli(pxl8_i32_simd v, i32 imm) { -#if defined(PXL8_SIMD_NEON) - int32x4_t shift = vdupq_n_s32(-imm); - return vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v), shift)); -#else - return _mm_srli_epi32(v, imm); -#endif -} - -static inline pxl8_i32_simd pxl8_i32_simd_or(pxl8_i32_simd a, pxl8_i32_simd b) { -#if defined(PXL8_SIMD_NEON) - return vorrq_s32(a, b); -#else - return _mm_or_si128(a, b); -#endif -} - -static inline pxl8_i32_simd pxl8_i32_simd_and(pxl8_i32_simd a, pxl8_i32_simd b) { -#if defined(PXL8_SIMD_NEON) - return vandq_s32(a, b); -#else - return _mm_and_si128(a, b); -#endif -} - -static inline pxl8_i32_simd pxl8_i32_simd_cmpgt(pxl8_i32_simd a, pxl8_i32_simd b) { -#if defined(PXL8_SIMD_NEON) - return vreinterpretq_s32_u32(vcgtq_s32(a, b)); -#else - return _mm_cmpgt_epi32(a, b); -#endif -} - -static inline i32 pxl8_i32_simd_movemask(pxl8_i32_simd v) { -#if defined(PXL8_SIMD_NEON) - static const i32 shift_arr[4] = {0, 1, 2, 3}; - uint32x4_t mask = vreinterpretq_u32_s32(v); - int32x4_t shift = vld1q_s32(shift_arr); - mask = vshrq_n_u32(mask, 31); - mask = vshlq_u32(mask, shift); - return (i32)vaddvq_u32(mask); -#else - return _mm_movemask_epi8(v); -#endif -} - -static inline pxl8_f32_simd pxl8_i32_simd_to_f32(pxl8_i32_simd v) { -#if defined(PXL8_SIMD_NEON) - return vcvtq_f32_s32(v); -#else - return _mm_cvtepi32_ps(v); -#endif -} - -static inline pxl8_i16x8_simd pxl8_i16x8_simd_set(i16 x) { -#if defined(PXL8_SIMD_NEON) - return vdupq_n_s16(x); -#else - return _mm_set1_epi16(x); -#endif -} - -static inline pxl8_i16x8_simd pxl8_i16x8_simd_add(pxl8_i16x8_simd a, pxl8_i16x8_simd b) { -#if defined(PXL8_SIMD_NEON) - return vaddq_s16(a, b); -#else - return _mm_add_epi16(a, b); -#endif -} - -static inline pxl8_i16x8_simd pxl8_i16x8_simd_sub(pxl8_i16x8_simd a, pxl8_i16x8_simd b) { -#if defined(PXL8_SIMD_NEON) - return vsubq_s16(a, b); -#else - return _mm_sub_epi16(a, b); -#endif -} - -static inline pxl8_i16x8_simd pxl8_i16x8_simd_mullo(pxl8_i16x8_simd a, pxl8_i16x8_simd b) { -#if defined(PXL8_SIMD_NEON) - return vmulq_s16(a, b); -#else - return _mm_mullo_epi16(a, b); -#endif -} - -static inline pxl8_i16x8_simd pxl8_i16x8_simd_srai(pxl8_i16x8_simd v, i32 imm) { -#if defined(PXL8_SIMD_NEON) - int16x8_t shift = vdupq_n_s16((i16)(-imm)); - return vshlq_s16(v, shift); -#else - return _mm_srai_epi16(v, imm); -#endif -} - -static inline pxl8_i16x8_simd pxl8_i32_simd_unpacklo8(pxl8_i32_simd v) { -#if defined(PXL8_SIMD_NEON) - uint8x16_t bytes = vreinterpretq_u8_s32(v); - return vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bytes))); -#else - return _mm_unpacklo_epi8(v, _mm_setzero_si128()); -#endif -} - -static inline pxl8_i16x8_simd pxl8_i32_simd_unpackhi8(pxl8_i32_simd v) { -#if defined(PXL8_SIMD_NEON) - uint8x16_t bytes = vreinterpretq_u8_s32(v); - return vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bytes))); -#else - return _mm_unpackhi_epi8(v, _mm_setzero_si128()); -#endif -} - -static inline pxl8_i16x8_simd pxl8_i16x8_simd_shuffle_3333(pxl8_i16x8_simd v) { -#if defined(PXL8_SIMD_NEON) - int16x4_t lo = vget_low_s16(v); - int16x4_t hi = vget_high_s16(v); - int16x4_t lo_bcast = vdup_lane_s16(lo, 3); - int16x4_t hi_bcast = vdup_lane_s16(hi, 3); - return vcombine_s16(lo_bcast, hi_bcast); -#else - return _mm_shufflehi_epi16( - _mm_shufflelo_epi16(v, _MM_SHUFFLE(3, 3, 3, 3)), - _MM_SHUFFLE(3, 3, 3, 3)); -#endif -} - -static inline pxl8_i32_simd pxl8_i16x8_simd_packus(pxl8_i16x8_simd lo, pxl8_i16x8_simd hi) { -#if defined(PXL8_SIMD_NEON) - uint8x8_t lo8 = vqmovun_s16(lo); - uint8x8_t hi8 = vqmovun_s16(hi); - return vreinterpretq_s32_u8(vcombine_u8(lo8, hi8)); -#else - return _mm_packus_epi16(lo, hi); -#endif -} - -static inline pxl8_vec3_simd pxl8_vec3_simd_set(pxl8_vec3 v) { - return (pxl8_vec3_simd){ - pxl8_f32_simd_set(v.x), - pxl8_f32_simd_set(v.y), - pxl8_f32_simd_set(v.z), - }; -} - -static inline pxl8_vec3_simd pxl8_vec3_simd_add(pxl8_vec3_simd a, pxl8_vec3_simd b) { - return (pxl8_vec3_simd){ - pxl8_f32_simd_add(a.x, b.x), - pxl8_f32_simd_add(a.y, b.y), - pxl8_f32_simd_add(a.z, b.z), - }; -} - -static inline pxl8_vec3_simd pxl8_vec3_simd_sub(pxl8_vec3_simd a, pxl8_vec3_simd b) { - return (pxl8_vec3_simd){ - pxl8_f32_simd_sub(a.x, b.x), - pxl8_f32_simd_sub(a.y, b.y), - pxl8_f32_simd_sub(a.z, b.z), - }; -} - -static inline pxl8_vec3_simd pxl8_vec3_simd_scale(pxl8_vec3_simd v, pxl8_f32_simd s) { - return (pxl8_vec3_simd){ - pxl8_f32_simd_mul(v.x, s), - pxl8_f32_simd_mul(v.y, s), - pxl8_f32_simd_mul(v.z, s), - }; -} - -static inline pxl8_f32_simd pxl8_vec3_simd_dot(pxl8_vec3_simd a, pxl8_vec3_simd b) { - return pxl8_f32_simd_add( - pxl8_f32_simd_add( - pxl8_f32_simd_mul(a.x, b.x), - pxl8_f32_simd_mul(a.y, b.y)), - pxl8_f32_simd_mul(a.z, b.z)); -} - -static inline pxl8_vec3_simd pxl8_vec3_simd_normalize(pxl8_vec3_simd v) { - pxl8_f32_simd len_sq = pxl8_vec3_simd_dot(v, v); - pxl8_f32_simd inv_len = pxl8_f32_simd_rsqrt(len_sq); - return pxl8_vec3_simd_scale(v, inv_len); -} - -#endif diff --git a/src/script/pxl8_script_ffi.h b/src/script/pxl8_script_ffi.h index a7577a3..036da56 100644 --- a/src/script/pxl8_script_ffi.h +++ b/src/script/pxl8_script_ffi.h @@ -205,17 +205,18 @@ static const char* pxl8_ffi_cdefs = "typedef struct { float m[16]; } pxl8_mat4;\n" "\n" "typedef struct pxl8_light {\n" -" u8 color;\n" -" f32 intensity;\n" -" f32 inv_radius_sq;\n" " pxl8_vec3 position;\n" +" f32 inv_radius_sq;\n" +" u8 r, g, b;\n" +" u8 intensity;\n" +" f32 radius;\n" " f32 radius_sq;\n" "} pxl8_light;\n" "\n" "typedef struct pxl8_lights pxl8_lights;\n" "pxl8_lights* pxl8_lights_create(u32 capacity);\n" "void pxl8_lights_destroy(pxl8_lights* lights);\n" -"void pxl8_lights_add(pxl8_lights* lights, f32 x, f32 y, f32 z, u8 color, u8 intensity, f32 radius);\n" +"void pxl8_lights_add(pxl8_lights* lights, f32 x, f32 y, f32 z, u8 r, u8 g, u8 b, u8 intensity, f32 radius);\n" "void pxl8_lights_clear(pxl8_lights* lights);\n" "u32 pxl8_lights_count(const pxl8_lights* lights);\n" "const pxl8_light* pxl8_lights_data(const pxl8_lights* lights);\n"