/* pxl8/src/gfx/pxl8_render.c (1551 lines, 57 KiB; snapshot dated 2026-02-02 17:48:25 -06:00) */
#include "pxl8_render.h"
#include "pxl8_atlas.h"
#include "pxl8_colormap.h"
#include "pxl8_dither.h"
#include "pxl8_hal.h"
#include "pxl8_log.h"
#include "pxl8_mem.h"
#include "pxl8_mesh.h"
#include "pxl8_shader.h"
#include <math.h>
#include <stdlib.h>
#include <string.h>
/*
 * Statistics helpers. When PXL8_GFX_ENABLE_STATS is non-zero they update
 * counters / elapsed-tick fields on a stats struct; otherwise they compile
 * to no-ops that only reference their arguments to silence unused warnings.
 */
#if PXL8_GFX_ENABLE_STATS
/* Add `val` to `stats->field`. */
#define STATS_INC(stats, field, val) do { (stats)->field += (val); } while (0)
/* Capture a start timestamp from pxl8_get_ticks_ns(). */
#define STATS_START() pxl8_get_ticks_ns()
/* Add the ticks elapsed since `start` to `stats->field`. */
#define STATS_ADD(stats, field, start) do { (stats)->field += pxl8_get_ticks_ns() - (start); } while (0)
#else
/* Disabled variant: `val` is not evaluated. */
#define STATS_INC(stats, field, val) do { (void)(stats); } while (0)
/* Disabled variant: always yields 0. */
#define STATS_START() 0
/* Disabled variant: no timing is recorded. */
#define STATS_ADD(stats, field, start) do { (void)(stats); (void)(start); } while (0)
#endif
/* Per-vertex data fed to near-plane clipping and triangle setup. */
typedef struct {
/* Clip-space position; setup_tri divides x/y/z by w to reach screen space. */
pxl8_vec4 clip_pos;
/* World-space position, interpolated and handed to the shader as v_world. */
pxl8_vec3 world_pos;
/* Vertex normal; setup_tri keeps only one vertex's normal per triangle. */
pxl8_vec3 normal;
/* Texture coordinates. */
f32 u, v;
/* 8-bit colour value, interpolated across the triangle. */
u8 color;
/* 8-bit light value; the rasterizer scales it by 1/255 before shading. */
u8 light;
} raster_vertex;
/*
 * Precomputed per-triangle state produced by setup_tri() and consumed by
 * rasterize_triangle(). In the pxl8_vec3 attribute fields the x/y/z
 * components hold the values for sorted vertices 0/1/2 respectively.
 */
typedef struct {
/* Screen-space vertices sorted by ascending y; z holds clip z / w. */
pxl8_vec3 p0, p1, p2;
/* 1/w for each sorted vertex. */
pxl8_vec3 w_recip;
/* Texture u and v pre-multiplied by 1/w. */
pxl8_vec3 u_w, v_w;
/* Light pre-multiplied by 1/w. */
pxl8_vec3 l_w;
/* Colour pre-multiplied by 1/w. */
pxl8_vec3 c_w;
/* World-space position of each sorted vertex scaled by its 1/w. */
pxl8_vec3 world0_w, world1_w, world2_w;
/* Normal copied from the top-most (sorted[0]) vertex; constant per triangle. */
pxl8_vec3 normal;
/* Inclusive scanline range, already clamped to the clip rectangle in y. */
i32 y_start, y_end;
/* 1 / (p2.y - p0.y). */
f32 inv_total;
/* Set by setup_tri from the viewport width/height. */
u32 target_width, target_height;
/* Inclusive clip rectangle copied from the caller. */
i32 clip_min_x, clip_min_y;
i32 clip_max_x, clip_max_y;
} tri_setup;
/*
 * Apply depth comparison `func` to an incoming depth `src` against the stored
 * depth `dst`. Returns true when the fragment passes. A value of `func` that
 * matches no known comparison passes.
 */
static inline bool depth_test_pass(pxl8_gfx_compare_func func, u16 src, u16 dst) {
    bool passes = true;
    switch (func) {
        case PXL8_GFX_COMPARE_NEVER:    passes = false;      break;
        case PXL8_GFX_COMPARE_LESS:     passes = src < dst;  break;
        case PXL8_GFX_COMPARE_EQUAL:    passes = src == dst; break;
        case PXL8_GFX_COMPARE_LEQUAL:   passes = src <= dst; break;
        case PXL8_GFX_COMPARE_GREATER:  passes = src > dst;  break;
        case PXL8_GFX_COMPARE_NOTEQUAL: passes = src != dst; break;
        case PXL8_GFX_COMPARE_GEQUAL:   passes = src >= dst; break;
        case PXL8_GFX_COMPARE_ALWAYS:   passes = true;       break;
    }
    return passes;
}
/*
 * Resolve a blend factor enum to its numeric weight given the source and
 * destination alpha. A factor matching no known value yields 1.0.
 */
static inline f32 blend_factor_value(pxl8_gfx_blend_factor factor, f32 src_a, f32 dst_a) {
    f32 weight = 1.0f;
    switch (factor) {
        case PXL8_GFX_BLEND_ZERO:          weight = 0.0f;         break;
        case PXL8_GFX_BLEND_ONE:           weight = 1.0f;         break;
        case PXL8_GFX_BLEND_SRC_ALPHA:     weight = src_a;        break;
        case PXL8_GFX_BLEND_INV_SRC_ALPHA: weight = 1.0f - src_a; break;
        case PXL8_GFX_BLEND_DST_ALPHA:     weight = dst_a;        break;
        case PXL8_GFX_BLEND_INV_DST_ALPHA: weight = 1.0f - dst_a; break;
    }
    return weight;
}
/*
 * Find the palette index whose colour is nearest to (r, g, b) by squared
 * Euclidean distance. Entries are read as R in bits 0-7, G in bits 8-15 and
 * B in bits 16-23. Index 0 is never considered; the search covers 1..255 and
 * stops early on an exact match. Returns 0 when `palette` is NULL.
 */
static u8 palette_find_closest(const u32* palette, u8 r, u8 g, u8 b) {
    if (!palette) {
        return 0;
    }

    u32 nearest_idx = 1;
    u32 nearest_dist = 0xFFFFFFFF;

    for (u32 idx = 1; idx < 256; idx++) {
        const u32 entry = palette[idx];
        const i32 diff_r = (i32)r - (i32)(u8)(entry & 0xFF);
        const i32 diff_g = (i32)g - (i32)(u8)((entry >> 8) & 0xFF);
        const i32 diff_b = (i32)b - (i32)(u8)((entry >> 16) & 0xFF);
        const u32 dist = (u32)(diff_r * diff_r + diff_g * diff_g + diff_b * diff_b);

        if (dist >= nearest_dist) {
            continue;
        }
        nearest_dist = dist;
        nearest_idx = idx;
        if (dist == 0) {
            break;
        }
    }

    return (u8)nearest_idx;
}
static u8 blend_indexed(
const pxl8_gfx_pipeline_desc* pipeline,
u8 src,
u8 dst,
const u32* palette,
const u8* colormap
) {
(void)colormap;
if (!pipeline || !pipeline->blend.enabled) return src;
if (src == 0) return dst;
if (!palette) return src;
f32 src_a = src == 0 ? 0.0f : 1.0f;
f32 dst_a = dst == 0 ? 0.0f : 1.0f;
f32 sf = blend_factor_value(pipeline->blend.src, src_a, dst_a);
f32 df = blend_factor_value(pipeline->blend.dst, src_a, dst_a);
if (sf == 1.0f && df == 0.0f) return src;
if (sf == 0.0f && df == 1.0f) return dst;
u8 sr = palette[src] & 0xFF;
u8 sg = (palette[src] >> 8) & 0xFF;
u8 sb = (palette[src] >> 16) & 0xFF;
u8 dr = palette[dst] & 0xFF;
u8 dg = (palette[dst] >> 8) & 0xFF;
u8 db = (palette[dst] >> 16) & 0xFF;
i32 out_r = (i32)(sr * sf + dr * df);
i32 out_g = (i32)(sg * sf + dg * df);
i32 out_b = (i32)(sb * sf + db * df);
if (out_r < 0) out_r = 0;
if (out_g < 0) out_g = 0;
if (out_b < 0) out_b = 0;
if (out_r > 255) out_r = 255;
if (out_g > 255) out_g = 255;
if (out_b > 255) out_b = 255;
return palette_find_closest(palette, (u8)out_r, (u8)out_g, (u8)out_b);
}
/* Component-wise linear interpolation from `a` (t = 0) to `b` (t = 1). */
static inline pxl8_vec4 vec4_lerp(pxl8_vec4 a, pxl8_vec4 b, f32 t) {
    const f32 lx = a.x + (b.x - a.x) * t;
    const f32 ly = a.y + (b.y - a.y) * t;
    const f32 lz = a.z + (b.z - a.z) * t;
    const f32 lw = a.w + (b.w - a.w) * t;
    return (pxl8_vec4){ lx, ly, lz, lw };
}
/*
 * Interpolate every attribute of two raster vertices at parameter `t`
 * (0 yields `a`, 1 yields `b`). The 8-bit colour and light values are
 * interpolated in floating point and truncated back to u8.
 */
static raster_vertex lerp_raster_vertex(const raster_vertex* a, const raster_vertex* b, f32 t) {
    raster_vertex mid;

    mid.clip_pos = vec4_lerp(a->clip_pos, b->clip_pos, t);
    mid.world_pos = pxl8_vec3_lerp(a->world_pos, b->world_pos, t);
    mid.normal = pxl8_vec3_lerp(a->normal, b->normal, t);
    mid.u = pxl8_lerp(a->u, b->u, t);
    mid.v = pxl8_lerp(a->v, b->v, t);
    mid.color = (u8)(a->color + (b->color - a->color) * t);
    mid.light = (u8)(a->light + (b->light - a->light) * t);

    return mid;
}
/*
 * Clip a triangle against the plane clip_pos.w == near.
 *
 * Vertices with w >= near count as inside. Writes the resulting vertices to
 * `out` and returns how many were written:
 *   0 - triangle entirely outside,
 *   3 - triangle untouched, or reduced to one smaller triangle,
 *   6 - two triangles covering the clipped quad.
 * The original vertex order (winding) is preserved in the output.
 */
static i32 clip_triangle_near(
    const raster_vertex* v0, const raster_vertex* v1, const raster_vertex* v2,
    f32 near, raster_vertex out[6]
) {
    const raster_vertex* tri[3] = { v0, v1, v2 };
    const bool is_inside[3] = {
        v0->clip_pos.w >= near,
        v1->clip_pos.w >= near,
        v2->clip_pos.w >= near,
    };
    const i32 inside_count = is_inside[0] + is_inside[1] + is_inside[2];

    if (inside_count == 0) {
        return 0;
    }
    if (inside_count == 3) {
        out[0] = *v0;
        out[1] = *v1;
        out[2] = *v2;
        return 3;
    }

    /* Exactly one vertex lies on a different side from the other two; find
     * it and walk the remaining pair in cyclic order to keep the winding. */
    const bool lone_is_inside = (inside_count == 1);
    i32 lone_idx = 0;
    while (is_inside[lone_idx] != lone_is_inside) {
        lone_idx++;
    }
    const raster_vertex* lone = tri[lone_idx];
    const raster_vertex* next = tri[(lone_idx + 1) % 3];
    const raster_vertex* prev = tri[(lone_idx + 2) % 3];

    if (lone_is_inside) {
        /* One vertex survives: pull the two outside vertices onto the plane. */
        const f32 t_next = (near - next->clip_pos.w) / (lone->clip_pos.w - next->clip_pos.w);
        const f32 t_prev = (near - prev->clip_pos.w) / (lone->clip_pos.w - prev->clip_pos.w);
        out[0] = *lone;
        out[1] = lerp_raster_vertex(next, lone, t_next);
        out[2] = lerp_raster_vertex(prev, lone, t_prev);
        return 3;
    }

    /* Two vertices survive: the clipped shape is a quad, emitted as two triangles. */
    const f32 t_next = (near - lone->clip_pos.w) / (next->clip_pos.w - lone->clip_pos.w);
    const f32 t_prev = (near - lone->clip_pos.w) / (prev->clip_pos.w - lone->clip_pos.w);
    const raster_vertex cut_next = lerp_raster_vertex(lone, next, t_next);
    const raster_vertex cut_prev = lerp_raster_vertex(lone, prev, t_prev);

    out[0] = *next;
    out[1] = *prev;
    out[2] = cut_prev;
    out[3] = *next;
    out[4] = cut_prev;
    out[5] = cut_next;
    return 6;
}
/*
 * Prepare a triangle for rasterization.
 *
 * Projects the three clip-space vertices into the viewport, applies face
 * culling, sorts the vertices by ascending screen y, and precomputes the
 * 1/w-scaled attributes used for perspective-correct interpolation.
 *
 * Returns false (nothing to draw) when the viewport is empty, the triangle
 * is culled, or it is less than one pixel tall; `setup` may be partially
 * written in that case. Returns true when `setup` is fully populated.
 *
 * NOTE(review): every vertex is divided by clip_pos.w with no zero check;
 * presumably callers clip against the near plane first - confirm.
 */
static bool setup_tri(
tri_setup* setup,
const raster_vertex* vo0, const raster_vertex* vo1, const raster_vertex* vo2,
i32 viewport_x, i32 viewport_y, u32 viewport_w, u32 viewport_h,
i32 clip_min_x, i32 clip_min_y, i32 clip_max_x, i32 clip_max_y,
pxl8_gfx_cull_mode cull, bool double_sided
) {
if (viewport_w == 0 || viewport_h == 0) return false;
f32 hw = (f32)viewport_w * 0.5f;
f32 hh = (f32)viewport_h * 0.5f;
/* Perspective divide and viewport transform; y is flipped so +y in clip
 * space maps towards smaller screen rows. */
setup->p0.x = (f32)viewport_x + hw + vo0->clip_pos.x / vo0->clip_pos.w * hw;
setup->p0.y = (f32)viewport_y + hh - vo0->clip_pos.y / vo0->clip_pos.w * hh;
setup->p0.z = vo0->clip_pos.z / vo0->clip_pos.w;
setup->p1.x = (f32)viewport_x + hw + vo1->clip_pos.x / vo1->clip_pos.w * hw;
setup->p1.y = (f32)viewport_y + hh - vo1->clip_pos.y / vo1->clip_pos.w * hh;
setup->p1.z = vo1->clip_pos.z / vo1->clip_pos.w;
setup->p2.x = (f32)viewport_x + hw + vo2->clip_pos.x / vo2->clip_pos.w * hw;
setup->p2.y = (f32)viewport_y + hh - vo2->clip_pos.y / vo2->clip_pos.w * hh;
setup->p2.z = vo2->clip_pos.z / vo2->clip_pos.w;
/* 2D cross product of the screen-space edges (p1-p0) x (p2-p0); its sign
 * gives the winding and must be taken before the vertices are reordered. */
f32 cross = (setup->p1.x - setup->p0.x) * (setup->p2.y - setup->p0.y) -
(setup->p1.y - setup->p0.y) * (setup->p2.x - setup->p0.x);
/* Zero-area triangles are rejected by either cull mode. */
if (!double_sided) {
if (cull == PXL8_GFX_CULL_BACK && cross >= 0.0f) return false;
if (cull == PXL8_GFX_CULL_FRONT && cross <= 0.0f) return false;
}
/* Three-swap sort by ascending y. `sorted` is permuted in lock-step so
 * attributes stay paired with their screen positions. */
const raster_vertex* sorted[3] = {vo0, vo1, vo2};
if (setup->p0.y > setup->p1.y) {
pxl8_vec3 t = setup->p0; setup->p0 = setup->p1; setup->p1 = t;
const raster_vertex* tv = sorted[0]; sorted[0] = sorted[1]; sorted[1] = tv;
}
if (setup->p0.y > setup->p2.y) {
pxl8_vec3 t = setup->p0; setup->p0 = setup->p2; setup->p2 = t;
const raster_vertex* tv = sorted[0]; sorted[0] = sorted[2]; sorted[2] = tv;
}
if (setup->p1.y > setup->p2.y) {
pxl8_vec3 t = setup->p1; setup->p1 = setup->p2; setup->p2 = t;
const raster_vertex* tv = sorted[1]; sorted[1] = sorted[2]; sorted[2] = tv;
}
/* Triangles shorter than one pixel are dropped entirely. */
f32 total_height = setup->p2.y - setup->p0.y;
if (total_height < 1.0f) return false;
/* Inclusive scanline range, clamped to the clip rectangle in y only; the
 * x clip bounds are stored below for the rasterizer to apply per span. */
i32 y0_int = (i32)floorf(setup->p0.y);
i32 y2_int = (i32)ceilf(setup->p2.y) - 1;
setup->y_start = y0_int < clip_min_y ? clip_min_y : y0_int;
setup->y_end = y2_int > clip_max_y ? clip_max_y : y2_int;
/* Attributes are stored pre-multiplied by 1/w so they can be interpolated
 * linearly in screen space and divided back per pixel. */
setup->w_recip.x = 1.0f / sorted[0]->clip_pos.w;
setup->w_recip.y = 1.0f / sorted[1]->clip_pos.w;
setup->w_recip.z = 1.0f / sorted[2]->clip_pos.w;
setup->u_w.x = sorted[0]->u * setup->w_recip.x;
setup->v_w.x = sorted[0]->v * setup->w_recip.x;
setup->u_w.y = sorted[1]->u * setup->w_recip.y;
setup->v_w.y = sorted[1]->v * setup->w_recip.y;
setup->u_w.z = sorted[2]->u * setup->w_recip.z;
setup->v_w.z = sorted[2]->v * setup->w_recip.z;
setup->l_w.x = sorted[0]->light * setup->w_recip.x;
setup->l_w.y = sorted[1]->light * setup->w_recip.y;
setup->l_w.z = sorted[2]->light * setup->w_recip.z;
setup->c_w.x = (f32)sorted[0]->color * setup->w_recip.x;
setup->c_w.y = (f32)sorted[1]->color * setup->w_recip.y;
setup->c_w.z = (f32)sorted[2]->color * setup->w_recip.z;
setup->world0_w = pxl8_vec3_scale(sorted[0]->world_pos, setup->w_recip.x);
setup->world1_w = pxl8_vec3_scale(sorted[1]->world_pos, setup->w_recip.y);
setup->world2_w = pxl8_vec3_scale(sorted[2]->world_pos, setup->w_recip.z);
/* The normal is not interpolated: the top-most vertex's normal is used. */
setup->normal = sorted[0]->normal;
setup->inv_total = 1.0f / total_height;
setup->target_width = viewport_w;
setup->target_height = viewport_h;
setup->clip_min_x = clip_min_x;
setup->clip_min_y = clip_min_y;
setup->clip_max_x = clip_max_x;
setup->clip_max_y = clip_max_y;
return true;
}
/*
 * Scan-convert one prepared triangle into an 8-bit framebuffer, with an
 * optional 16-bit depth buffer.
 *
 * setup     - triangle state produced by setup_tri().
 * fb        - destination colour buffer; rows are `fb_width` pixels apart.
 * zb        - depth buffer with the same row pitch as `fb`; it is read only
 *             when the pipeline enables depth testing and written only when
 *             depth writes are enabled.
 * shader    - fragment callback; invoked unconditionally, so must not be NULL.
 * pipeline  - depth/alpha/blend state; NULL disables depth test, depth write,
 *             alpha test and blending.
 * bindings  - passed through to the shader; also supplies the palette and
 *             colormap used for blending (NULL means neither is available).
 * uniforms  - passed through to the shader.
 * stats     - counters updated through the STATS_* macros.
 *
 * Each scanline is walked in spans of up to SUBDIV pixels. Perspective-correct
 * attribute values are computed at both ends of a span and interpolated
 * linearly inside it. A shader result of 0 is treated as transparent and is
 * never written.
 */
static void rasterize_triangle(
    const tri_setup* setup,
    u8* fb,
    u16* zb,
    u32 fb_width,
    pxl8_shader_fn shader,
    const pxl8_gfx_pipeline_desc* pipeline,
    const pxl8_shader_bindings* bindings,
    const pxl8_shader_uniforms* uniforms,
    pxl8_gfx_stats* stats
) {
    const i32 SUBDIV = 16;
    if (setup->y_start > setup->y_end) return;

    bool depth_test = pipeline && pipeline->depth.test;
    bool depth_write = pipeline && pipeline->depth.write;
    pxl8_gfx_compare_func depth_compare = pipeline ? pipeline->depth.compare : PXL8_GFX_COMPARE_ALWAYS;
    bool alpha_test = pipeline && pipeline->blend.alpha_test;
    u8 alpha_ref = pipeline ? pipeline->blend.alpha_ref : 0;
    bool blend_enabled = pipeline && pipeline->blend.enabled;
    const u32* palette = bindings ? bindings->palette : NULL;
    const u8* colormap = bindings ? bindings->colormap : NULL;

    for (i32 y = setup->y_start; y <= setup->y_end; y++) {
        /* Sample at the pixel centre. */
        f32 yf = (f32)y + 0.5f;

        /* "a" edge: the long edge p0 -> p2, spanning the full height. */
        f32 alpha = (yf - setup->p0.y) * setup->inv_total;
        f32 ax = setup->p0.x + (setup->p2.x - setup->p0.x) * alpha;
        f32 az = setup->p0.z + (setup->p2.z - setup->p0.z) * alpha;
        f32 a_wr = setup->w_recip.x + (setup->w_recip.z - setup->w_recip.x) * alpha;
        f32 a_uw = setup->u_w.x + (setup->u_w.z - setup->u_w.x) * alpha;
        f32 a_vw = setup->v_w.x + (setup->v_w.z - setup->v_w.x) * alpha;
        f32 a_lw = setup->l_w.x + (setup->l_w.z - setup->l_w.x) * alpha;
        f32 a_cw = setup->c_w.x + (setup->c_w.z - setup->c_w.x) * alpha;
        f32 a_wxw = setup->world0_w.x + (setup->world2_w.x - setup->world0_w.x) * alpha;
        f32 a_wyw = setup->world0_w.y + (setup->world2_w.y - setup->world0_w.y) * alpha;
        f32 a_wzw = setup->world0_w.z + (setup->world2_w.z - setup->world0_w.z) * alpha;

        /* "b" edge: p0 -> p1 for the upper half, p1 -> p2 for the lower half. */
        f32 bx, bz, b_wr, b_uw, b_vw, b_lw, b_cw, b_wxw, b_wyw, b_wzw;
        bool second_half = yf > setup->p1.y || setup->p1.y == setup->p0.y;
        f32 segment_height = second_half ? (setup->p2.y - setup->p1.y) : (setup->p1.y - setup->p0.y);
        /* Guard against division by a (near-)zero edge height. */
        if (segment_height < 0.001f) segment_height = 0.001f;
        f32 beta = (yf - (second_half ? setup->p1.y : setup->p0.y)) / segment_height;
        if (beta < 0.0f) beta = 0.0f;
        if (beta > 1.0f) beta = 1.0f;

        if (second_half) {
            bx = setup->p1.x + (setup->p2.x - setup->p1.x) * beta;
            bz = setup->p1.z + (setup->p2.z - setup->p1.z) * beta;
            b_wr = setup->w_recip.y + (setup->w_recip.z - setup->w_recip.y) * beta;
            b_uw = setup->u_w.y + (setup->u_w.z - setup->u_w.y) * beta;
            b_vw = setup->v_w.y + (setup->v_w.z - setup->v_w.y) * beta;
            b_lw = setup->l_w.y + (setup->l_w.z - setup->l_w.y) * beta;
            b_cw = setup->c_w.y + (setup->c_w.z - setup->c_w.y) * beta;
            b_wxw = setup->world1_w.x + (setup->world2_w.x - setup->world1_w.x) * beta;
            b_wyw = setup->world1_w.y + (setup->world2_w.y - setup->world1_w.y) * beta;
            b_wzw = setup->world1_w.z + (setup->world2_w.z - setup->world1_w.z) * beta;
        } else {
            bx = setup->p0.x + (setup->p1.x - setup->p0.x) * beta;
            bz = setup->p0.z + (setup->p1.z - setup->p0.z) * beta;
            b_wr = setup->w_recip.x + (setup->w_recip.y - setup->w_recip.x) * beta;
            b_uw = setup->u_w.x + (setup->u_w.y - setup->u_w.x) * beta;
            b_vw = setup->v_w.x + (setup->v_w.y - setup->v_w.x) * beta;
            b_lw = setup->l_w.x + (setup->l_w.y - setup->l_w.x) * beta;
            b_cw = setup->c_w.x + (setup->c_w.y - setup->c_w.x) * beta;
            b_wxw = setup->world0_w.x + (setup->world1_w.x - setup->world0_w.x) * beta;
            b_wyw = setup->world0_w.y + (setup->world1_w.y - setup->world0_w.y) * beta;
            b_wzw = setup->world0_w.z + (setup->world1_w.z - setup->world0_w.z) * beta;
        }

        /* Order the two edge samples left-to-right. */
        f32 x_start_fp, x_end_fp, z_start, z_end;
        f32 wr_start, wr_end, uw_start, uw_end, vw_start, vw_end, lw_start, lw_end, cw_start, cw_end;
        f32 wxw_start, wxw_end, wyw_start, wyw_end, wzw_start, wzw_end;
        if (ax <= bx) {
            x_start_fp = ax; x_end_fp = bx; z_start = az; z_end = bz;
            wr_start = a_wr; wr_end = b_wr;
            uw_start = a_uw; uw_end = b_uw;
            vw_start = a_vw; vw_end = b_vw;
            lw_start = a_lw; lw_end = b_lw;
            cw_start = a_cw; cw_end = b_cw;
            wxw_start = a_wxw; wxw_end = b_wxw;
            wyw_start = a_wyw; wyw_end = b_wyw;
            wzw_start = a_wzw; wzw_end = b_wzw;
        } else {
            x_start_fp = bx; x_end_fp = ax; z_start = bz; z_end = az;
            wr_start = b_wr; wr_end = a_wr;
            uw_start = b_uw; uw_end = a_uw;
            vw_start = b_vw; vw_end = a_vw;
            lw_start = b_lw; lw_end = a_lw;
            cw_start = b_cw; cw_end = a_cw;
            wxw_start = b_wxw; wxw_end = a_wxw;
            wyw_start = b_wyw; wyw_end = a_wyw;
            wzw_start = b_wzw; wzw_end = a_wzw;
        }

        i32 x_start = (i32)floorf(x_start_fp);
        i32 x_end = (i32)ceilf(x_end_fp) - 1;
        if (x_start < setup->clip_min_x) x_start = setup->clip_min_x;
        if (x_end > setup->clip_max_x) x_end = setup->clip_max_x;
        if (x_start > x_end) continue;

        /* Per-pixel gradients along the scanline; very narrow spans are
         * treated as one pixel wide to keep the gradients bounded. */
        f32 span_width = x_end_fp - x_start_fp;
        if (span_width < 1.0f) span_width = 1.0f;
        f32 inv_width = 1.0f / span_width;
        f32 dz = (z_end - z_start) * inv_width;
        f32 dwr = (wr_end - wr_start) * inv_width;
        f32 duw = (uw_end - uw_start) * inv_width;
        f32 dvw = (vw_end - vw_start) * inv_width;
        f32 dlw = (lw_end - lw_start) * inv_width;
        f32 dcw = (cw_end - cw_start) * inv_width;
        f32 dwxw = (wxw_end - wxw_start) * inv_width;
        f32 dwyw = (wyw_end - wyw_start) * inv_width;
        f32 dwzw = (wzw_end - wzw_start) * inv_width;

        /* Advance from the exact left edge to the first pixel centre. */
        f32 skip = (f32)x_start + 0.5f - x_start_fp;
        f32 z = z_start + dz * skip;
        f32 wr = wr_start + dwr * skip;
        f32 uw = uw_start + duw * skip;
        f32 vw = vw_start + dvw * skip;
        f32 lw = lw_start + dlw * skip;
        f32 cw = cw_start + dcw * skip;
        f32 wxw = wxw_start + dwxw * skip;
        f32 wyw = wyw_start + dwyw * skip;
        f32 wzw = wzw_start + dwzw * skip;

        u32 row_start = (u32)y * fb_width;
        u8* prow = fb + row_start;
        u16* zrow = zb + row_start;

        i32 x = x_start;
        while (x <= x_end) {
            i32 span_end = x + SUBDIV - 1;
            if (span_end > x_end) span_end = x_end;
            i32 span_len = span_end - x + 1;

            /* Perspective divide at the span start and at the start of the
             * next span (span_len pixels further along). */
            f32 pw_start = 1.0f / wr;
            f32 pw_end = 1.0f / (wr + dwr * (f32)span_len);
            f32 u_start = uw * pw_start;
            f32 v_start = vw * pw_start;
            f32 u_end = (uw + duw * (f32)span_len) * pw_end;
            f32 v_end = (vw + dvw * (f32)span_len) * pw_end;
            f32 l_start_fp = pxl8_clamp(lw * pw_start, 0.0f, 255.0f);
            f32 l_end_fp = pxl8_clamp((lw + dlw * (f32)span_len) * pw_end, 0.0f, 255.0f);
            f32 c_start_fp = pxl8_clamp(cw * pw_start, 0.0f, 255.0f);
            f32 c_end_fp = pxl8_clamp((cw + dcw * (f32)span_len) * pw_end, 0.0f, 255.0f);
            f32 wx_start = wxw * pw_start;
            f32 wy_start = wyw * pw_start;
            f32 wz_start = wzw * pw_start;
            f32 wx_end = (wxw + dwxw * (f32)span_len) * pw_end;
            f32 wy_end = (wyw + dwyw * (f32)span_len) * pw_end;
            f32 wz_end = (wzw + dwzw * (f32)span_len) * pw_end;

            /* The *_end values sit span_len pixel steps after the start, so
             * the per-pixel step divides by span_len. Dividing by
             * (span_len - 1) would make the last pixel of this span repeat
             * the first pixel of the next one. span_len is always >= 1. */
            f32 inv_span = 1.0f / (f32)span_len;
            f32 du = (u_end - u_start) * inv_span;
            f32 dv = (v_end - v_start) * inv_span;
            f32 dl = (l_end_fp - l_start_fp) * inv_span;
            f32 dc = (c_end_fp - c_start_fp) * inv_span;
            f32 dwx = (wx_end - wx_start) * inv_span;
            f32 dwy = (wy_end - wy_start) * inv_span;
            f32 dwz = (wz_end - wz_start) * inv_span;

            f32 u_a = u_start;
            f32 v_a = v_start;
            f32 l_a = l_start_fp;
            f32 c_a = c_start_fp;
            f32 z_a = z;
            f32 wx_a = wx_start;
            f32 wy_a = wy_start;
            f32 wz_a = wz_start;
            i32 px = x;

#if defined(PXL8_SIMD_SSE) || defined(PXL8_SIMD_NEON)
            /* Fast path: four pixels at a time for the depth-LESS,
             * non-blended case. Leftover pixels fall through to the scalar
             * loop below. */
            if (depth_test && depth_compare == PXL8_GFX_COMPARE_LESS && !blend_enabled) {
                pxl8_f32_simd dz4_simd = pxl8_f32_simd_set(dz * 4.0f);
                pxl8_f32_simd half = pxl8_f32_simd_set(0.5f);
                pxl8_f32_simd one = pxl8_f32_simd_set(1.0f);
                pxl8_f32_simd zero = pxl8_f32_simd_zero();
                pxl8_f32_simd scale65535 = pxl8_f32_simd_set(65535.0f);
                pxl8_f32_simd z4 = pxl8_f32_simd_set4(z_a, z_a + dz, z_a + dz * 2.0f, z_a + dz * 3.0f);
                pxl8_f32_simd offsets = pxl8_f32_simd_set4(0.0f, 1.0f, 2.0f, 3.0f);
                f32 du4 = du * 4.0f, dv4 = dv * 4.0f, dl4 = dl * 4.0f, dc4 = dc * 4.0f;
                f32 dz4 = dz * 4.0f, dwx4 = dwx * 4.0f, dwy4 = dwy * 4.0f, dwz4 = dwz * 4.0f;

                for (; px + 3 <= span_end; px += 4) {
                    /* Map z from [-1, 1] to [0, 65535]. */
                    pxl8_f32_simd depth_norm = pxl8_f32_simd_clamp(pxl8_f32_simd_mul(pxl8_f32_simd_add(z4, one), half), zero, one);
                    pxl8_i32_simd z16_4 = pxl8_f32_simd_to_i32(pxl8_f32_simd_mul(depth_norm, scale65535));
                    pxl8_i32_simd zbuf = pxl8_i32_simd_set4((i32)zrow[px], (i32)zrow[px+1], (i32)zrow[px+2], (i32)zrow[px+3]);
                    /* stored > incoming  <=>  incoming LESS stored */
                    i32 mask = pxl8_i32_simd_movemask(pxl8_i32_simd_cmpgt(zbuf, z16_4));
                    STATS_INC(stats, depth_tests, 4);

                    if (mask) {
                        pxl8_shader_ctx frag_ctx = {
                            .color_count = 4,
                            .x = pxl8_i32_simd_set4(px, px + 1, px + 2, px + 3),
                            .y = pxl8_i32_simd_set(y),
                            .v_uv = {
                                pxl8_f32_simd_add(pxl8_f32_simd_set(u_a), pxl8_f32_simd_mul(pxl8_f32_simd_set(du), offsets)),
                                pxl8_f32_simd_add(pxl8_f32_simd_set(v_a), pxl8_f32_simd_mul(pxl8_f32_simd_set(dv), offsets))
                            },
                            .v_world = {
                                pxl8_f32_simd_add(pxl8_f32_simd_set(wx_a), pxl8_f32_simd_mul(pxl8_f32_simd_set(dwx), offsets)),
                                pxl8_f32_simd_add(pxl8_f32_simd_set(wy_a), pxl8_f32_simd_mul(pxl8_f32_simd_set(dwy), offsets)),
                                pxl8_f32_simd_add(pxl8_f32_simd_set(wz_a), pxl8_f32_simd_mul(pxl8_f32_simd_set(dwz), offsets))
                            },
                            .v_normal = pxl8_vec3_simd_set(setup->normal),
                            .v_light = pxl8_f32_simd_mul(
                                pxl8_f32_simd_add(pxl8_f32_simd_set(l_a), pxl8_f32_simd_mul(pxl8_f32_simd_set(dl), offsets)),
                                pxl8_f32_simd_set(1.0f / 255.0f)
                            ),
                            .v_color = pxl8_f32_simd_add(pxl8_f32_simd_set(c_a), pxl8_f32_simd_mul(pxl8_f32_simd_set(dc), offsets)),
                            .v_depth = z4,
                        };
                        u8 colors[4];
                        shader(&frag_ctx, bindings, uniforms, colors);
                        STATS_INC(stats, shader_calls, 1);

                        i32 z16_arr[4];
                        pxl8_i32_simd_store(z16_arr, z16_4);
                        for (i32 i = 0; i < 4; i++) {
                            /* NOTE(review): assumes the movemask yields four
                             * bits per lane with the lane's top bit at
                             * position 4*i + 3 - confirm against the SIMD
                             * wrapper. */
                            if (!(mask & (0x8 << (i * 4)))) continue;
                            STATS_INC(stats, depth_passes, 1);
                            u8 color = colors[i];
                            if (!(alpha_test && color <= alpha_ref) && color != 0) {
                                prow[px + i] = color;
                                if (depth_write) zrow[px + i] = (u16)z16_arr[i];
                                STATS_INC(stats, pixels_written, 1);
                            }
                        }
                    }

                    u_a += du4; v_a += dv4; l_a += dl4; c_a += dc4;
                    z_a += dz4; wx_a += dwx4; wy_a += dwy4; wz_a += dwz4;
                    z4 = pxl8_f32_simd_add(z4, dz4_simd);
                }
            }
#endif

            /* Scalar path: one pixel at a time, supports every depth
             * function and blending. */
            for (; px <= span_end; px++) {
                f32 depth_norm = pxl8_clamp((z_a + 1.0f) * 0.5f, 0.0f, 1.0f);
                u16 z16 = (u16)(depth_norm * 65535.0f);
                STATS_INC(stats, depth_tests, 1);
                bool depth_pass = !depth_test || depth_test_pass(depth_compare, z16, zrow[px]);

                if (depth_pass) {
                    STATS_INC(stats, depth_passes, 1);
                    pxl8_shader_ctx frag_ctx = {
                        .color_count = 1,
                        .x = pxl8_i32_simd_set(px),
                        .y = pxl8_i32_simd_set(y),
                        .v_uv = { pxl8_f32_simd_set(u_a), pxl8_f32_simd_set(v_a) },
                        .v_world = { pxl8_f32_simd_set(wx_a), pxl8_f32_simd_set(wy_a), pxl8_f32_simd_set(wz_a) },
                        .v_normal = pxl8_vec3_simd_set(setup->normal),
                        .v_light = pxl8_f32_simd_set(l_a / 255.0f),
                        .v_color = pxl8_f32_simd_set(c_a),
                        .v_depth = pxl8_f32_simd_set(z_a),
                    };
                    u8 color;
                    shader(&frag_ctx, bindings, uniforms, &color);
                    STATS_INC(stats, shader_calls, 1);

                    if (!(alpha_test && color <= alpha_ref)) {
                        if (color != 0) {
                            u8 out_color = color;
                            if (blend_enabled) {
                                out_color = blend_indexed(pipeline, color, prow[px], palette, colormap);
                            }
                            prow[px] = out_color;
                            if (depth_write) {
                                zrow[px] = z16;
                            }
                            STATS_INC(stats, pixels_written, 1);
                        }
                    }
                }

                u_a += du;
                v_a += dv;
                l_a += dl;
                c_a += dc;
                z_a += dz;
                wx_a += dwx;
                wy_a += dwy;
                wz_a += dwz;
            }

            /* Step the 1/w-scaled accumulators to the start of the next span. */
            wr += dwr * (f32)span_len;
            uw += duw * (f32)span_len;
            vw += dvw * (f32)span_len;
            lw += dlw * (f32)span_len;
            cw += dcw * (f32)span_len;
            z += dz * (f32)span_len;
            wxw += dwxw * (f32)span_len;
            wyw += dwyw * (f32)span_len;
            wzw += dwzw * (f32)span_len;
            x = span_end + 1;
        }
    }
}
/*
 * Draw a one-pixel-wide line from (x0, y0) to (x1, y1), both endpoints
 * included, using an integer error-accumulating (Bresenham-style) walk.
 *
 * A pixel is written only when it lies inside the inclusive clip rectangle
 * and inside the fb_w x fb_h framebuffer. Pixels outside are skipped but the
 * walk still visits them. Each written pixel bumps the pixels_written stat.
 */
static void draw_line_clipped(
    u8* fb,
    u32 fb_w,
    u32 fb_h,
    i32 x0,
    i32 y0,
    i32 x1,
    i32 y1,
    u8 color,
    i32 clip_min_x,
    i32 clip_min_y,
    i32 clip_max_x,
    i32 clip_max_y,
    pxl8_gfx_stats* stats
) {
    const i32 span_x = abs(x1 - x0);
    const i32 neg_span_y = -abs(y1 - y0);
    const i32 step_x = (x0 < x1) ? 1 : -1;
    const i32 step_y = (y0 < y1) ? 1 : -1;
    i32 error = span_x + neg_span_y;

    for (;;) {
        const bool in_clip = x0 >= clip_min_x && x0 <= clip_max_x &&
                             y0 >= clip_min_y && y0 <= clip_max_y;
        const bool in_fb = x0 >= 0 && y0 >= 0 && x0 < (i32)fb_w && y0 < (i32)fb_h;

        if (in_clip && in_fb) {
            fb[y0 * (i32)fb_w + x0] = color;
            STATS_INC(stats, pixels_written, 1);
        }

        if (x0 == x1 && y0 == y1) {
            break;
        }

        const i32 doubled = 2 * error;
        if (doubled >= neg_span_y) {
            error += neg_span_y;
            x0 += step_x;
        }
        if (doubled <= span_x) {
            error += span_x;
            y0 += step_y;
        }
    }
}
/*
 * Resource handle layout: a 32-bit id with the slot index in the low 16 bits
 * and a generation counter in the high 16 bits.
 */
/* Extract the slot index (low 16 bits) from an id. */
#define SLOT_INDEX(id) ((id) & 0xFFFF)
/* Extract the generation (bits 16 and up) from an id. */
#define SLOT_GEN(id) ((id) >> 16)
/* Build an id from a slot index and a generation. */
#define MAKE_ID(index, gen) (((u32)(gen) << 16) | (u32)(index))
/* Next generation value; wraps from 0xFFFF to 1, so it never produces 0. */
#define NEXT_GEN(gen) ((u16)((gen) == 0xFFFF ? 1 : (gen) + 1))
/* Pool slot holding one bindings object. */
typedef struct {
/* Copy of the descriptor supplied at creation. */
pxl8_gfx_bindings_desc desc;
/* Bumped each time the slot is (re)allocated; stale handles fail validation. */
u16 generation;
/* True while the slot is in use. */
bool active;
} bindings_slot;
/* Pool slot holding one buffer and its backing storage. */
typedef struct {
/* Heap storage owned by the slot; NULL after the buffer is destroyed. */
void* data;
/* Capacity of `data` in bytes. */
u32 size;
/* Byte offset where the next pxl8_append_buffer() call writes. */
u32 append_pos;
/* Buffer type and usage copied from the creation descriptor. */
pxl8_gfx_buffer_type type;
pxl8_gfx_usage usage;
/* Bumped each time the slot is (re)allocated; stale handles fail validation. */
u16 generation;
/* True while the slot is in use. */
bool active;
} buffer_slot;
/* Pool slot holding one render pass. */
typedef struct {
/* Copy of the descriptor supplied at creation. */
pxl8_gfx_pass_desc desc;
/* Bumped each time the slot is (re)allocated; stale handles fail validation. */
u16 generation;
/* True while the slot is in use. */
bool active;
} pass_slot;
/* Number of entries in the renderer's pipeline cache array. */
#define PXL8_PIPELINE_CACHE_SIZE 64
/*
 * One pipeline cache entry.
 * NOTE(review): the code that fills and queries this cache is not in this
 * part of the file; the field notes below are inferred from names - confirm.
 */
typedef struct {
/* Presumably a hash of the pipeline descriptor. */
u32 hash;
/* Presumably the index of the matching slot in the pipeline pool. */
u32 slot_idx;
/* Presumably the renderer frame_counter value at the last lookup. */
u32 last_used_frame;
/* Whether the entry holds meaningful data. */
bool valid;
} pipeline_cache_entry;
/* Pool slot holding one pipeline. */
typedef struct {
/* Copy of the descriptor supplied at creation. */
pxl8_gfx_pipeline_desc desc;
/* Bumped each time the slot is (re)allocated; stale handles fail validation. */
u16 generation;
/* True while the slot is in use. */
bool active;
/* NOTE(review): not written by pxl8_create_pipeline(); presumably set by the
 * pipeline cache code elsewhere in the file - confirm. */
bool cached;
} pipeline_slot;
/* Pool slot holding one texture and its pixel storage. */
typedef struct {
/* Heap storage owned by the slot; NULL after the texture is destroyed. */
void* data;
/* Dimensions in texels. */
u32 width;
u32 height;
/* Texel format; determines bytes per texel (see texture_byte_size). */
pxl8_gfx_texture_format format;
/* Usage copied from the creation descriptor. */
pxl8_gfx_usage usage;
/* Bumped each time the slot is (re)allocated; stale handles fail validation. */
u16 generation;
/* True while the slot is in use. */
bool active;
} texture_slot;
/* Software renderer state: fixed-size resource pools plus current draw state. */
struct pxl8_renderer {
/* Dimensions passed to pxl8_renderer_create(). */
u32 width;
u32 height;
/* Fixed-size resource pools, addressed by the index half of a handle id. */
texture_slot textures[PXL8_GFX_MAX_TEXTURES];
buffer_slot buffers[PXL8_GFX_MAX_BUFFERS];
pipeline_slot pipelines[PXL8_GFX_MAX_PIPELINES];
bindings_slot bindings[PXL8_GFX_MAX_BINDINGS];
pass_slot passes[PXL8_GFX_MAX_PASSES];
/* NOTE(review): the cache and frame counter are maintained by code outside
 * this part of the file. */
pipeline_cache_entry pipeline_cache[PXL8_PIPELINE_CACHE_SIZE];
u32 frame_counter;
/* Currently selected pass / pipeline / bindings / draw parameters; draws
 * are skipped when the current pass or pipeline handle is not valid. */
pxl8_gfx_pass current_pass;
pxl8_gfx_pipeline current_pipeline;
pxl8_gfx_bindings current_bindings;
pxl8_gfx_cmd_draw_params current_draw_params;
/* Viewport rectangle; size is initialised to the full renderer size. */
i32 viewport_x, viewport_y;
u32 viewport_w, viewport_h;
/* Scissor rectangle; size is initialised to the full renderer size. */
i32 scissor_x, scissor_y;
u32 scissor_w, scissor_h;
/* Fragment shader callback installed with pxl8_renderer_set_shader(). */
pxl8_shader_fn shader;
/* Counters and timings accumulated through the STATS_* macros. */
pxl8_gfx_stats stats;
};
/* Growable array of recorded graphics commands. */
struct pxl8_gfx_cmdbuf {
/* Heap array of `capacity` commands, owned by the command buffer. */
pxl8_gfx_cmd* commands;
/* Number of commands `commands` can hold before it must grow. */
u32 capacity;
/* Number of commands recorded so far. */
u32 count;
};
/*
 * Allocate a zero-initialised renderer of the given size.
 *
 * The viewport and scissor rectangles start at the origin and cover the
 * whole width x height area. Returns NULL if the allocation fails. The
 * caller releases the renderer with pxl8_renderer_destroy().
 */
pxl8_renderer* pxl8_renderer_create(u32 width, u32 height) {
    pxl8_renderer* r = pxl8_calloc(1, sizeof(*r));
    if (!r) {
        /* Previously the NULL result was dereferenced on the next line. */
        return NULL;
    }

    r->width = width;
    r->height = height;
    r->viewport_w = width;
    r->viewport_h = height;
    r->scissor_w = width;
    r->scissor_h = height;
    pxl8_renderer_reset_stats(r);
    return r;
}
/*
 * Free a renderer together with every texture and buffer allocation still
 * held in its pools. Passing NULL is a no-op.
 */
void pxl8_renderer_destroy(pxl8_renderer* r) {
    if (!r) {
        return;
    }

    u32 idx;
    for (idx = 0; idx < PXL8_GFX_MAX_TEXTURES; idx++) {
        void* pixels = r->textures[idx].data;
        if (pixels) {
            pxl8_free(pixels);
        }
    }
    for (idx = 0; idx < PXL8_GFX_MAX_BUFFERS; idx++) {
        void* bytes = r->buffers[idx].data;
        if (bytes) {
            pxl8_free(bytes);
        }
    }

    pxl8_free(r);
}
/* Width given at creation, or 0 when `r` is NULL. */
u32 pxl8_renderer_get_width(const pxl8_renderer* r) {
    if (!r) {
        return 0;
    }
    return r->width;
}
/* Height given at creation, or 0 when `r` is NULL. */
u32 pxl8_renderer_get_height(const pxl8_renderer* r) {
    if (!r) {
        return 0;
    }
    return r->height;
}
/* Install the fragment shader callback. Does nothing when `r` is NULL. */
void pxl8_renderer_set_shader(pxl8_renderer* r, pxl8_shader_fn fn) {
    if (!r) {
        return;
    }
    r->shader = fn;
}
/* Zero every statistics field. Does nothing when `r` is NULL. */
void pxl8_renderer_reset_stats(pxl8_renderer* r) {
    if (r) {
        memset(&r->stats, 0, sizeof(r->stats));
    }
}
/*
 * Read-only view of the renderer's statistics, or NULL when `r` is NULL.
 * The pointer refers to storage inside the renderer.
 */
const pxl8_gfx_stats* pxl8_renderer_get_stats(const pxl8_renderer* r) {
    if (!r) {
        return NULL;
    }
    return &r->stats;
}
/*
 * Storage size in bytes of a w x h texture: 1 byte per texel for INDEXED8,
 * 2 for DEPTH16, 4 for LIGHT_ACCUM. Any other format yields 0.
 */
static u32 texture_byte_size(pxl8_gfx_texture_format fmt, u32 w, u32 h) {
    u32 bytes_per_texel = 0;
    switch (fmt) {
        case PXL8_GFX_FORMAT_INDEXED8:    bytes_per_texel = 1; break;
        case PXL8_GFX_FORMAT_DEPTH16:     bytes_per_texel = 2; break;
        case PXL8_GFX_FORMAT_LIGHT_ACCUM: bytes_per_texel = 4; break;
    }
    return w * h * bytes_per_texel;
}
/*
 * Create a texture in the first free pool slot.
 *
 * Storage is sized from the descriptor's format and dimensions. If the
 * descriptor supplies initial data of at least that size it is copied in;
 * otherwise the texture starts zero-filled.
 *
 * Returns a handle to the new texture, or a handle holding
 * PXL8_GFX_INVALID_ID when the pool is full or the allocation fails (the
 * slot is left free in that case).
 */
pxl8_gfx_texture pxl8_create_texture(pxl8_renderer* r, const pxl8_gfx_texture_desc* desc) {
    for (u32 i = 0; i < PXL8_GFX_MAX_TEXTURES; i++) {
        texture_slot* s = &r->textures[i];
        if (s->active) {
            continue;
        }

        u32 size = texture_byte_size(desc->format, desc->width, desc->height);
        void* pixels = pxl8_malloc(size);
        if (!pixels && size > 0) {
            /* Previously the NULL result went straight into memcpy/memset. */
            pxl8_error("Out of memory creating texture");
            return (pxl8_gfx_texture){ PXL8_GFX_INVALID_ID };
        }

        if (pixels) {
            if (desc->data.ptr && desc->data.size >= size) {
                memcpy(pixels, desc->data.ptr, size);
            } else {
                memset(pixels, 0, size);
            }
        }

        s->data = pixels;
        s->width = desc->width;
        s->height = desc->height;
        s->format = desc->format;
        s->usage = desc->usage;
        s->generation = NEXT_GEN(s->generation);
        s->active = true;
        return (pxl8_gfx_texture){ MAKE_ID(i, s->generation) };
    }

    pxl8_error("Out of texture slots");
    return (pxl8_gfx_texture){ PXL8_GFX_INVALID_ID };
}
/*
 * Create a buffer in the first free pool slot.
 *
 * The buffer's capacity is desc->capacity when non-zero, otherwise the size
 * of the initial data. Initial data is copied to the start of the buffer,
 * truncated to the capacity if it is larger, and any remaining bytes are
 * zeroed. The append position starts at 0 regardless of initial data.
 *
 * Returns a handle to the new buffer, or a handle holding
 * PXL8_GFX_INVALID_ID when the pool is full or the allocation fails (the
 * slot is left free in that case).
 */
pxl8_gfx_buffer pxl8_create_buffer(pxl8_renderer* r, const pxl8_gfx_buffer_desc* desc) {
    for (u32 i = 0; i < PXL8_GFX_MAX_BUFFERS; i++) {
        buffer_slot* s = &r->buffers[i];
        if (s->active) {
            continue;
        }

        u32 capacity = desc->capacity > 0 ? desc->capacity : desc->data.size;
        void* bytes = pxl8_malloc(capacity);
        if (!bytes && capacity > 0) {
            pxl8_error("Out of memory creating buffer");
            return (pxl8_gfx_buffer){ PXL8_GFX_INVALID_ID };
        }

        /* Never copy more than the buffer holds: an explicit capacity smaller
         * than data.size used to overflow the allocation. */
        u32 init_size = 0;
        if (desc->data.ptr && desc->data.size > 0) {
            init_size = desc->data.size < capacity ? (u32)desc->data.size : capacity;
        }

        if (bytes) {
            if (init_size > 0) {
                memcpy(bytes, desc->data.ptr, init_size);
            }
            if (capacity > init_size) {
                memset((u8*)bytes + init_size, 0, capacity - init_size);
            }
        }

        s->data = bytes;
        s->size = capacity;
        s->append_pos = 0;
        s->type = desc->type;
        s->usage = desc->usage;
        s->generation = NEXT_GEN(s->generation);
        s->active = true;
        return (pxl8_gfx_buffer){ MAKE_ID(i, s->generation) };
    }

    pxl8_error("Out of buffer slots");
    return (pxl8_gfx_buffer){ PXL8_GFX_INVALID_ID };
}
/*
 * Store a copy of `desc` in the first free pipeline slot and return its
 * handle. Logs an error and returns a handle holding PXL8_GFX_INVALID_ID
 * when every slot is in use.
 */
pxl8_gfx_pipeline pxl8_create_pipeline(pxl8_renderer* r, const pxl8_gfx_pipeline_desc* desc) {
    u32 idx = 0;
    while (idx < PXL8_GFX_MAX_PIPELINES) {
        pipeline_slot* slot = &r->pipelines[idx];
        if (slot->active) {
            idx++;
            continue;
        }
        slot->desc = *desc;
        slot->generation = NEXT_GEN(slot->generation);
        slot->active = true;
        return (pxl8_gfx_pipeline){ MAKE_ID(idx, slot->generation) };
    }

    pxl8_error("Out of pipeline slots");
    return (pxl8_gfx_pipeline){ PXL8_GFX_INVALID_ID };
}
/*
 * Store a copy of `desc` in the first free bindings slot and return its
 * handle. Logs an error and returns a handle holding PXL8_GFX_INVALID_ID
 * when every slot is in use.
 */
pxl8_gfx_bindings pxl8_create_bindings(pxl8_renderer* r, const pxl8_gfx_bindings_desc* desc) {
    u32 idx = 0;
    while (idx < PXL8_GFX_MAX_BINDINGS) {
        bindings_slot* slot = &r->bindings[idx];
        if (slot->active) {
            idx++;
            continue;
        }
        slot->desc = *desc;
        slot->generation = NEXT_GEN(slot->generation);
        slot->active = true;
        return (pxl8_gfx_bindings){ MAKE_ID(idx, slot->generation) };
    }

    pxl8_error("Out of bindings slots");
    return (pxl8_gfx_bindings){ PXL8_GFX_INVALID_ID };
}
/*
 * Store a copy of `desc` in the first free pass slot and return its handle.
 * Logs an error and returns a handle holding PXL8_GFX_INVALID_ID when every
 * slot is in use.
 */
pxl8_gfx_pass pxl8_create_pass(pxl8_renderer* r, const pxl8_gfx_pass_desc* desc) {
    u32 idx = 0;
    while (idx < PXL8_GFX_MAX_PASSES) {
        pass_slot* slot = &r->passes[idx];
        if (slot->active) {
            idx++;
            continue;
        }
        slot->desc = *desc;
        slot->generation = NEXT_GEN(slot->generation);
        slot->active = true;
        return (pxl8_gfx_pass){ MAKE_ID(idx, slot->generation) };
    }

    pxl8_error("Out of pass slots");
    return (pxl8_gfx_pass){ PXL8_GFX_INVALID_ID };
}
/*
 * Handle validation. A handle `h` is valid for renderer `r` when its slot
 * index is within the pool, the slot is active, and the slot's generation
 * matches the generation encoded in the handle. Both arguments are evaluated
 * more than once, so pass only side-effect-free expressions. `r` is
 * parenthesised so any pointer expression can be passed safely.
 */
#define VALID_TEX(r, h) (SLOT_INDEX((h).id) < PXL8_GFX_MAX_TEXTURES && \
    (r)->textures[SLOT_INDEX((h).id)].active && \
    (r)->textures[SLOT_INDEX((h).id)].generation == SLOT_GEN((h).id))
#define VALID_BUF(r, h) (SLOT_INDEX((h).id) < PXL8_GFX_MAX_BUFFERS && \
    (r)->buffers[SLOT_INDEX((h).id)].active && \
    (r)->buffers[SLOT_INDEX((h).id)].generation == SLOT_GEN((h).id))
#define VALID_PASS(r, h) (SLOT_INDEX((h).id) < PXL8_GFX_MAX_PASSES && \
    (r)->passes[SLOT_INDEX((h).id)].active && \
    (r)->passes[SLOT_INDEX((h).id)].generation == SLOT_GEN((h).id))
#define VALID_PIPELINE(r, h) (SLOT_INDEX((h).id) < PXL8_GFX_MAX_PIPELINES && \
    (r)->pipelines[SLOT_INDEX((h).id)].active && \
    (r)->pipelines[SLOT_INDEX((h).id)].generation == SLOT_GEN((h).id))
#define VALID_BINDINGS(r, h) (SLOT_INDEX((h).id) < PXL8_GFX_MAX_BINDINGS && \
    (r)->bindings[SLOT_INDEX((h).id)].active && \
    (r)->bindings[SLOT_INDEX((h).id)].generation == SLOT_GEN((h).id))
/*
 * Release a texture's storage and mark its slot free. Invalid or stale
 * handles are ignored.
 */
void pxl8_destroy_texture(pxl8_renderer* r, pxl8_gfx_texture tex) {
    if (VALID_TEX(r, tex)) {
        texture_slot* slot = &r->textures[SLOT_INDEX(tex.id)];
        pxl8_free(slot->data);
        slot->data = NULL;
        slot->active = false;
    }
}
/*
 * Release a buffer's storage and mark its slot free. Invalid or stale
 * handles are ignored.
 */
void pxl8_destroy_buffer(pxl8_renderer* r, pxl8_gfx_buffer buf) {
    if (VALID_BUF(r, buf)) {
        buffer_slot* slot = &r->buffers[SLOT_INDEX(buf.id)];
        pxl8_free(slot->data);
        slot->data = NULL;
        slot->active = false;
    }
}
/*
 * Mark a pipeline slot free. Handles whose index is out of range or whose
 * generation does not match the slot are ignored.
 */
void pxl8_destroy_pipeline(pxl8_renderer* r, pxl8_gfx_pipeline pip) {
    const u32 slot_idx = SLOT_INDEX(pip.id);
    if (slot_idx >= PXL8_GFX_MAX_PIPELINES) {
        return;
    }

    pipeline_slot* slot = &r->pipelines[slot_idx];
    if (slot->generation != SLOT_GEN(pip.id)) {
        return;
    }
    slot->active = false;
}
/*
 * Mark a bindings slot free. Handles whose index is out of range or whose
 * generation does not match the slot are ignored.
 */
void pxl8_destroy_bindings(pxl8_renderer* r, pxl8_gfx_bindings bnd) {
    const u32 slot_idx = SLOT_INDEX(bnd.id);
    if (slot_idx >= PXL8_GFX_MAX_BINDINGS) {
        return;
    }

    bindings_slot* slot = &r->bindings[slot_idx];
    if (slot->generation != SLOT_GEN(bnd.id)) {
        return;
    }
    slot->active = false;
}
/*
 * Mark a pass slot free. Handles whose index is out of range or whose
 * generation does not match the slot are ignored.
 */
void pxl8_destroy_pass(pxl8_renderer* r, pxl8_gfx_pass pass) {
    const u32 slot_idx = SLOT_INDEX(pass.id);
    if (slot_idx >= PXL8_GFX_MAX_PASSES) {
        return;
    }

    pass_slot* slot = &r->passes[slot_idx];
    if (slot->generation != SLOT_GEN(pass.id)) {
        return;
    }
    slot->active = false;
}
/*
 * Overwrite the start of a buffer with `data`, copying at most the buffer's
 * capacity. Invalid or stale handles are ignored.
 */
void pxl8_update_buffer(pxl8_renderer* r, pxl8_gfx_buffer buf, const pxl8_gfx_range* data) {
    if (!VALID_BUF(r, buf)) {
        return;
    }

    buffer_slot* slot = &r->buffers[SLOT_INDEX(buf.id)];
    u32 byte_count = slot->size;
    if (data->size < slot->size) {
        byte_count = data->size;
    }
    memcpy(slot->data, data->ptr, byte_count);
}
/*
 * Append `data` at the buffer's current append position.
 *
 * Returns the byte offset at which the data was written, or -1 when the
 * handle is invalid or the data does not fit in the remaining capacity
 * (nothing is written in either case).
 */
i32 pxl8_append_buffer(pxl8_renderer* r, pxl8_gfx_buffer buf, const pxl8_gfx_range* data) {
    if (!VALID_BUF(r, buf)) return -1;
    buffer_slot* s = &r->buffers[SLOT_INDEX(buf.id)];

    /* Compare against the remaining space rather than summing first:
     * append_pos + data->size can wrap around in 32 bits and slip past the
     * capacity check. append_pos never exceeds size, so the subtraction
     * cannot underflow. */
    if (data->size > s->size - s->append_pos) return -1;

    i32 offset = (i32)s->append_pos;
    memcpy((u8*)s->data + s->append_pos, data->ptr, data->size);
    s->append_pos += data->size;
    return offset;
}
/*
 * Copy a tightly packed w x h block of texels from `data` into the texture
 * at (x, y).
 *
 * The call is ignored when the handle is invalid, `data` or its pointer is
 * NULL, the rectangle is empty, or the rectangle does not lie entirely
 * inside the texture. Source rows are read back to back (w texels each).
 * The caller must supply at least w * h texels; data->size is not consulted.
 */
void pxl8_update_texture(pxl8_renderer* r, pxl8_gfx_texture tex, const pxl8_gfx_range* data, u32 x, u32 y, u32 w, u32 h) {
    if (!VALID_TEX(r, tex)) return;
    if (!data || !data->ptr) return;
    if (w == 0 || h == 0) return;

    texture_slot* s = &r->textures[SLOT_INDEX(tex.id)];

    /* Reject rectangles that leave the texture; written this way so the
     * checks themselves cannot overflow. Without them the row copies below
     * would write past the end of the allocation. */
    if (x >= s->width || w > s->width - x) return;
    if (y >= s->height || h > s->height - y) return;

    size_t bpp = (s->format == PXL8_GFX_FORMAT_INDEXED8) ? 1 :
                 (s->format == PXL8_GFX_FORMAT_DEPTH16) ? 2 : 4;
    size_t src_pitch = (size_t)w * bpp;
    size_t dst_pitch = (size_t)s->width * bpp;

    const u8* src = data->ptr;
    u8* dst = (u8*)s->data + ((size_t)y * s->width + x) * bpp;
    for (u32 row = 0; row < h; row++) {
        memcpy(dst, src, src_pitch);
        src += src_pitch;
        dst += dst_pitch;
    }
}
/* Pointer to a buffer's storage, or NULL for an invalid or stale handle. */
void* pxl8_buffer_ptr(pxl8_renderer* r, pxl8_gfx_buffer buf) {
    if (VALID_BUF(r, buf)) {
        return r->buffers[SLOT_INDEX(buf.id)].data;
    }
    return NULL;
}
/* Capacity of a buffer in bytes, or 0 for an invalid or stale handle. */
u32 pxl8_buffer_size(pxl8_renderer* r, pxl8_gfx_buffer buf) {
    if (VALID_BUF(r, buf)) {
        return r->buffers[SLOT_INDEX(buf.id)].size;
    }
    return 0;
}
/* Pointer to a texture's pixel storage, or NULL for an invalid or stale handle. */
void* pxl8_texture_get_data(pxl8_renderer* r, pxl8_gfx_texture tex) {
    if (VALID_TEX(r, tex)) {
        return r->textures[SLOT_INDEX(tex.id)].data;
    }
    return NULL;
}
/* Width of a texture in texels, or 0 for an invalid or stale handle. */
u32 pxl8_texture_get_width(pxl8_renderer* r, pxl8_gfx_texture tex) {
    if (VALID_TEX(r, tex)) {
        return r->textures[SLOT_INDEX(tex.id)].width;
    }
    return 0;
}
/* Height of a texture in texels, or 0 for an invalid or stale handle. */
u32 pxl8_texture_get_height(pxl8_renderer* r, pxl8_gfx_texture tex) {
    if (VALID_TEX(r, tex)) {
        return r->textures[SLOT_INDEX(tex.id)].height;
    }
    return 0;
}
/*
 * Format of a texture. An invalid or stale handle yields
 * PXL8_GFX_FORMAT_INDEXED8, so that value alone does not prove validity.
 */
pxl8_gfx_texture_format pxl8_texture_get_format(pxl8_renderer* r, pxl8_gfx_texture tex) {
    if (VALID_TEX(r, tex)) {
        return r->textures[SLOT_INDEX(tex.id)].format;
    }
    return PXL8_GFX_FORMAT_INDEXED8;
}
/*
 * Allocate an empty command buffer with room for `capacity` commands.
 *
 * A requested capacity of 0 is raised to 1 so the buffer always owns real
 * storage and capacity doubling can make progress. Returns NULL if either
 * allocation fails. Release with pxl8_cmdbuf_destroy().
 */
pxl8_gfx_cmdbuf* pxl8_cmdbuf_create(u32 capacity) {
    if (capacity == 0) {
        capacity = 1;
    }

    pxl8_gfx_cmdbuf* cb = pxl8_malloc(sizeof(*cb));
    if (!cb) {
        return NULL;
    }

    cb->commands = pxl8_malloc((size_t)capacity * sizeof(*cb->commands));
    if (!cb->commands) {
        pxl8_free(cb);
        return NULL;
    }

    cb->capacity = capacity;
    cb->count = 0;
    return cb;
}
/* Free a command buffer and its command array. Passing NULL is a no-op. */
void pxl8_cmdbuf_destroy(pxl8_gfx_cmdbuf* cb) {
    if (cb) {
        pxl8_free(cb->commands);
        pxl8_free(cb);
    }
}
/*
 * Discard all recorded commands while keeping the allocated storage.
 * Passing NULL is a no-op.
 */
void pxl8_cmdbuf_reset(pxl8_gfx_cmdbuf* cb) {
    if (!cb) {
        return;
    }
    cb->count = 0;
}
/*
 * Reserve the next command slot, growing the array when it is full.
 *
 * Growth doubles the capacity; a buffer with zero capacity starts at a small
 * default instead, because doubling 0 would never make room. Returns a
 * pointer to the new, uninitialised slot, or NULL if the array cannot grow,
 * in which case the existing commands are left intact.
 *
 * NOTE(review): the recording functions in this file dereference the result
 * without checking it, so an allocation failure still faults there, though
 * now on a NULL pointer rather than at an offset from a lost allocation.
 */
static pxl8_gfx_cmd* cmd_alloc(pxl8_gfx_cmdbuf* cb) {
    if (cb->count >= cb->capacity) {
        u32 grown_capacity = cb->capacity > 0 ? cb->capacity * 2 : 16;
        if (grown_capacity <= cb->capacity) {
            /* u32 doubling wrapped around. */
            return NULL;
        }

        /* Keep the old pointer until the reallocation is known to succeed. */
        pxl8_gfx_cmd* grown = pxl8_realloc(cb->commands, (size_t)grown_capacity * sizeof(*grown));
        if (!grown) {
            return NULL;
        }
        cb->commands = grown;
        cb->capacity = grown_capacity;
    }
    return &cb->commands[cb->count++];
}
/* Record a BEGIN_PASS command targeting `pass`. */
void pxl8_begin_pass(pxl8_gfx_cmdbuf* cb, pxl8_gfx_pass pass) {
    pxl8_gfx_cmd* const entry = cmd_alloc(cb);

    entry->type = PXL8_GFX_CMD_BEGIN_PASS;
    entry->begin_pass.pass = pass;
}
/* Record an END_PASS command; it carries no payload. */
void pxl8_end_pass(pxl8_gfx_cmdbuf* cb) {
    pxl8_gfx_cmd* const entry = cmd_alloc(cb);

    entry->type = PXL8_GFX_CMD_END_PASS;
}
/* Record a SET_PIPELINE command selecting `pipeline`. */
void pxl8_set_pipeline(pxl8_gfx_cmdbuf* cb, pxl8_gfx_pipeline pipeline) {
    pxl8_gfx_cmd* const entry = cmd_alloc(cb);

    entry->type = PXL8_GFX_CMD_SET_PIPELINE;
    entry->set_pipeline.pipeline = pipeline;
}
/*
 * Records a SET_BINDINGS command selecting `bindings` for subsequent draws.
 *
 * Nothing is recorded when `cb` is NULL or no command slot could be reserved.
 */
void pxl8_set_bindings(pxl8_gfx_cmdbuf* cb, pxl8_gfx_bindings bindings) {
    if (!cb) return;
    pxl8_gfx_cmd* cmd = cmd_alloc(cb);
    if (!cmd) return;
    cmd->type = PXL8_GFX_CMD_SET_BINDINGS;
    cmd->set_bindings.bindings = bindings;
}
/*
 * Records a SET_VIEWPORT command with origin (`x`, `y`) and size `w` x `h`.
 *
 * Nothing is recorded when `cb` is NULL or no command slot could be reserved.
 */
void pxl8_set_viewport(pxl8_gfx_cmdbuf* cb, i32 x, i32 y, u32 w, u32 h) {
    if (!cb) return;
    pxl8_gfx_cmd* cmd = cmd_alloc(cb);
    if (!cmd) return;
    cmd->type = PXL8_GFX_CMD_SET_VIEWPORT;
    cmd->set_viewport.x = x;
    cmd->set_viewport.y = y;
    cmd->set_viewport.w = w;
    cmd->set_viewport.h = h;
}
/*
 * Records a SET_SCISSOR command with origin (`x`, `y`) and size `w` x `h`.
 *
 * Nothing is recorded when `cb` is NULL or no command slot could be reserved.
 */
void pxl8_set_scissor(pxl8_gfx_cmdbuf* cb, i32 x, i32 y, u32 w, u32 h) {
    if (!cb) return;
    pxl8_gfx_cmd* cmd = cmd_alloc(cb);
    if (!cmd) return;
    cmd->type = PXL8_GFX_CMD_SET_SCISSOR;
    cmd->set_scissor.x = x;
    cmd->set_scissor.y = y;
    cmd->set_scissor.w = w;
    cmd->set_scissor.h = h;
}
/*
 * Records a SET_DRAW_PARAMS command holding a copy of `*p`; the caller's
 * struct does not need to outlive this call.
 *
 * Nothing is recorded when `cb` or `p` is NULL, or when no command slot could
 * be reserved.
 */
void pxl8_set_draw_params(pxl8_gfx_cmdbuf* cb, const pxl8_gfx_cmd_draw_params* p) {
    if (!cb || !p) return;
    pxl8_gfx_cmd* cmd = cmd_alloc(cb);
    if (!cmd) return;
    cmd->type = PXL8_GFX_CMD_SET_DRAW_PARAMS;
    cmd->draw_params = *p;
}
/*
 * Records a DRAW command.
 *
 *   vb          vertex buffer handle
 *   ib          index buffer handle (an invalid handle selects non-indexed
 *               drawing when the command is executed)
 *   first       first index of the range to draw
 *   count       number of indices in the range
 *   base_vertex value added to every index before the vertex lookup
 *
 * Nothing is recorded when `cb` is NULL or no command slot could be reserved.
 */
void pxl8_draw(pxl8_gfx_cmdbuf* cb, pxl8_gfx_buffer vb, pxl8_gfx_buffer ib, u32 first, u32 count, u32 base_vertex) {
    if (!cb) return;
    pxl8_gfx_cmd* cmd = cmd_alloc(cb);
    if (!cmd) return;
    cmd->type = PXL8_GFX_CMD_DRAW;
    cmd->draw.vertex_buffer = vb;
    cmd->draw.index_buffer = ib;
    cmd->draw.first_index = first;
    cmd->draw.index_count = count;
    cmd->draw.base_vertex = base_vertex;
}
/*
 * Executes one recorded draw command using the renderer's current pass,
 * pipeline, bindings, viewport, scissor and draw parameters.
 *
 * Outline:
 *   1. Validate the vertex buffer, pass, pipeline and the pass's color and
 *      depth textures.
 *   2. Build the clip rectangle: viewport, intersected with the scissor (when
 *      it has a non-zero size), intersected with the color texture bounds.
 *   3. Collect shader bindings/uniforms from the pipeline and bindings slots.
 *   4. For each triangle: transform to clip space, clip against the near
 *      plane, then either draw its outline (wireframe fill mode) or set it up
 *      and rasterize it.
 *
 * Any invalid handle, an empty viewport or clip rectangle, or a pipeline
 * without a shader turns the draw into a no-op.
 *
 * Vertex indices are kept in 32 bits after adding base_vertex so that
 * out-of-range values are rejected by the bounds check instead of wrapping
 * at 16 bits onto unrelated vertices. A trailing group of fewer than three
 * indices is ignored.
 */
static void execute_draw(
    pxl8_renderer* r,
    const pxl8_gfx_cmd_draw* cmd
) {
    if (!VALID_BUF(r, cmd->vertex_buffer)) return;
    bool use_indices = pxl8_gfx_handle_valid(cmd->index_buffer) && VALID_BUF(r, cmd->index_buffer);
    if (!VALID_PASS(r, r->current_pass)) return;
    if (!VALID_PIPELINE(r, r->current_pipeline)) return;

    u64 exec_start = STATS_START();
    STATS_INC(&r->stats, draw_calls, 1);

    buffer_slot* vb = &r->buffers[SLOT_INDEX(cmd->vertex_buffer.id)];
    buffer_slot* ib = use_indices ? &r->buffers[SLOT_INDEX(cmd->index_buffer.id)] : NULL;
    pass_slot* pass = &r->passes[SLOT_INDEX(r->current_pass.id)];
    pipeline_slot* pip = &r->pipelines[SLOT_INDEX(r->current_pipeline.id)];

    if (!VALID_TEX(r, pass->desc.color.texture)) {
        pxl8_error("draw: invalid color texture");
        STATS_ADD(&r->stats, execute_draw_ns, exec_start);
        return;
    }
    if (!VALID_TEX(r, pass->desc.depth.texture)) {
        pxl8_error("draw: invalid depth texture");
        STATS_ADD(&r->stats, execute_draw_ns, exec_start);
        return;
    }

    // NOTE(review): the color data is treated as 8-bit pixels and the depth
    // data as 16-bit values without checking the texture formats, and only the
    // color texture's dimensions are used below -- confirm that passes are
    // always created with matching INDEXED8 / DEPTH16 attachments.
    texture_slot* color_tex = &r->textures[SLOT_INDEX(pass->desc.color.texture.id)];
    texture_slot* depth_tex = &r->textures[SLOT_INDEX(pass->desc.depth.texture.id)];
    u8* fb = color_tex->data;
    u16* zb = depth_tex->data;
    u32 fb_w = color_tex->width;
    u32 fb_h = color_tex->height;

    if (r->viewport_w == 0 || r->viewport_h == 0) {
        STATS_ADD(&r->stats, execute_draw_ns, exec_start);
        return;
    }

    i32 vp_x = r->viewport_x;
    i32 vp_y = r->viewport_y;
    u32 vp_w = r->viewport_w;
    u32 vp_h = r->viewport_h;

    // Clip rectangle (inclusive bounds): start from the viewport ...
    i32 clip_min_x = vp_x;
    i32 clip_min_y = vp_y;
    i32 clip_max_x = vp_x + (i32)vp_w - 1;
    i32 clip_max_y = vp_y + (i32)vp_h - 1;

    // ... shrink to the scissor when one with a non-zero size is set ...
    if (r->scissor_w > 0 && r->scissor_h > 0) {
        i32 sc_min_x = r->scissor_x;
        i32 sc_min_y = r->scissor_y;
        i32 sc_max_x = r->scissor_x + (i32)r->scissor_w - 1;
        i32 sc_max_y = r->scissor_y + (i32)r->scissor_h - 1;
        if (sc_min_x > clip_min_x) clip_min_x = sc_min_x;
        if (sc_min_y > clip_min_y) clip_min_y = sc_min_y;
        if (sc_max_x < clip_max_x) clip_max_x = sc_max_x;
        if (sc_max_y < clip_max_y) clip_max_y = sc_max_y;
    }

    // ... and finally to the color texture itself.
    if (clip_min_x < 0) clip_min_x = 0;
    if (clip_min_y < 0) clip_min_y = 0;
    if (clip_max_x >= (i32)fb_w) clip_max_x = (i32)fb_w - 1;
    if (clip_max_y >= (i32)fb_h) clip_max_y = (i32)fb_h - 1;
    if (clip_min_x > clip_max_x || clip_min_y > clip_max_y) {
        STATS_ADD(&r->stats, execute_draw_ns, exec_start);
        return;
    }

    const pxl8_vertex* vertices = vb->data;
    const u16* indices = use_indices ? ib->data : NULL;

    pxl8_mat4 mv = pxl8_mat4_multiply(r->current_draw_params.view, r->current_draw_params.model);
    pxl8_mat4 mvp = pxl8_mat4_multiply(r->current_draw_params.projection, mv);

    // Hard-coded near-plane distance passed to the clipper; it is not derived
    // from the projection matrix. (Named near_plane because `near` is a macro
    // in legacy Windows headers.)
    f32 near_plane = 0.1f;

    pxl8_shader_fn shader = pip->desc.shader;
    if (!shader) {
        STATS_ADD(&r->stats, execute_draw_ns, exec_start);
        return;
    }

    pxl8_shader_bindings shader_bindings = {0};
    pxl8_shader_uniforms shader_uniforms = r->current_draw_params.shader;
    // Pipeline state overrides whatever the draw params carried for these two.
    shader_uniforms.dither = pip->desc.dither;
    shader_uniforms.emissive = pip->desc.emissive;

    if (VALID_BINDINGS(r, r->current_bindings)) {
        bindings_slot* bnd = &r->bindings[SLOT_INDEX(r->current_bindings.id)];
        shader_bindings.colormap = (const u8*)bnd->desc.colormap;
        shader_bindings.palette = bnd->desc.palette;
        const pxl8_atlas* atlas = bnd->desc.atlas;
        // UINT32_MAX as texture_id means "no texture bound".
        if (atlas && bnd->desc.texture_id != UINT32_MAX) {
            const pxl8_atlas_entry* entry = pxl8_atlas_get_entry(atlas, bnd->desc.texture_id);
            if (entry && entry->active) {
                shader_bindings.atlas = pxl8_atlas_get_pixels_tiled(atlas);
                shader_bindings.width = (u32)entry->w;
                shader_bindings.height = (u32)entry->h;
                shader_bindings.use_tiled = true;
                shader_bindings.tiled.base = entry->tiled_base;
                shader_bindings.tiled.log2_w = entry->log2_w;
            }
        }
    }

    bool double_sided = pip->desc.double_sided;
    pxl8_gfx_cull_mode cull_mode = pip->desc.rasterizer.cull;
    bool is_wireframe = pip->desc.rasterizer.fill == PXL8_GFX_FILL_WIREFRAME;

    // Element counts are loop-invariant; compute them once.
    const size_t vertex_count = vb->size / sizeof(pxl8_vertex);
    const size_t index_capacity = use_indices ? ib->size / sizeof(u16) : 0;
    const u32 draw_end = cmd->first_index + cmd->index_count;

    for (u32 i = cmd->first_index; i < draw_end; i += 3) {
        // Fewer than three indices left: not a triangle. Stopping here also
        // guarantees that `i += 3` can never step past draw_end and wrap.
        if (draw_end - i < 3) break;

        STATS_INC(&r->stats, triangles, 1);

        u32 i0, i1, i2;
        if (use_indices) {
            if ((size_t)i + 2 >= index_capacity) break;
            i0 = (u32)indices[i] + cmd->base_vertex;
            i1 = (u32)indices[i + 1] + cmd->base_vertex;
            i2 = (u32)indices[i + 2] + cmd->base_vertex;
        } else {
            i0 = i + cmd->base_vertex;
            i1 = i + 1 + cmd->base_vertex;
            i2 = i + 2 + cmd->base_vertex;
        }

        // Skip (rather than abort on) triangles referencing missing vertices.
        if (i0 >= vertex_count) continue;
        if (i1 >= vertex_count) continue;
        if (i2 >= vertex_count) continue;

        const pxl8_vertex* v0 = &vertices[i0];
        const pxl8_vertex* v1 = &vertices[i1];
        const pxl8_vertex* v2 = &vertices[i2];

        raster_vertex rv0, rv1, rv2;

        pxl8_vec4 p0 = {v0->position.x, v0->position.y, v0->position.z, 1.0f};
        pxl8_vec4 p1 = {v1->position.x, v1->position.y, v1->position.z, 1.0f};
        pxl8_vec4 p2 = {v2->position.x, v2->position.y, v2->position.z, 1.0f};

        rv0.clip_pos = pxl8_mat4_multiply_vec4(mvp, p0);
        rv1.clip_pos = pxl8_mat4_multiply_vec4(mvp, p1);
        rv2.clip_pos = pxl8_mat4_multiply_vec4(mvp, p2);

        if (!is_wireframe) {
            // Filled triangles need the full set of attributes for shading.
            pxl8_vec4 w0 = pxl8_mat4_multiply_vec4(r->current_draw_params.model, p0);
            pxl8_vec4 w1 = pxl8_mat4_multiply_vec4(r->current_draw_params.model, p1);
            pxl8_vec4 w2 = pxl8_mat4_multiply_vec4(r->current_draw_params.model, p2);
            rv0.world_pos = (pxl8_vec3){w0.x, w0.y, w0.z};
            rv1.world_pos = (pxl8_vec3){w1.x, w1.y, w1.z};
            rv2.world_pos = (pxl8_vec3){w2.x, w2.y, w2.z};
            rv0.normal = pxl8_vec3_normalize(pxl8_mat4_multiply_vec3(r->current_draw_params.model, v0->normal));
            rv1.normal = pxl8_vec3_normalize(pxl8_mat4_multiply_vec3(r->current_draw_params.model, v1->normal));
            rv2.normal = pxl8_vec3_normalize(pxl8_mat4_multiply_vec3(r->current_draw_params.model, v2->normal));
            rv0.u = v0->u; rv0.v = v0->v;
            rv1.u = v1->u; rv1.v = v1->v;
            rv2.u = v2->u; rv2.v = v2->v;
            rv0.color = v0->color;
            rv1.color = v1->color;
            rv2.color = v2->color;
            rv0.light = v0->light;
            rv1.light = v1->light;
            rv2.light = v2->light;
        } else {
            // Wireframe only uses clip positions; zero the rest so the
            // clipper never reads uninitialized attributes.
            rv0.world_pos = (pxl8_vec3){0, 0, 0};
            rv1.world_pos = (pxl8_vec3){0, 0, 0};
            rv2.world_pos = (pxl8_vec3){0, 0, 0};
            rv0.normal = (pxl8_vec3){0, 0, 0};
            rv1.normal = (pxl8_vec3){0, 0, 0};
            rv2.normal = (pxl8_vec3){0, 0, 0};
            rv0.u = 0.0f; rv0.v = 0.0f;
            rv1.u = 0.0f; rv1.v = 0.0f;
            rv2.u = 0.0f; rv2.v = 0.0f;
            rv0.color = 0; rv1.color = 0; rv2.color = 0;
            rv0.light = 0; rv1.light = 0; rv2.light = 0;
        }

        // The clipper's output is consumed three vertices at a time; the
        // buffer holds up to six vertices, i.e. two triangles.
        raster_vertex clipped[6];
        i32 clipped_count = clip_triangle_near(&rv0, &rv1, &rv2, near_plane, clipped);

        for (i32 t = 0; t < clipped_count; t += 3) {
            STATS_INC(&r->stats, clipped_triangles, 1);

            if (is_wireframe) {
                // Perspective divide + viewport transform (y flipped so +y is up).
                f32 hw = (f32)vp_w * 0.5f;
                f32 hh = (f32)vp_h * 0.5f;
                raster_vertex* wv0 = &clipped[t];
                raster_vertex* wv1 = &clipped[t+1];
                raster_vertex* wv2 = &clipped[t+2];
                i32 sx0 = (i32)((f32)vp_x + hw + wv0->clip_pos.x / wv0->clip_pos.w * hw);
                i32 sy0 = (i32)((f32)vp_y + hh - wv0->clip_pos.y / wv0->clip_pos.w * hh);
                i32 sx1 = (i32)((f32)vp_x + hw + wv1->clip_pos.x / wv1->clip_pos.w * hw);
                i32 sy1 = (i32)((f32)vp_y + hh - wv1->clip_pos.y / wv1->clip_pos.w * hh);
                i32 sx2 = (i32)((f32)vp_x + hw + wv2->clip_pos.x / wv2->clip_pos.w * hw);
                i32 sy2 = (i32)((f32)vp_y + hh - wv2->clip_pos.y / wv2->clip_pos.w * hh);

                // Sign of the screen-space 2D cross product gives the winding.
                f32 cross = (f32)(sx1 - sx0) * (f32)(sy2 - sy0) - (f32)(sy1 - sy0) * (f32)(sx2 - sx0);
                if (!double_sided) {
                    if (cull_mode == PXL8_GFX_CULL_BACK && cross >= 0.0f) continue;
                    if (cull_mode == PXL8_GFX_CULL_FRONT && cross <= 0.0f) continue;
                }

                // Outline color comes from the first source vertex; color 0
                // falls back to palette index 15.
                u8 wire_color = v0->color ? v0->color : 15;
                draw_line_clipped(fb, fb_w, fb_h, sx0, sy0, sx1, sy1, wire_color,
                                  clip_min_x, clip_min_y, clip_max_x, clip_max_y, &r->stats);
                draw_line_clipped(fb, fb_w, fb_h, sx1, sy1, sx2, sy2, wire_color,
                                  clip_min_x, clip_min_y, clip_max_x, clip_max_y, &r->stats);
                draw_line_clipped(fb, fb_w, fb_h, sx2, sy2, sx0, sy0, wire_color,
                                  clip_min_x, clip_min_y, clip_max_x, clip_max_y, &r->stats);
            } else {
                tri_setup setup;
                // setup_tri returning false means there is nothing to
                // rasterize for this triangle.
                if (!setup_tri(&setup, &clipped[t], &clipped[t+1], &clipped[t+2],
                               vp_x, vp_y, vp_w, vp_h,
                               clip_min_x, clip_min_y, clip_max_x, clip_max_y,
                               cull_mode, double_sided)) {
                    continue;
                }
                u64 raster_start = STATS_START();
                rasterize_triangle(&setup, fb, zb, fb_w, shader, &pip->desc,
                                   &shader_bindings, &shader_uniforms, &r->stats);
                STATS_ADD(&r->stats, raster_ns, raster_start);
            }
        }
    }

    STATS_ADD(&r->stats, execute_draw_ns, exec_start);
}
void pxl8_gfx_submit(pxl8_renderer* r, pxl8_gfx_cmdbuf* cb) {
u64 submit_start = STATS_START();
for (u32 i = 0; i < cb->count; i++) {
pxl8_gfx_cmd* cmd = &cb->commands[i];
switch (cmd->type) {
case PXL8_GFX_CMD_BEGIN_PASS:
r->current_pass = cmd->begin_pass.pass;
if (VALID_PASS(r, cmd->begin_pass.pass)) {
pass_slot* p = &r->passes[SLOT_INDEX(cmd->begin_pass.pass.id)];
if (p->desc.color.load == PXL8_GFX_LOAD_CLEAR) {
pxl8_clear(r, p->desc.color.texture, p->desc.color.clear_value);
}
if (p->desc.depth.load == PXL8_GFX_LOAD_CLEAR) {
pxl8_clear_depth(r, p->desc.depth.texture);
}
if (p->desc.light_accum.load == PXL8_GFX_LOAD_CLEAR) {
pxl8_clear_light(r, p->desc.light_accum.texture);
}
}
break;
case PXL8_GFX_CMD_END_PASS:
r->current_pass = (pxl8_gfx_pass){ PXL8_GFX_INVALID_ID };
break;
case PXL8_GFX_CMD_SET_PIPELINE:
r->current_pipeline = cmd->set_pipeline.pipeline;
break;
case PXL8_GFX_CMD_SET_BINDINGS:
r->current_bindings = cmd->set_bindings.bindings;
break;
case PXL8_GFX_CMD_SET_VIEWPORT:
r->viewport_x = cmd->set_viewport.x;
r->viewport_y = cmd->set_viewport.y;
r->viewport_w = cmd->set_viewport.w;
r->viewport_h = cmd->set_viewport.h;
break;
case PXL8_GFX_CMD_SET_SCISSOR:
r->scissor_x = cmd->set_scissor.x;
r->scissor_y = cmd->set_scissor.y;
r->scissor_w = cmd->set_scissor.w;
r->scissor_h = cmd->set_scissor.h;
break;
case PXL8_GFX_CMD_SET_DRAW_PARAMS:
r->current_draw_params = cmd->draw_params;
break;
case PXL8_GFX_CMD_DRAW:
execute_draw(r, &cmd->draw);
break;
case PXL8_GFX_CMD_RESOLVE:
break;
}
}
for (u32 i = 0; i < PXL8_GFX_MAX_BUFFERS; i++) {
if (r->buffers[i].active && r->buffers[i].usage == PXL8_GFX_USAGE_STREAM) {
r->buffers[i].append_pos = 0;
}
}
STATS_ADD(&r->stats, submit_ns, submit_start);
}
/*
 * Fills every pixel of an INDEXED8 texture with palette index `color`.
 * Invalid handles and textures of any other format are left untouched.
 */
void pxl8_clear(pxl8_renderer* r, pxl8_gfx_texture target, u8 color) {
    if (!VALID_TEX(r, target)) return;

    texture_slot* tex = &r->textures[SLOT_INDEX(target.id)];
    if (tex->format != PXL8_GFX_FORMAT_INDEXED8) return;

    // One byte per pixel.
    memset(tex->data, color, tex->width * tex->height);
}
/*
 * Resets a DEPTH16 texture by setting every byte to 0xFF, i.e. every 16-bit
 * depth value to 0xFFFF. Invalid handles and textures of any other format are
 * left untouched.
 */
void pxl8_clear_depth(pxl8_renderer* r, pxl8_gfx_texture target) {
    if (!VALID_TEX(r, target)) return;

    texture_slot* tex = &r->textures[SLOT_INDEX(target.id)];
    if (tex->format != PXL8_GFX_FORMAT_DEPTH16) return;

    // Two bytes per depth sample.
    memset(tex->data, 0xFF, tex->width * tex->height * 2);
}
/*
 * Zeroes a LIGHT_ACCUM texture. Invalid handles and textures of any other
 * format are left untouched.
 */
void pxl8_clear_light(pxl8_renderer* r, pxl8_gfx_texture target) {
    if (!VALID_TEX(r, target)) return;

    texture_slot* tex = &r->textures[SLOT_INDEX(target.id)];
    if (tex->format != PXL8_GFX_FORMAT_LIGHT_ACCUM) return;

    // Four bytes per texel.
    memset(tex->data, 0, tex->width * tex->height * 4);
}
/*
 * Writes palette index `color` at (`x`, `y`) of an INDEXED8 texture.
 * Out-of-bounds coordinates, invalid handles and other formats are ignored.
 */
void pxl8_draw_pixel(pxl8_renderer* r, pxl8_gfx_texture target, i32 x, i32 y, u8 color) {
    if (!VALID_TEX(r, target)) return;

    texture_slot* tex = &r->textures[SLOT_INDEX(target.id)];
    bool inside = x >= 0 && y >= 0 && (u32)x < tex->width && (u32)y < tex->height;
    if (!inside) return;
    if (tex->format != PXL8_GFX_FORMAT_INDEXED8) return;

    u8* pixels = tex->data;
    pixels[y * tex->width + x] = color;
}
/*
 * Reads the palette index at (`x`, `y`) of an INDEXED8 texture.
 * Returns 0 for out-of-bounds coordinates, invalid handles and other formats,
 * so a 0 result does not by itself prove that the pixel holds index 0.
 */
u8 pxl8_get_pixel(pxl8_renderer* r, pxl8_gfx_texture target, i32 x, i32 y) {
    if (!VALID_TEX(r, target)) return 0;

    texture_slot* tex = &r->textures[SLOT_INDEX(target.id)];
    bool inside = x >= 0 && y >= 0 && (u32)x < tex->width && (u32)y < tex->height;
    if (!inside) return 0;
    if (tex->format != PXL8_GFX_FORMAT_INDEXED8) return 0;

    const u8* pixels = tex->data;
    return pixels[y * tex->width + x];
}
/*
 * Draws a line from (`x0`, `y0`) to (`x1`, `y1`), both endpoints included,
 * into an INDEXED8 texture using an integer error-accumulating stepper.
 *
 * Every visited point is bounds-checked individually, so lines may start or
 * end outside the texture; the off-texture part is walked but not drawn.
 * Invalid handles and other formats are ignored.
 */
void pxl8_draw_line(pxl8_renderer* r, pxl8_gfx_texture target, i32 x0, i32 y0, i32 x1, i32 y1, u8 color) {
    if (!VALID_TEX(r, target)) return;

    texture_slot* tex = &r->textures[SLOT_INDEX(target.id)];
    if (tex->format != PXL8_GFX_FORMAT_INDEXED8) return;

    u8* pixels = tex->data;
    const i32 width = (i32)tex->width;
    const i32 height = (i32)tex->height;

    // delta_y is kept negative so both error tests below are plain compares.
    const i32 delta_x = abs(x1 - x0);
    const i32 delta_y = -abs(y1 - y0);

    i32 step_x = -1;
    if (x0 < x1) step_x = 1;
    i32 step_y = -1;
    if (y0 < y1) step_y = 1;

    i32 error = delta_x + delta_y;
    i32 px = x0;
    i32 py = y0;

    for (;;) {
        bool visible = px >= 0 && px < width && py >= 0 && py < height;
        if (visible) {
            pixels[py * width + px] = color;
        }

        if (px == x1 && py == y1) return;

        const i32 doubled = 2 * error;
        if (doubled >= delta_y) {
            error += delta_y;
            px += step_x;
        }
        if (doubled <= delta_x) {
            error += delta_x;
            py += step_y;
        }
    }
}
/*
 * Draws the one-pixel outline of the rectangle whose top-left corner is
 * (`x`, `y`) and whose size is `w` x `h` pixels.
 *
 * A rectangle with a zero or negative width or height covers no pixels and
 * draws nothing. Without this check the far edges would be computed as
 * x + w - 1 < x (or y + h - 1 < y) and stray lines would be drawn outside the
 * requested area. Clipping and handle/format validation are done by
 * pxl8_draw_line.
 */
void pxl8_draw_rect(pxl8_renderer* r, pxl8_gfx_texture target, i32 x, i32 y, i32 w, i32 h, u8 color) {
    if (w <= 0 || h <= 0) return;

    const i32 right = x + w - 1;
    const i32 bottom = y + h - 1;

    pxl8_draw_line(r, target, x, y, right, y, color);           // top
    pxl8_draw_line(r, target, right, y, right, bottom, color);  // right
    pxl8_draw_line(r, target, right, bottom, x, bottom, color); // bottom
    pxl8_draw_line(r, target, x, bottom, x, y, color);          // left
}
/*
 * Fills the rectangle with top-left corner (`x`, `y`) and size `w` x `h` in
 * an INDEXED8 texture with palette index `color`.
 *
 * The rectangle is clipped to the texture. If nothing remains after clipping
 * (rectangle entirely outside the texture, or a zero/negative size) the call
 * is a no-op. This check is required for memory safety: an empty x-range
 * would otherwise give a negative span that converts to a huge size_t in the
 * memset below. Invalid handles and other formats are ignored.
 */
void pxl8_draw_rect_fill(pxl8_renderer* r, pxl8_gfx_texture target, i32 x, i32 y, i32 w, i32 h, u8 color) {
    if (!VALID_TEX(r, target)) return;
    texture_slot* s = &r->textures[SLOT_INDEX(target.id)];
    if (s->format != PXL8_GFX_FORMAT_INDEXED8) return;

    u8* fb = s->data;
    i32 tw = (i32)s->width;
    i32 th = (i32)s->height;

    // Half-open clipped range: [x0, x1) x [y0, y1).
    i32 x0 = x < 0 ? 0 : x;
    i32 y0 = y < 0 ? 0 : y;
    i32 x1 = x + w > tw ? tw : x + w;
    i32 y1 = y + h > th ? th : y + h;

    if (x0 >= x1 || y0 >= y1) return;

    const size_t span = (size_t)(x1 - x0);
    for (i32 py = y0; py < y1; py++) {
        memset(&fb[py * tw + x0], color, span);
    }
}
/*
 * Draws a one-pixel circle outline centred on (`cx`, `cy`) into an INDEXED8
 * texture using an integer midpoint stepper: one octant is walked and each
 * step is mirrored into all eight octants.
 *
 * Each plotted point is bounds-checked, so the circle may extend past the
 * texture. A negative radius draws nothing. Invalid handles and other formats
 * are ignored.
 */
void pxl8_draw_circle(pxl8_renderer* r, pxl8_gfx_texture target, i32 cx, i32 cy, i32 radius, u8 color) {
    if (!VALID_TEX(r, target)) return;

    texture_slot* tex = &r->textures[SLOT_INDEX(target.id)];
    if (tex->format != PXL8_GFX_FORMAT_INDEXED8) return;

    u8* pixels = tex->data;
    const i32 width = (i32)tex->width;
    const i32 height = (i32)tex->height;

    i32 off_x = 0;
    i32 off_y = radius;
    i32 decision = 3 - 2 * radius;

    while (off_y >= off_x) {
        // The eight symmetric points for the current octant offset.
        const i32 xs[8] = {
            cx + off_x, cx - off_x, cx + off_x, cx - off_x,
            cx + off_y, cx - off_y, cx + off_y, cx - off_y
        };
        const i32 ys[8] = {
            cy + off_y, cy + off_y, cy - off_y, cy - off_y,
            cy + off_x, cy + off_x, cy - off_x, cy - off_x
        };
        for (i32 k = 0; k < 8; k++) {
            if (xs[k] >= 0 && xs[k] < width && ys[k] >= 0 && ys[k] < height) {
                pixels[ys[k] * width + xs[k]] = color;
            }
        }

        off_x++;
        if (decision > 0) {
            off_y--;
            decision += 4 * (off_x - off_y) + 10;
        } else {
            decision += 4 * off_x + 6;
        }
    }
}
/*
 * Fills a disc centred on (`cx`, `cy`) in an INDEXED8 texture, one horizontal
 * span per row. The half-width of each span is the truncated value of
 * sqrt(radius^2 - dy^2).
 *
 * Rows and spans are clipped to the texture. A negative radius draws nothing.
 * Invalid handles and other formats are ignored.
 */
void pxl8_draw_circle_fill(pxl8_renderer* r, pxl8_gfx_texture target, i32 cx, i32 cy, i32 radius, u8 color) {
    if (!VALID_TEX(r, target)) return;

    texture_slot* tex = &r->textures[SLOT_INDEX(target.id)];
    if (tex->format != PXL8_GFX_FORMAT_INDEXED8) return;

    u8* pixels = tex->data;
    const i32 width = (i32)tex->width;
    const i32 height = (i32)tex->height;
    const i32 radius_sq = radius * radius;

    for (i32 dy = -radius; dy <= radius; dy++) {
        const i32 row = cy + dy;
        if (row >= 0 && row < height) {
            const i32 half = (i32)sqrtf((f32)(radius_sq - dy * dy));

            // Inclusive span [left, right], clamped to the row.
            i32 left = cx - half;
            i32 right = cx + half;
            left = left < 0 ? 0 : left;
            right = right >= width ? width - 1 : right;

            // left > right means the span lies entirely off the texture.
            if (left <= right) {
                memset(&pixels[row * width + left], color, (size_t)(right - left + 1));
            }
        }
    }
}
/*
 * Expands an 8-bit indexed color texture into 32-bit pixels.
 *
 *   color        texture whose data is read as one palette index per pixel
 *   light_accum  currently unused
 *   palette      lookup table indexed by the 8-bit pixel value, so it must
 *                hold an entry for every index that occurs (up to 256)
 *   output       destination with room for width * height u32 values
 *
 * Each output pixel is the palette entry with its top byte forced to 0xFF.
 * The call is a no-op when the handle is invalid or when `palette`, `output`
 * or the texture's data pointer is NULL.
 *
 * NOTE(review): the texture format is not checked here; the data is assumed
 * to be INDEXED8 -- confirm against callers.
 */
void pxl8_resolve_to_rgba(pxl8_renderer* r, pxl8_gfx_texture color, pxl8_gfx_texture light_accum,
                          const u32* palette, u32* output) {
    (void)light_accum;

    if (!palette || !output) return;
    if (!VALID_TEX(r, color)) return;

    texture_slot* cs = &r->textures[SLOT_INDEX(color.id)];
    const u8* fb = cs->data;
    if (!fb) return;

    u32 w = cs->width;
    u32 h = cs->height;
    u32 total = w * h;
    u32 i = 0;

#if defined(PXL8_SIMD_SSE) || defined(PXL8_SIMD_NEON)
    // Four pixels per iteration; the scalar loop below handles the remainder.
    pxl8_i32_simd alpha_mask = pxl8_i32_simd_set((i32)0xFF000000);
    for (; i + 4 <= total; i += 4) {
        pxl8_i32_simd base = pxl8_i32_simd_set4(
            (i32)palette[fb[i + 0]], (i32)palette[fb[i + 1]],
            (i32)palette[fb[i + 2]], (i32)palette[fb[i + 3]]
        );
        base = pxl8_i32_simd_or(base, alpha_mask);
        pxl8_i32_simd_store((i32*)&output[i], base);
    }
#endif

    // Scalar path: the whole image without SIMD, or the tail after it.
    for (; i < total; i++) {
        output[i] = palette[fb[i]] | 0xFF000000;
    }
}