Speed up gfx rasterization and RGBA resolve with SIMD paths; colored lighting (light_accum) is disabled for now

2026-02-05 02:42:58 -06:00 · 2026-02-05 02:42:58 -06:00 · 01e6059dd1
commit 01e6059dd1
parent 3c3e961995
17 changed files with 1055 additions and 250 deletions
--- a/src/gfx/pxl8_render.c
+++ b/src/gfx/pxl8_render.c
@ -296,7 +296,6 @@ static void rasterize_triangle(
    const tri_setup* setup,
    u8* fb,
    u16* zb,
-    u32* light_accum,
    u32 fb_width,
    pxl8_shader_fn shader,
    const pxl8_gfx_pipeline_desc* pipeline,
@ -425,7 +424,6 @@ static void rasterize_triangle(
        u32 row_start = (u32)y * fb_width;
        u8* prow = fb + row_start;
        u16* zrow = zb + row_start;
-        u32* lrow = light_accum ? light_accum + row_start : NULL;

        i32 x = x_start;
        while (x <= x_end) {
@ -433,8 +431,8 @@ static void rasterize_triangle(
            if (span_end > x_end) span_end = x_end;
            i32 span_len = span_end - x + 1;

-            f32 pw_start = pxl8_fast_rcp(wr);
-            f32 pw_end = pxl8_fast_rcp(wr + dwr * (f32)span_len);
+            f32 pw_start = 1.0f / wr;
+            f32 pw_end = 1.0f / (wr + dwr * (f32)span_len);

            f32 u_start = uw * pw_start;
            f32 v_start = vw * pw_start;
@ -471,7 +469,80 @@ static void rasterize_triangle(
            f32 wy_a = wy_start;
            f32 wz_a = wz_start;

-            for (i32 px = x; px <= span_end; px++) {
+            i32 px = x;
+
+#if defined(PXL8_SIMD_SSE) || defined(PXL8_SIMD_NEON)
+            if (depth_test && depth_compare == PXL8_GFX_COMPARE_LESS && !blend_enabled) {
+                pxl8_f32_simd dz4_simd = pxl8_f32_simd_set(dz * 4.0f);
+                pxl8_f32_simd half = pxl8_f32_simd_set(0.5f);
+                pxl8_f32_simd one = pxl8_f32_simd_set(1.0f);
+                pxl8_f32_simd zero = pxl8_f32_simd_zero();
+                pxl8_f32_simd scale65535 = pxl8_f32_simd_set(65535.0f);
+                pxl8_f32_simd z4 = pxl8_f32_simd_set4(z_a, z_a + dz, z_a + dz * 2.0f, z_a + dz * 3.0f);
+
+                pxl8_f32_simd offsets = pxl8_f32_simd_set4(0.0f, 1.0f, 2.0f, 3.0f);
+                f32 du4 = du * 4.0f, dv4 = dv * 4.0f, dl4 = dl * 4.0f, dc4 = dc * 4.0f;
+                f32 dz4 = dz * 4.0f, dwx4 = dwx * 4.0f, dwy4 = dwy * 4.0f, dwz4 = dwz * 4.0f;
+
+                for (; px + 3 <= span_end; px += 4) {
+                    pxl8_f32_simd depth_norm = pxl8_f32_simd_clamp(pxl8_f32_simd_mul(pxl8_f32_simd_add(z4, one), half), zero, one);
+                    pxl8_i32_simd z16_4 = pxl8_f32_simd_to_i32(pxl8_f32_simd_mul(depth_norm, scale65535));
+                    pxl8_i32_simd zbuf = pxl8_i32_simd_set4((i32)zrow[px], (i32)zrow[px+1], (i32)zrow[px+2], (i32)zrow[px+3]);
+                    i32 mask = pxl8_i32_simd_movemask(pxl8_i32_simd_cmpgt(zbuf, z16_4));
+
+                    STATS_INC(stats, depth_tests, 4);
+
+                    if (mask) {
+                        pxl8_shader_ctx frag_ctx = {
+                            .color_count = 4,
+                            .x = pxl8_i32_simd_set4(px, px + 1, px + 2, px + 3),
+                            .y = pxl8_i32_simd_set(y),
+                            .v_uv = {
+                                pxl8_f32_simd_add(pxl8_f32_simd_set(u_a), pxl8_f32_simd_mul(pxl8_f32_simd_set(du), offsets)),
+                                pxl8_f32_simd_add(pxl8_f32_simd_set(v_a), pxl8_f32_simd_mul(pxl8_f32_simd_set(dv), offsets))
+                            },
+                            .v_world = {
+                                pxl8_f32_simd_add(pxl8_f32_simd_set(wx_a), pxl8_f32_simd_mul(pxl8_f32_simd_set(dwx), offsets)),
+                                pxl8_f32_simd_add(pxl8_f32_simd_set(wy_a), pxl8_f32_simd_mul(pxl8_f32_simd_set(dwy), offsets)),
+                                pxl8_f32_simd_add(pxl8_f32_simd_set(wz_a), pxl8_f32_simd_mul(pxl8_f32_simd_set(dwz), offsets))
+                            },
+                            .v_normal = pxl8_vec3_simd_set(setup->normal),
+                            .v_light = pxl8_f32_simd_mul(
+                                pxl8_f32_simd_add(pxl8_f32_simd_set(l_a), pxl8_f32_simd_mul(pxl8_f32_simd_set(dl), offsets)),
+                                pxl8_f32_simd_set(1.0f / 255.0f)
+                            ),
+                            .v_color = pxl8_f32_simd_add(pxl8_f32_simd_set(c_a), pxl8_f32_simd_mul(pxl8_f32_simd_set(dc), offsets)),
+                            .v_depth = z4,
+                        };
+
+                        u8 colors[4];
+                        shader(&frag_ctx, bindings, uniforms, colors);
+                        STATS_INC(stats, shader_calls, 1);
+
+                        i32 z16_arr[4];
+                        pxl8_i32_simd_store(z16_arr, z16_4);
+
+                        for (i32 i = 0; i < 4; i++) {
+                            if (!(mask & (0x8 << (i * 4)))) continue;
+                            STATS_INC(stats, depth_passes, 1);
+
+                            u8 color = colors[i];
+                            if (!(alpha_test && color <= alpha_ref) && color != 0) {
+                                prow[px + i] = color;
+                                if (depth_write) zrow[px + i] = (u16)z16_arr[i];
+                                STATS_INC(stats, pixels_written, 1);
+                            }
+                        }
+                    }
+
+                    u_a += du4; v_a += dv4; l_a += dl4; c_a += dc4;
+                    z_a += dz4; wx_a += dwx4; wy_a += dwy4; wz_a += dwz4;
+                    z4 = pxl8_f32_simd_add(z4, dz4_simd);
+                }
+            }
+#endif
+
+            for (; px <= span_end; px++) {
                f32 depth_norm = pxl8_clamp((z_a + 1.0f) * 0.5f, 0.0f, 1.0f);
                u16 z16 = (u16)(depth_norm * 65535.0f);

@ -480,18 +551,19 @@ static void rasterize_triangle(
                if (depth_pass) {
                    STATS_INC(stats, depth_passes, 1);
                    pxl8_shader_ctx frag_ctx = {
-                        .x = px,
-                        .y = y,
-                        .v_uv = { { u_a, v_a } },
-                        .v_world = { wx_a, wy_a, wz_a },
-                        .v_normal = setup->normal,
-                        .v_light = l_a / 255.0f,
-                        .v_color = c_a,
-                        .v_depth = z_a,
-                        .out_light_color = 0,
+                        .color_count = 1,
+                        .x = pxl8_i32_simd_set(px),
+                        .y = pxl8_i32_simd_set(y),
+                        .v_uv = { pxl8_f32_simd_set(u_a), pxl8_f32_simd_set(v_a) },
+                        .v_world = { pxl8_f32_simd_set(wx_a), pxl8_f32_simd_set(wy_a), pxl8_f32_simd_set(wz_a) },
+                        .v_normal = pxl8_vec3_simd_set(setup->normal),
+                        .v_light = pxl8_f32_simd_set(l_a / 255.0f),
+                        .v_color = pxl8_f32_simd_set(c_a),
+                        .v_depth = pxl8_f32_simd_set(z_a),
                    };

-                    u8 color = shader(&frag_ctx, bindings, uniforms);
+                    u8 color;
+                    shader(&frag_ctx, bindings, uniforms, &color);
                    STATS_INC(stats, shader_calls, 1);

                    if (!(alpha_test && color <= alpha_ref)) {
@ -506,10 +578,6 @@ static void rasterize_triangle(
                                zrow[px] = z16;
                            }
                            STATS_INC(stats, pixels_written, 1);
-                            if (lrow && frag_ctx.out_light_color != 0) {
-                                lrow[px] = frag_ctx.out_light_color;
-                                STATS_INC(stats, light_writes, 1);
-                            }
                        }
                    }
                }
@ -1080,12 +1148,6 @@ static void execute_draw(
        return;
    }

-    u32* light_accum = NULL;
-    if (VALID_TEX(r, pass->desc.light_accum.texture)) {
-        texture_slot* light_tex = &r->textures[SLOT_INDEX(pass->desc.light_accum.texture.id)];
-        light_accum = light_tex->data;
-    }
-
    const pxl8_vertex* vertices = vb->data;
    const u16* indices = use_indices ? ib->data : NULL;

@ -1239,7 +1301,7 @@ static void execute_draw(
                }

                u64 raster_start = STATS_START();
-                rasterize_triangle(&setup, fb, zb, light_accum, fb_w, shader, &pip->desc,
+                rasterize_triangle(&setup, fb, zb, fb_w, shader, &pip->desc,
                    &shader_bindings, &shader_uniforms, &r->stats);
                STATS_ADD(&r->stats, raster_ns, raster_start);
            }
@ -1462,41 +1524,27 @@ void pxl8_resolve_to_rgba(pxl8_renderer* r, pxl8_gfx_texture color, pxl8_gfx_tex
    u8* fb = cs->data;
    u32 w = cs->width;
    u32 h = cs->height;
+    u32 total = w * h;

-    u32* light_data = NULL;
-    if (light_accum.id != PXL8_GFX_INVALID_ID && VALID_TEX(r, light_accum)) {
-        light_data = r->textures[SLOT_INDEX(light_accum.id)].data;
+    (void)light_accum;
+
+#if defined(PXL8_SIMD_SSE) || defined(PXL8_SIMD_NEON)
+    pxl8_i32_simd alpha_mask = pxl8_i32_simd_set((i32)0xFF000000);
+    u32 i = 0;
+    for (; i + 4 <= total; i += 4) {
+        pxl8_i32_simd base = pxl8_i32_simd_set4(
+            (i32)palette[fb[i + 0]], (i32)palette[fb[i + 1]],
+            (i32)palette[fb[i + 2]], (i32)palette[fb[i + 3]]
+        );
+        base = pxl8_i32_simd_or(base, alpha_mask);
+        pxl8_i32_simd_store((i32*)&output[i], base);
    }
-
-    for (u32 i = 0; i < w * h; i++) {
-        u8 idx = fb[i];
-        u32 base = palette[idx];
-
-        if (light_data) {
-            u32 lv = light_data[i];
-            u32 la = lv >> 24;
-            if (la > 0) {
-                i32 br = base & 0xFF;
-                i32 bg = (base >> 8) & 0xFF;
-                i32 bb = (base >> 16) & 0xFF;
-
-                i32 lr = lv & 0xFF;
-                i32 lg = (lv >> 8) & 0xFF;
-                i32 lb = (lv >> 16) & 0xFF;
-
-                f32 t = (f32)la / 255.0f;
-                br += (i32)((f32)(lr - 128) * t * 2.0f);
-                bg += (i32)((f32)(lg - 128) * t * 2.0f);
-                bb += (i32)((f32)(lb - 128) * t * 2.0f);
-
-                br = pxl8_clamp_byte(br);
-                bg = pxl8_clamp_byte(bg);
-                bb = pxl8_clamp_byte(bb);
-
-                base = (u32)br | ((u32)bg << 8) | ((u32)bb << 16) | 0xFF000000;
-            }
-        }
-
-        output[i] = base | 0xFF000000;
+    for (; i < total; i++) {
+        output[i] = palette[fb[i]] | 0xFF000000;
    }
+#else
+    for (u32 i = 0; i < total; i++) {
+        output[i] = palette[fb[i]] | 0xFF000000;
+    }
+#endif
 }