From a33d4c0068d9ef3997fbe82b373f1a78f42e90ef Mon Sep 17 00:00:00 2001 From: asrael Date: Wed, 3 Dec 2025 17:42:42 -0600 Subject: [PATCH] branchless sprite blit and zbuffer updates --- src/pxl8_blit.c | 58 ++++++++++++++++++++--- src/pxl8_gfx.c | 120 ++++++++++++++++++++++++++++++++++-------------- 2 files changed, 137 insertions(+), 41 deletions(-) diff --git a/src/pxl8_blit.c b/src/pxl8_blit.c index e8bdc88..2bf3d94 100644 --- a/src/pxl8_blit.c +++ b/src/pxl8_blit.c @@ -9,10 +9,29 @@ void pxl8_blit_hicolor(u16* fb, u32 fb_width, const u16* sprite, u32 atlas_width u16* dest_row = dest_base + row * fb_width; const u16* src_row = src_base + row * atlas_width; - for (u32 col = 0; col < w; col++) { - if (src_row[col] != 0) { - dest_row[col] = src_row[col]; + u32 col = 0; + u32 count2 = w / 2; + for (u32 i = 0; i < count2; i++) { + u32 pixels = ((const u32*)src_row)[i]; + if (pixels == 0) { + col += 2; + continue; } + u16 s0 = (u16)(pixels); + u16 s1 = (u16)(pixels >> 16); + u16 d0 = dest_row[col]; + u16 d1 = dest_row[col + 1]; + u16 m0 = (u16)(-(s0 != 0)); + u16 m1 = (u16)(-(s1 != 0)); + dest_row[col] = (s0 & m0) | (d0 & ~m0); + dest_row[col + 1] = (s1 & m1) | (d1 & ~m1); + col += 2; + } + if (w & 1) { + u16 s = src_row[col]; + u16 d = dest_row[col]; + u16 m = (u16)(-(s != 0)); + dest_row[col] = (s & m) | (d & ~m); } } } @@ -26,10 +45,37 @@ void pxl8_blit_indexed(u8* fb, u32 fb_width, const u8* sprite, u32 atlas_width, u8* dest_row = dest_base + row * fb_width; const u8* src_row = src_base + row * atlas_width; - for (u32 col = 0; col < w; col++) { - if (src_row[col] != 0) { - dest_row[col] = src_row[col]; + u32 col = 0; + u32 count4 = w / 4; + for (u32 i = 0; i < count4; i++) { + u32 pixels = ((const u32*)src_row)[i]; + if (pixels == 0) { + col += 4; + continue; } + u8 s0 = (u8)(pixels); + u8 s1 = (u8)(pixels >> 8); + u8 s2 = (u8)(pixels >> 16); + u8 s3 = (u8)(pixels >> 24); + u8 d0 = dest_row[col]; + u8 d1 = dest_row[col + 1]; + u8 d2 = dest_row[col + 2]; + u8 d3 = dest_row[col + 3]; + u8 m0 = (u8)(-(s0 != 0)); + u8 m1 = (u8)(-(s1 != 0)); + u8 m2 = (u8)(-(s2 != 0)); + u8 m3 = (u8)(-(s3 != 0)); + dest_row[col] = (s0 & m0) | (d0 & ~m0); + dest_row[col + 1] = (s1 & m1) | (d1 & ~m1); + dest_row[col + 2] = (s2 & m2) | (d2 & ~m2); + dest_row[col + 3] = (s3 & m3) | (d3 & ~m3); + col += 4; + } + for (; col < w; col++) { + u8 s = src_row[col]; + u8 d = dest_row[col]; + u8 m = (u8)(-(s != 0)); + dest_row[col] = (s & m) | (d & ~m); } } } diff --git a/src/pxl8_gfx.c b/src/pxl8_gfx.c index 8511cfa..c4aba1d 100644 --- a/src/pxl8_gfx.c +++ b/src/pxl8_gfx.c @@ -382,8 +382,14 @@ void pxl8_clear(pxl8_gfx* gfx, u32 color) { if (gfx->pixel_mode == PXL8_PIXEL_HICOLOR) { u16* fb16 = (u16*)gfx->framebuffer; u16 color16 = pxl8_rgba32_to_rgb565(color); - for (i32 i = 0; i < size; i++) { - fb16[i] = color16; + u32 pattern = (u32)color16 | ((u32)color16 << 16); + u32* fb32 = (u32*)fb16; + i32 count2 = size / 2; + for (i32 i = 0; i < count2; i++) { + fb32[i] = pattern; + } + if (size & 1) { + fb16[size - 1] = color16; } } else { memset(gfx->framebuffer, color & 0xFF, size); @@ -466,9 +472,30 @@ void pxl8_rect_fill(pxl8_gfx* gfx, i32 x, i32 y, i32 w, i32 h, u32 color) { i32 x1 = (x + w > gfx->framebuffer_width) ? gfx->framebuffer_width : x + w; i32 y1 = (y + h > gfx->framebuffer_height) ? gfx->framebuffer_height : y + h; - for (i32 py = y0; py < y1; py++) { - for (i32 px = x0; px < x1; px++) { - pxl8_pixel_unchecked(gfx, px, py, color); + i32 rect_w = x1 - x0; + if (rect_w <= 0 || y1 <= y0) return; + + if (gfx->pixel_mode == PXL8_PIXEL_HICOLOR) { + u16* fb16 = (u16*)gfx->framebuffer; + u16 color16 = pxl8_rgba32_to_rgb565(color); + u32 pattern = (u32)color16 | ((u32)color16 << 16); + + for (i32 py = y0; py < y1; py++) { + u16* row = fb16 + py * gfx->framebuffer_width + x0; + i32 count2 = rect_w / 2; + u32* row32 = (u32*)row; + for (i32 i = 0; i < count2; i++) { + row32[i] = pattern; + } + if (rect_w & 1) { + row[rect_w - 1] = color16; + } + } + } else { + u8 color8 = color & 0xFF; + for (i32 py = y0; py < y1; py++) { + u8* row = gfx->framebuffer + py * gfx->framebuffer_width + x0; + memset(row, color8, rect_w); } } } @@ -617,15 +644,15 @@ void pxl8_sprite(pxl8_gfx* gfx, u32 sprite_id, i32 x, i32 y, i32 w, i32 h) { i32 dest_idx = (dest_y + py) * gfx->framebuffer_width + (dest_x + px); if (gfx->pixel_mode == PXL8_PIXEL_HICOLOR) { - u16 pixel = ((const u16*)atlas_pixels)[src_idx]; - if (pixel != 0) { - ((u16*)gfx->framebuffer)[dest_idx] = pixel; - } + u16 s = ((const u16*)atlas_pixels)[src_idx]; + u16 d = ((u16*)gfx->framebuffer)[dest_idx]; + u16 m = (u16)(-(s != 0)); + ((u16*)gfx->framebuffer)[dest_idx] = (s & m) | (d & ~m); } else { - u8 pixel = atlas_pixels[src_idx]; - if (pixel != 0) { - gfx->framebuffer[dest_idx] = pixel; - } + u8 s = atlas_pixels[src_idx]; + u8 d = gfx->framebuffer[dest_idx]; + u8 m = (u8)(-(s != 0)); + gfx->framebuffer[dest_idx] = (s & m) | (d & ~m); } } } @@ -802,9 +829,8 @@ void pxl8_3d_clear_zbuffer(pxl8_gfx* gfx) { i32 count = gfx->zbuffer_width * gfx->zbuffer_height; const f32 far_z = 1e30f; - f32* ptr = gfx->zbuffer; for (i32 i = 0; i < count; i++) { - ptr[i] = far_z; + gfx->zbuffer[i] = far_z; } } @@ -1024,17 +1050,29 @@ static inline void pxl8_fill_scanline_textured( i32 atlas_idx = (atlas_y_base + ty) * atlas_width + (atlas_x_base + tx); if (is_hicolor) { - u16 color = ((const u16*)atlas_pixels)[atlas_idx]; - if (color != 0) { - gfx->zbuffer[idx] = z0; - ((u16*)gfx->framebuffer)[y * gfx->framebuffer_width + xs] = color; - } + u16 s = ((const u16*)atlas_pixels)[atlas_idx]; + u16 d = ((u16*)gfx->framebuffer)[y * gfx->framebuffer_width + xs]; + u16 m = (u16)(-(s != 0)); + ((u16*)gfx->framebuffer)[y * gfx->framebuffer_width + xs] = (s & m) | (d & ~m); + f32 old_z = gfx->zbuffer[idx]; + u32 zm = -(s != 0); + u32 z0_bits, old_z_bits; + memcpy(&z0_bits, &z0, sizeof(u32)); + memcpy(&old_z_bits, &old_z, sizeof(u32)); + u32 new_z_bits = (z0_bits & zm) | (old_z_bits & ~zm); + memcpy(&gfx->zbuffer[idx], &new_z_bits, sizeof(f32)); } else { - u8 color = atlas_pixels[atlas_idx]; - if (color != 0) { - gfx->zbuffer[idx] = z0; - gfx->framebuffer[y * gfx->framebuffer_width + xs] = color; - } + u8 s = atlas_pixels[atlas_idx]; + u8 d = gfx->framebuffer[y * gfx->framebuffer_width + xs]; + u8 m = (u8)(-(s != 0)); + gfx->framebuffer[y * gfx->framebuffer_width + xs] = (s & m) | (d & ~m); + f32 old_z = gfx->zbuffer[idx]; + u32 zm = -(s != 0); + u32 z0_bits, old_z_bits; + memcpy(&z0_bits, &z0, sizeof(u32)); + memcpy(&old_z_bits, &old_z, sizeof(u32)); + u32 new_z_bits = (z0_bits & zm) | (old_z_bits & ~zm); + memcpy(&gfx->zbuffer[idx], &new_z_bits, sizeof(f32)); } } } @@ -1080,17 +1118,29 @@ static inline void pxl8_fill_scanline_textured( i32 atlas_idx = (atlas_y_base + ty) * atlas_width + (atlas_x_base + tx); if (is_hicolor) { - u16 color = ((const u16*)atlas_pixels)[atlas_idx]; - if (color != 0) { - gfx->zbuffer[idx] = z; - ((u16*)gfx->framebuffer)[y * gfx->framebuffer_width + x] = color; - } + u16 s = ((const u16*)atlas_pixels)[atlas_idx]; + u16 d = ((u16*)gfx->framebuffer)[y * gfx->framebuffer_width + x]; + u16 m = (u16)(-(s != 0)); + ((u16*)gfx->framebuffer)[y * gfx->framebuffer_width + x] = (s & m) | (d & ~m); + f32 old_z = gfx->zbuffer[idx]; + u32 zm = -(s != 0); + u32 z_bits, old_z_bits; + memcpy(&z_bits, &z, sizeof(u32)); + memcpy(&old_z_bits, &old_z, sizeof(u32)); + u32 new_z_bits = (z_bits & zm) | (old_z_bits & ~zm); + memcpy(&gfx->zbuffer[idx], &new_z_bits, sizeof(f32)); } else { - u8 color = atlas_pixels[atlas_idx]; - if (color != 0) { - gfx->zbuffer[idx] = z; - gfx->framebuffer[y * gfx->framebuffer_width + x] = color; - } + u8 s = atlas_pixels[atlas_idx]; + u8 d = gfx->framebuffer[y * gfx->framebuffer_width + x]; + u8 m = (u8)(-(s != 0)); + gfx->framebuffer[y * gfx->framebuffer_width + x] = (s & m) | (d & ~m); + f32 old_z = gfx->zbuffer[idx]; + u32 zm = -(s != 0); + u32 z_bits, old_z_bits; + memcpy(&z_bits, &z, sizeof(u32)); + memcpy(&old_z_bits, &old_z, sizeof(u32)); + u32 new_z_bits = (z_bits & zm) | (old_z_bits & ~zm); + memcpy(&gfx->zbuffer[idx], &new_z_bits, sizeof(f32)); } } }