How does blitting work? How can we stare at CFGs to figure out why auto-vectorization goes wrong? Why is MSVC bad?

These and other murder mysteries in part 4 of "Rendering like it's 1996".

Sharing is caring.

https://marioslab.io/posts/rendering-like-its-1996/blistering-fast-blits/

Rendering like it's 1996 - Image file loading and blitting

Load image files and blit them to the screen.

Seriously though, MSVC is really not good at auto-vectorizing trivial loops. It fails to vectorize both `r96_rect()` and `blit()`, but it generates a pretty good vectorized version of `blit_keyed()`, which ends up beating `r96_rect()`. WTF

https://www.godbolt.org/z/s6daoGrWY

Compiler Explorer - C

/* A 2D image of 32-bit pixels. The functions below index `pixels` in
 * row-major order, i.e. pixel (x, y) lives at `pixels[y * width + x]`. */
typedef struct r96_image {
    int32_t width, height;
    uint32_t *pixels;
} r96_image;

/*
 * Fills the rectangle with top-left corner (x1, y1) and the given size with
 * `color`, clipped to the bounds of `image`.
 *
 * Nothing is written when `width` or `height` is not positive, or when the
 * rectangle lies entirely outside the image.
 */
void r96_rect(r96_image *image, int32_t x1, int32_t y1, int32_t width, int32_t height, uint32_t color) {
    if (width <= 0 || height <= 0) return;

    /* Inclusive bottom-right corner of the unclipped rectangle. */
    int32_t x2 = x1 + width - 1;
    int32_t y2 = y1 + height - 1;

    /* Trivial reject: the rectangle does not touch the image at all. */
    if (x1 >= image->width || x2 < 0 || y1 >= image->height || y2 < 0) return;

    /* Clamp each edge to the image bounds. */
    int32_t left = x1 < 0 ? 0 : x1;
    int32_t top = y1 < 0 ? 0 : y1;
    int32_t right = x2 >= image->width ? image->width - 1 : x2;
    int32_t bottom = y2 >= image->height ? image->height - 1 : y2;

    int32_t span = right - left + 1;
    /* Read the stride once, before any pixel is written. */
    int32_t stride = image->width;
    uint32_t *row = image->pixels + top * stride + left;

    for (int32_t line = top; line <= bottom; line++) {
        for (int32_t col = 0; col < span; col++) {
            row[col] = color;
        }
        row += stride;
    }
}

/*
 * Copies all of `src` into `dst` so that the top-left pixel of `src` lands at
 * (x, y) in `dst`. The copied region is clipped to the bounds of `dst`;
 * nothing is written when `src` would fall entirely outside of `dst`.
 */
void blit(r96_image *dst, r96_image *src, int x, int y) {
    /* Inclusive destination rectangle covered by the unclipped source. */
    int32_t dst_left = x;
    int32_t dst_top = y;
    int32_t dst_right = x + src->width - 1;
    int32_t dst_bottom = y + src->height - 1;

    if (dst_left >= dst->width || dst_right < 0 || dst_top >= dst->height || dst_bottom < 0) return;

    /* Columns/rows cut off on the left/top are skipped in the source too. */
    int32_t src_left = 0;
    int32_t src_top = 0;
    if (dst_left < 0) {
        src_left = -dst_left;
        dst_left = 0;
    }
    if (dst_top < 0) {
        src_top = -dst_top;
        dst_top = 0;
    }
    if (dst_right >= dst->width) dst_right = dst->width - 1;
    if (dst_bottom >= dst->height) dst_bottom = dst->height - 1;

    int32_t span = dst_right - dst_left + 1;
    /* Read both strides once, before any pixel is written. */
    int32_t dst_stride = dst->width;
    int32_t src_stride = src->width;
    uint32_t *dst_row = dst->pixels + dst_top * dst_stride + dst_left;
    uint32_t *src_row = src->pixels + src_top * src_stride + src_left;

    for (int32_t line = dst_top; line <= dst_bottom; line++) {
        for (int32_t col = 0; col < span; col++) {
            dst_row[col] = src_row[col];
        }
        dst_row += dst_stride;
        src_row += src_stride;
    }
}

/*
 * Like blit(), but source pixels equal to `color_key` are treated as
 * transparent: the destination pixel keeps its current value there.
 */
void blit_keyed(r96_image *dst, r96_image *src, int x, int y, uint32_t color_key) {
    /* Inclusive destination rectangle covered by the unclipped source. */
    int32_t dst_left = x;
    int32_t dst_top = y;
    int32_t dst_right = x + src->width - 1;
    int32_t dst_bottom = y + src->height - 1;

    if (dst_left >= dst->width || dst_right < 0 || dst_top >= dst->height || dst_bottom < 0) return;

    /* Columns/rows cut off on the left/top are skipped in the source too. */
    int32_t src_left = 0;
    int32_t src_top = 0;
    if (dst_left < 0) {
        src_left = -dst_left;
        dst_left = 0;
    }
    if (dst_top < 0) {
        src_top = -dst_top;
        dst_top = 0;
    }
    if (dst_right >= dst->width) dst_right = dst->width - 1;
    if (dst_bottom >= dst->height) dst_bottom = dst->height - 1;

    int32_t span = dst_right - dst_left + 1;
    /* Read both strides once, before any pixel is written. */
    int32_t dst_stride = dst->width;
    int32_t src_stride = src->width;
    uint32_t *dst_row = dst->pixels + dst_top * dst_stride + dst_left;
    uint32_t *src_row = src->pixels + src_top * src_stride + src_left;

    for (int32_t line = dst_top; line <= dst_bottom; line++) {
        for (int32_t col = 0; col < span; col++) {
            uint32_t incoming = src_row[col];
            uint32_t existing = dst_row[col];
            /* Always store; the value is a select between old and new pixel. */
            dst_row[col] = incoming == color_key ? existing : incoming;
        }
        dst_row += dst_stride;
        src_row += src_stride;
    }
}

It's part 3. MSVC made me so mad, I can't even count gud anymore.