Seriously though, MSVC is really not good at auto-vectorizing trivial loops. It fails to vectorize both `r96_rect()` and `blit()`, yet it generates a pretty good vectorized version of `blit_keyed()`, which ends up beating `r96_rect()`. WTF
https://www.godbolt.org/z/s6daoGrWY

Compiler Explorer - C
/*
 * A rectangular image made of 32-bit pixel values.
 *
 * The drawing functions in this file address the pixel at column x, row y as
 * pixels[y * width + x], i.e. rows are stored one after another with `width`
 * values per row.
 */
typedef struct r96_image {
    int32_t width;    /* number of pixel columns */
    int32_t height;   /* number of pixel rows */
    uint32_t *pixels; /* pixel storage, indexed as described above */
} r96_image;
/*
 * Fills an axis-aligned rectangle of `image` with `color`, clipping the
 * rectangle against the image bounds first.
 *
 * image:  target image; dereferenced unconditionally, so it must not be NULL.
 *         Pixels are addressed as pixels[y * image->width + x].
 * x1, y1: column and row of the rectangle's top-left corner; may be negative
 *         or lie outside the image.
 * width:  rectangle width in pixels; nothing is drawn when <= 0.
 * height: rectangle height in pixels; nothing is drawn when <= 0.
 * color:  value stored into every covered pixel.
 */
void r96_rect(r96_image *image, int32_t x1, int32_t y1, int32_t width, int32_t height, uint32_t color) {
    if (width <= 0 || height <= 0) return;
    if (x1 >= image->width || y1 >= image->height) return;

    /*
     * Inclusive bottom-right corner, computed in 64 bits. In 32 bits
     * `x1 + width - 1` overflows (undefined behaviour) as soon as a caller
     * passes a very large extent such as INT32_MAX to mean "up to the edge".
     */
    int64_t x2 = (int64_t) x1 + width - 1;
    int64_t y2 = (int64_t) y1 + height - 1;
    if (x2 < 0 || y2 < 0) return;

    /* Clamp both corners into the image. */
    if (x1 < 0) x1 = 0;
    if (y1 < 0) y1 = 0;
    if (x2 >= image->width) x2 = image->width - 1;
    if (y2 >= image->height) y2 = image->height - 1;

    /* After clamping both extents are bounded by the image size and fit in 32 bits. */
    int32_t clipped_width = (int32_t) (x2 - x1 + 1);
    int32_t clipped_height = (int32_t) (y2 - y1 + 1);

    /* Widen the row offset so y1 * width cannot overflow int32_t on large images. */
    uint32_t *row = image->pixels + ((int64_t) y1 * image->width + x1);
    for (int32_t y = 0; y < clipped_height; y++) {
        for (int32_t i = 0; i < clipped_width; i++) {
            row[i] = color;
        }
        row += image->width;
    }
}
/*
 * Copies all of `src` into `dst` with the source's top-left pixel placed at
 * column x, row y of the destination. Parts of the source that would land
 * outside `dst` are clipped away.
 *
 * dst:  destination image; dereferenced unconditionally.
 * src:  source image; dereferenced unconditionally.
 * x, y: destination position of the source's top-left pixel; may be negative.
 */
void blit(r96_image *dst, r96_image *src, int x, int y) {
    /* Inclusive destination rectangle the unclipped source would cover. */
    int32_t left = x;
    int32_t top = y;
    int32_t right = x + src->width - 1;
    int32_t bottom = y + src->height - 1;

    /* Entirely outside the destination: nothing to copy. */
    if (left >= dst->width || right < 0 || top >= dst->height || bottom < 0) return;

    /* Columns/rows cut off at the left/top edge must be skipped in the source too. */
    int32_t skip_x = 0;
    int32_t skip_y = 0;
    if (left < 0) {
        skip_x = -left;
        left = 0;
    }
    if (top < 0) {
        skip_y = -top;
        top = 0;
    }
    if (right >= dst->width) right = dst->width - 1;
    if (bottom >= dst->height) bottom = dst->height - 1;

    int32_t span = right - left + 1;
    int32_t rows = bottom - top + 1;
    int32_t dst_stride = dst->width;
    int32_t src_stride = src->width;

    uint32_t *dst_row = dst->pixels + top * dst_stride + left;
    uint32_t *src_row = src->pixels + skip_y * src_stride + skip_x;

    /* Copy row by row, left to right within each row. */
    for (int32_t row = 0; row < rows; row++) {
        for (int32_t col = 0; col < span; col++) {
            dst_row[col] = src_row[col];
        }
        dst_row += dst_stride;
        src_row += src_stride;
    }
}
/*
 * Copies `src` into `dst` at column x, row y like a plain blit, except that
 * source pixels equal to `color_key` leave the destination pixel unchanged.
 * Parts of the source that would land outside `dst` are clipped away.
 *
 * dst:       destination image; dereferenced unconditionally.
 * src:       source image; dereferenced unconditionally.
 * x, y:      destination position of the source's top-left pixel; may be negative.
 * color_key: source pixel value that is treated as transparent.
 */
void blit_keyed(r96_image *dst, r96_image *src, int x, int y, uint32_t color_key) {
    /* Inclusive destination rectangle the unclipped source would cover. */
    int32_t left = x;
    int32_t top = y;
    int32_t right = x + src->width - 1;
    int32_t bottom = y + src->height - 1;

    /* Entirely outside the destination: nothing to copy. */
    if (left >= dst->width || right < 0 || top >= dst->height || bottom < 0) return;

    /* Columns/rows cut off at the left/top edge must be skipped in the source too. */
    int32_t skip_x = 0;
    int32_t skip_y = 0;
    if (left < 0) {
        skip_x = -left;
        left = 0;
    }
    if (top < 0) {
        skip_y = -top;
        top = 0;
    }
    if (right >= dst->width) right = dst->width - 1;
    if (bottom >= dst->height) bottom = dst->height - 1;

    int32_t span = right - left + 1;
    int32_t rows = bottom - top + 1;
    int32_t dst_stride = dst->width;
    int32_t src_stride = src->width;

    uint32_t *dst_row = dst->pixels + top * dst_stride + left;
    uint32_t *src_row = src->pixels + skip_y * src_stride + skip_x;

    for (int32_t row = 0; row < rows; row++) {
        for (int32_t col = 0; col < span; col++) {
            /*
             * Read both pixels, select, and always store: every destination
             * pixel in the span is written, so the loop body has no
             * conditional store.
             */
            uint32_t incoming = src_row[col];
            uint32_t existing = dst_row[col];
            dst_row[col] = incoming == color_key ? existing : incoming;
        }
        dst_row += dst_stride;
        src_row += src_stride;
    }
}