ARM: NEON assembly optimization for SDL_FillRect

2025-04-03 12:32:45 +02:00 · 2019-10-24 21:17:52 -04:00 · 2019-10-24 21:17:52 -04:00 · 72f8044a42
commit 72f8044a42
parent 1187b013a5
2 changed files with 149 additions and 0 deletions
--- a/src/video/SDL_fillrect.c
+++ b/src/video/SDL_fillrect.c
@ -281,6 +281,27 @@ SDL_FillRects(SDL_Surface * dst, const SDL_Rect * rects, int count,
        return SDL_SetError("SDL_FillRects() passed NULL rects");
    }
 #if SDL_ARM_NEON_BLITTERS
    if (SDL_HasNEON() && dst->format->BytesPerPixel != 3) {
        void FillRect8ARMNEONAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
        void FillRect16ARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint16_t src);
        void FillRect32ARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t src);
        switch (dst->format->BytesPerPixel) {
        case 1:
            FillRect8ARMNEONAsm(rect->w, rect->h, (uint8_t *) pixels, dst->pitch >> 0, color);
            break;
        case 2:
            FillRect16ARMNEONAsm(rect->w, rect->h, (uint16_t *) pixels, dst->pitch >> 1, color);
            break;
        case 4:
            FillRect32ARMNEONAsm(rect->w, rect->h, (uint32_t *) pixels, dst->pitch >> 2, color);
            break;
        }
        SDL_UnlockSurface(dst);
        return(0);
    }
 #endif
 #if SDL_ARM_SIMD_BLITTERS
    if (SDL_HasARMSIMD() && dst->format->BytesPerPixel != 3) {
        void FillRect8ARMSIMDAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
--- a/src/video/arm/pixman-arm-neon-asm.S
+++ b/src/video/arm/pixman-arm-neon-asm.S
@ -95,6 +95,134 @@
 /******************************************************************************/
 /* We can actually do significantly better than the Pixman macros, at least for
 * the case of fills, by using a carefully scheduled inner loop. Cortex-A53
 * shows an improvement of up to 78% in ideal cases (large fills to L1 cache).
 */
 .macro generate_fillrect_function name, bpp, log2Bpp
 /*
 * Emit a function that fills a rectangle of w x h pixels with a single
 * pixel value.
 *   name    = symbol of the generated function
 *   bpp     = bits per pixel (instantiated below for 8, 16 and 32)
 *   log2Bpp = log2 of the bytes per pixel (0, 1 or 2)
 *
 * void name(int32_t w, int32_t h, uintN_t *dst, int32_t dst_stride, uintN_t src);
 * (N = bpp)
 * On entry:
 * a1 = width, pixels
 * a2 = height, rows (expected to be >= 1: both row loops only test the
 *      counter after a row has been processed)
 * a3 = pointer to top-left destination pixel
 * a4 = stride, pixels
 * [sp] = pixel value to fill with
 * Within the function:
 * a4 = stride - width, pixels (distance from end of one row to start of next)
 * v1 = width remaining
 * v2 = vst offset
 * v3 = alternate pointer
 * ip = data ARM register
 * q0-q1 = fill value replicated into every lane
 *
 * NOTE(review): the bare uses of the macro arguments "name" and "bpp"
 * (without a backslash) presumably rely on .altmacro being enabled earlier
 * in this file - confirm before moving this macro elsewhere.
 */
 pixman_asm_function name
    vld1.\bpp   {d0[],d1[]}, [sp]    /* must happen before the push moves sp */
    sub         a4, a1
    vld1.\bpp   {d2[],d3[]}, [sp]
    /* Rows of 79 bytes or more (rounded down to whole pixels) use the long-row
     * path: for a pixel-aligned pointer, at least one full 64-byte inner-loop
     * pass then remains after the alignment stores. */
    cmp         a1, #(15+64) >> \log2Bpp
    push        {v1-v3,lr}           /* push and vmov leave the flags from cmp intact */
    vmov        ip, s0
    blo         51f
    /* Long-row case */
    mov         v2, #64
 1:  mov         v1, a1
    ands        v3, a3, #15
    beq         2f
    /* Leading pixels */
    rsb         v3, v3, #16  /* number of leading bytes until 16-byte aligned */
    sub         v1, v1, v3, lsr #\log2Bpp
    rbit        v3, v3       /* bits 0..3 of the byte count now sit in bits 31..28 */
 .if bpp <= 16
 .if bpp == 8
    tst         a3, #1       /* bit 0 unaffected by rsb so can avoid register interlock */
    strneb      ip, [a3], #1
    tst         v3, #1<<30   /* bit 1 of the leading byte count */
 .else
    tst         a3, #2       /* bit 1 unaffected by rsb (assuming halfword alignment) so can avoid register interlock */
 .endif
    strneh      ip, [a3], #2
 .endif
    movs        v3, v3, lsl #3   /* C = "4 bytes needed", N = "8 bytes needed" */
    vstmcs      a3!, {s0}
    vstmmi      a3!, {d0}
 2:  sub         v1, v1, #64 >> \log2Bpp /* simplifies inner loop termination */
    add         v3, a3, #32
    /* Inner loop: 64 bytes per pass, as two interleaved 32-byte stores */
 3:  vst1.\bpp   {q0-q1}, [a3 :128], v2
    subs        v1, v1, #64 >> \log2Bpp
    vst1.\bpp   {q0-q1}, [v3 :128], v2
    bhs         3b
    /* Trailing pixels: fewer than 64 bytes remain; the low bits of v1 still
     * hold that count, and successive shifts move one bit into C and the
     * next into N to select the 32/16/8/4/2/1-byte stores. */
 4:  movs        v1, v1, lsl #27 + \log2Bpp
    bcc         5f
    vst1.\bpp   {q0-q1}, [a3 :128]!
 5:  bpl         6f
    vst1.\bpp   {q0}, [a3 :128]!
 6:  movs        v1, v1, lsl #2
    vstmcs      a3!, {d0}
    vstmmi      a3!, {s0}
 .if bpp <= 16
    movs        v1, v1, lsl #2
    strcsh      ip, [a3], #2
 .if bpp == 8
    strmib      ip, [a3], #1
 .endif
 .endif
    subs        a2, a2, #1
    add         a3, a3, a4, lsl #\log2Bpp
    bhi         1b
    pop         {v1-v3,pc}
    /* Short-row case */
 51: movs        v1, a1
 .if bpp == 8
    /* Store single bytes until the pointer is word aligned */
    tst         a3, #3
    beq         53f
 52: subs        v1, v1, #1
    blo         57f          /* row used up while aligning: go on to the next row */
    strb        ip, [a3], #1
    tst         a3, #3
    bne         52b
 .elseif bpp == 16
    /* Conditional on the flags from movs above, i.e. only when width != 0 */
    tstne       a3, #2
    subne       v1, v1, #1
    strneh      ip, [a3], #2
 .endif
 53: cmp         v1, #32 >> \log2Bpp
    bcc         54f
    vst1.\bpp   {q0-q1}, [a3]!
    sub         v1, v1, #32 >> \log2Bpp
    /* Trailing pixels: same bit-per-store-size scheme as in the long-row case */
 54: movs        v1, v1, lsl #27 + \log2Bpp
    bcc         55f
    vst1.\bpp   {q0-q1}, [a3]!
 55: bpl         56f
    vst1.\bpp   {q0}, [a3]!
 56: movs        v1, v1, lsl #2
    vstmcs      a3!, {d0}
    vstmmi      a3!, {s0}
 .if bpp <= 16
    movs        v1, v1, lsl #2
    strcsh      ip, [a3], #2
 .if bpp == 8
    strmib      ip, [a3], #1
 .endif
 .endif
    /* End of row. The 8 bpp alignment loop above also lands here when the
     * row is narrower than the distance to the next word boundary, so the
     * remaining rows are still filled rather than returning after one row. */
 57: subs        a2, a2, #1
    add         a3, a3, a4, lsl #\log2Bpp
    bhi         51b
    pop         {v1-v3,pc}
 .endfunc
 .endm
 generate_fillrect_function FillRect32ARMNEONAsm, 32, 2
 generate_fillrect_function FillRect16ARMNEONAsm, 16, 1
 generate_fillrect_function FillRect8ARMNEONAsm,  8,  0
 /******************************************************************************/
 .macro RGBtoRGBPixelAlpha_process_pixblock_head
    vmvn        d30, d3  /* get inverted source alpha */
    vmov        d31, d7  /* dest alpha is always unchanged */