ARM: SIMD assembly optimization for BGR-to-RGB 32bpp normal blits

This commit is contained in:
Ben Avison 2019-10-24 21:15:21 -04:00
parent 8425d9d5d0
commit 7ac733f025
2 changed files with 66 additions and 2 deletions

View File

@ -41,7 +41,8 @@
enum blit_features {
BLIT_FEATURE_HAS_MMX = 1,
BLIT_FEATURE_HAS_ALTIVEC = 2,
BLIT_FEATURE_ALTIVEC_DONT_USE_PREFETCH = 4
BLIT_FEATURE_ALTIVEC_DONT_USE_PREFETCH = 4,
BLIT_FEATURE_HAS_ARM_SIMD = 8
};
#if SDL_ALTIVEC_BLITTERS
@ -931,7 +932,24 @@ GetBlitFeatures(void)
#endif
#else
/* Feature 1 is has-MMX */
#define GetBlitFeatures() (SDL_HasMMX() ? BLIT_FEATURE_HAS_MMX : 0)
#define GetBlitFeatures() ((SDL_HasMMX() ? BLIT_FEATURE_HAS_MMX : 0) | (SDL_HasARMSIMD() ? BLIT_FEATURE_HAS_ARM_SIMD : 0))
#endif
#if SDL_ARM_SIMD_BLITTERS
void Blit_BGR888_RGB888ARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
static void
Blit_BGR888_RGB888ARMSIMD(SDL_BlitInfo * info)
{
int32_t width = info->dst_w;
int32_t height = info->dst_h;
uint32_t *dstp = (uint32_t *)info->dst;
int32_t dststride = width + (info->dst_skip >> 2);
uint32_t *srcp = (uint32_t *)info->src;
int32_t srcstride = width + (info->src_skip >> 2);
Blit_BGR888_RGB888ARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
}
#endif
/* This is now endian dependent */
@ -3269,6 +3287,10 @@ static const struct blit_table normal_blit_4[] = {
/* has-altivec */
{0x00000000, 0x00000000, 0x00000000, 2, 0x0000F800, 0x000007E0, 0x0000001F,
BLIT_FEATURE_HAS_ALTIVEC, Blit_RGB888_RGB565Altivec, NO_ALPHA},
#endif
#if SDL_ARM_SIMD_BLITTERS
{0x000000FF, 0x0000FF00, 0x00FF0000, 4, 0x00FF0000, 0x0000FF00, 0x000000FF,
BLIT_FEATURE_HAS_ARM_SIMD, Blit_BGR888_RGB888ARMSIMD, NO_ALPHA | COPY_ALPHA },
#endif
/* 4->3 with same rgb triplet */
{0x000000FF, 0x0000FF00, 0x00FF0000, 3, 0x000000FF, 0x0000FF00, 0x00FF0000,

View File

@ -363,3 +363,45 @@ generate_composite_function \
nop_macro, /* cleanup */ \
ARGBto565PixelAlpha_process_head, \
ARGBto565PixelAlpha_process_tail
/******************************************************************************/
.macro BGR888toRGB888_1pixel cond, reg, tmp
uxtb16&cond tmp, WK&reg, ror #8
uxtb16&cond WK&reg, WK&reg, ror #16
orr&cond WK&reg, WK&reg, tmp, lsl #8
.endm
.macro BGR888toRGB888_2pixels cond, reg1, reg2, tmp1, tmp2
uxtb16&cond tmp1, WK&reg1, ror #8
uxtb16&cond WK&reg1, WK&reg1, ror #16
uxtb16&cond tmp2, WK&reg2, ror #8
uxtb16&cond WK&reg2, WK&reg2, ror #16
orr&cond WK&reg1, WK&reg1, tmp1, lsl #8
orr&cond WK&reg2, WK&reg2, tmp2, lsl #8
.endm
.macro BGR888toRGB888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
pixld cond, numbytes, firstreg, SRC, unaligned_src
.endm
.macro BGR888toRGB888_process_tail cond, numbytes, firstreg
.if numbytes >= 8
BGR888toRGB888_2pixels cond, %(firstreg+0), %(firstreg+1), MASK, STRIDE_M
.if numbytes == 16
BGR888toRGB888_2pixels cond, %(firstreg+2), %(firstreg+3), MASK, STRIDE_M
.endif
.else @ numbytes == 4
BGR888toRGB888_1pixel cond, %(firstreg+0), MASK
.endif
.endm
generate_composite_function \
Blit_BGR888_RGB888ARMSIMDAsm, 32, 0, 32, \
FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
2, /* prefetch distance */ \
nop_macro, /* init */ \
nop_macro, /* newline */ \
nop_macro, /* cleanup */ \
BGR888toRGB888_process_head, \
BGR888toRGB888_process_tail