ARM: NEON assembly optimization for function BlitARGBto565PixelAlpha

This commit is contained in:
Ben Avison 2019-10-24 21:17:38 -04:00
parent 2dfe060564
commit 1187b013a5
2 changed files with 114 additions and 4 deletions

View File

@ -422,6 +422,21 @@ BlitRGBtoRGBPixelAlphaARMSIMD(SDL_BlitInfo * info)
#endif
#if SDL_ARM_NEON_BLITTERS
void BlitARGBto565PixelAlphaARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);

/*
 * C-side adapter for the NEON assembly ARGB -> 565 per-pixel alpha blitter.
 *
 * Unpacks the blit description in `info` and forwards it to
 * BlitARGBto565PixelAlphaARMNEONAsm.  The destination is addressed as 16-bit
 * pixels and the source as 32-bit pixels.
 *
 * Each stride handed to the assembly routine is the blit width plus the
 * corresponding skip value shifted right by 1 (destination) or 2 (source).
 * NOTE(review): that shift matches converting a byte count into a count of
 * 2-byte / 4-byte pixels, so the skips are presumably expressed in bytes -
 * confirm against the SDL_BlitInfo definition.
 */
static void
BlitARGBto565PixelAlphaARMNEON(SDL_BlitInfo * info)
{
    const int32_t w = info->dst_w;

    BlitARGBto565PixelAlphaARMNEONAsm(w,
                                      info->dst_h,
                                      (uint16_t *)info->dst,
                                      w + (info->dst_skip >> 1),
                                      (uint32_t *)info->src,
                                      w + (info->src_skip >> 2));
}
void BlitRGBtoRGBPixelAlphaARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
static void
@ -1333,14 +1348,21 @@ SDL_CalculateBlitA(SDL_Surface * surface)
}
case 2:
#if SDL_ARM_SIMD_BLITTERS
#if SDL_ARM_NEON_BLITTERS || SDL_ARM_SIMD_BLITTERS
if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
&& sf->Gmask == 0xff00 && df->Gmask == 0x7e0
&& ((sf->Rmask == 0xff && df->Rmask == 0x1f)
|| (sf->Bmask == 0xff && df->Bmask == 0x1f))
&& SDL_HasARMSIMD())
|| (sf->Bmask == 0xff && df->Bmask == 0x1f)))
{
#if SDL_ARM_NEON_BLITTERS
if (SDL_HasNEON())
return BlitARGBto565PixelAlphaARMNEON;
#endif
#if SDL_ARM_SIMD_BLITTERS
if (SDL_HasARMSIMD())
return BlitARGBto565PixelAlphaARMSIMD;
else
#endif
}
#endif
if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
&& sf->Gmask == 0xff00

View File

@ -157,3 +157,91 @@ generate_composite_function \
RGBtoRGBPixelAlpha_process_pixblock_head, \
RGBtoRGBPixelAlpha_process_pixblock_tail, \
RGBtoRGBPixelAlpha_process_pixblock_tail_head
/******************************************************************************/
/*
 * Head stage of the ARGB -> 565 per-pixel alpha blend for one pixel block.
 *
 * Reduces every channel to its 565 field width (5, 6 or 5 bits), reduces the
 * alpha and its complement to 5 bits, and accumulates
 *     source_field * alpha5 + destination_field * inverse_alpha5
 * as 16-bit lanes in q13 (6-bit middle field), q14 (low 5-bit field) and
 * q15 (high 5-bit field).  Normalisation and repacking are left to the
 * matching tail macro.
 *
 * NOTE(review): register roles are inferred from the loads in the tail_head
 * macro - d0-d3 are taken to be the de-interleaved source bytes 0-3 (d3 being
 * alpha) and q2 (d4-d5) the 16-bit destination pixels.  Confirm against the
 * generate_composite_function conventions.
 */
.macro ARGBto565PixelAlpha_process_pixblock_head
/* d6 = bitwise complement of alpha, i.e. 255 - alpha per 8-bit lane */
vmvn d6, d3
/* source byte 1 -> 6 bits; alpha and source byte 0 -> 5 bits */
vshr.u8 d1, #2
vshr.u8 d3, #3
vshr.u8 d0, #3
/* d7 = destination bits 3..10, d25 = destination bits 8..15 (narrowed to 8 bits) */
vshrn.u16 d7, q2, #3
vshrn.u16 d25, q2, #8
/* clear bits 5..7 of each destination pixel so the low byte holds only bits 0..4 */
vbic.i16 q2, #0xe0
/* inverse alpha -> 5 bits; d7 -> destination bits 5..10 (6-bit field) */
vshr.u8 d6, #3
vshr.u8 d7, #2
/* source byte 2 -> 5 bits */
vshr.u8 d2, #3
/* d24 = destination bits 0..4, d25 = destination bits 11..15 */
vmovn.u16 d24, q2
vshr.u8 d25, #3
/* q13 = src6 * alpha5 + dst6 * inverse_alpha5 (middle field) */
vmull.u8 q13, d1, d3
vmlal.u8 q13, d7, d6
/* q14 = same weighted sum for the low 5-bit field */
vmull.u8 q14, d0, d3
vmlal.u8 q14, d24, d6
/* q15 = same weighted sum for the high 5-bit field */
vmull.u8 q15, d2, d3
vmlal.u8 q15, d25, d6
.endm
/*
 * Tail stage of the ARGB -> 565 per-pixel alpha blend.
 *
 * Takes the weighted sums left in q13-q15 by the head macro, scales each one
 * back down to field width and packs the three fields into 16-bit pixels in
 * q14.
 *
 * The scaling computes (x + (x >> 5) + 16) >> 5 per lane, which approximates
 * x / 31; 31 is the sum of the two 5-bit weights used by the head macro
 * ((a >> 3) + (~a >> 3) == 31 for any 8-bit a).
 */
.macro ARGBto565PixelAlpha_process_pixblock_tail
/* x += x >> 5 */
vsra.u16 q13, #5
vsra.u16 q14, #5
vsra.u16 q15, #5
/* x = (x + 16) >> 5 (rounding shift) */
vrshr.u16 q13, #5
vrshr.u16 q14, #5
vrshr.u16 q15, #5
/* insert the 6-bit field at bit 5, keeping the low 5 bits already in q14 */
vsli.u16 q14, q13, #5
/* insert the high 5-bit field at bit 11, keeping the low 11 bits of q14 */
vsli.u16 q14, q15, #11
.endm
/*
 * Combined tail + head stage for the ARGB -> 565 per-pixel alpha blend.
 *
 * Contains the same NEON arithmetic as the tail macro followed by the same
 * arithmetic as the head macro, plus:
 *   - a load of the next source block into d0-d3 (vld4.8, post-incrementing SRC),
 *   - a load of the next destination block into d4-d5 (post-incrementing DST_R),
 *   - a store of the finished pixels in q14 (post-incrementing DST_W),
 *   - PF-prefixed instructions that advance PF_X / PF_CTL and issue pld /
 *     ldrb touches on PF_SRC and PF_DST.
 *
 * The instruction order is hand-scheduled; do not reorder.  In particular the
 * destination load into d4-d5 must stay ahead of the head arithmetic that
 * reads q2, and the store of q14 must stay ahead of the vmull that overwrites
 * q14.
 *
 * NOTE(review): PF, PF_X, PF_CTL, PF_SRC, PF_DST, ORIG_W, DUMMY, the stride
 * symbols and the *_bpp_shift values are defined outside this view;
 * presumably PF emits its instruction only when prefetching is enabled -
 * confirm against the macro header.
 */
.macro ARGBto565PixelAlpha_process_pixblock_tail_head
/* load the next block of source pixels, de-interleaved by byte into d0-d3 */
vld4.8 {d0-d3}, [SRC]!
PF add PF_X, PF_X, #8
/* --- tail arithmetic for the previous block: x += x >> 5 --- */
vsra.u16 q13, #5
PF tst PF_CTL, #0xF
vsra.u16 q14, #5
PF addne PF_X, PF_X, #8
vsra.u16 q15, #5
PF subne PF_CTL, PF_CTL, #1
/* x = (x + 16) >> 5 (rounding shift) */
vrshr.u16 q13, #5
PF cmp PF_X, ORIG_W
vrshr.u16 q14, #5
PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
vrshr.u16 q15, #5
PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
/* load the next block of destination pixels into q2 */
vld1.8 {d4-d5}, [DST_R]!
PF subge PF_X, PF_X, ORIG_W
/* pack the three fields of the previous block into q14 */
vsli.u16 q14, q13, #5
PF subges PF_CTL, PF_CTL, #0x10
vsli.u16 q14, q15, #11
PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
/* store the finished pixels of the previous block */
vst1.8 {q14}, [DST_W :128]!
/* --- head arithmetic for the block just loaded --- */
vmvn d6, d3
vshr.u8 d1, #2
vshr.u8 d3, #3
vshr.u8 d0, #3
vshrn.u16 d7, q2, #3
vshrn.u16 d25, q2, #8
vbic.i16 q2, #0xe0
PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
vshr.u8 d6, #3
vshr.u8 d7, #2
vshr.u8 d2, #3
vmovn.u16 d24, q2
vshr.u8 d25, #3
/* weighted sums: source_field * alpha5 + destination_field * inverse_alpha5 */
vmull.u8 q13, d1, d3
vmlal.u8 q13, d7, d6
vmull.u8 q14, d0, d3
vmlal.u8 q14, d24, d6
vmull.u8 q15, d2, d3
vmlal.u8 q15, d25, d6
.endm
/*
 * Instantiates the BlitARGBto565PixelAlphaARMNEONAsm entry point from the
 * three ARGBto565PixelAlpha_process_pixblock_* macros.
 *
 * The destination is both read and written (FLAG_DST_READWRITE) and 32bpp
 * data is de-interleaved on load (FLAG_DEINTERLEAVE_32BPP).
 * NOTE(review): the numeric arguments 32, 0, 16 are presumably the source,
 * mask and destination bits per pixel (0 meaning "no mask") - confirm against
 * the generate_composite_function definition, which is outside this view.
 */
generate_composite_function \
BlitARGBto565PixelAlphaARMNEONAsm, 32, 0, 16, \
FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
8, /* number of pixels, processed in a single block */ \
6, /* prefetch distance */ \
default_init, \
default_cleanup, \
ARGBto565PixelAlpha_process_pixblock_head, \
ARGBto565PixelAlpha_process_pixblock_tail, \
ARGBto565PixelAlpha_process_pixblock_tail_head