/* * Copyright (c) 2016 RISC OS Open Ltd * * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages * arising from the use of this software. * * Permission is granted to anyone to use this software for any purpose, * including commercial applications, and to alter it and redistribute it * freely, subject to the following restrictions: * * 1. The origin of this software must not be misrepresented; you must not * claim that you wrote the original software. If you use this software * in a product, an acknowledgment in the product documentation would be * appreciated but is not required. * 2. Altered source versions must be plainly marked as such, and must not be * misrepresented as being the original software. * 3. This notice may not be removed or altered from any source distribution. */ /* Prevent the stack from becoming executable */ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif .text .arch armv6 .object_arch armv4 .arm .altmacro .p2align 2 #include "pixman-arm-asm.h" #include "pixman-arm-simd-asm.h" /* A head macro should do all processing which results in an output of up to * 16 bytes, as far as the final load instruction. The corresponding tail macro * should complete the processing of the up-to-16 bytes. The calling macro will * sometimes choose to insert a preload or a decrement of X between them. * cond ARM condition code for code block * numbytes Number of output bytes that should be generated this time * firstreg First WK register in which to place output * unaligned_src Whether to use non-wordaligned loads of source image * unaligned_mask Whether to use non-wordaligned loads of mask image * preload If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output */ /******************************************************************************/ /* This differs from the over_8888_8888 routine in Pixman in that the destination * alpha component is always left unchanged, and RGB components are not * premultiplied by alpha. It differs from BlitRGBtoRGBPixelAlpha in that * renormalisation is done by multiplying by 257/256 (with rounding) rather than * simply shifting right by 8 bits - removing the need to special-case alpha=0xff. */ .macro RGBtoRGBPixelAlpha_init line_saved_regs STRIDE_S, ORIG_W mov MASK, #0x80 .endm .macro RGBtoRGBPixelAlpha_1pixel_translucent s, d, tmp0, tmp1, tmp2, tmp3, half uxtb tmp3, s uxtb tmp0, d sub tmp0, tmp3, tmp0 uxtb tmp3, s, ror #16 uxtb tmp1, d, ror #16 sub tmp1, tmp3, tmp1 uxtb tmp3, s, ror #8 mov s, s, lsr #24 uxtb tmp2, d, ror #8 sub tmp2, tmp3, tmp2 smlabb tmp0, tmp0, s, half smlabb tmp1, tmp1, s, half smlabb tmp2, tmp2, s, half add tmp0, tmp0, asr #8 add tmp1, tmp1, asr #8 add tmp2, tmp2, asr #8 pkhbt tmp0, tmp0, tmp1, lsl #16 and tmp2, tmp2, #0xff00 uxtb16 tmp0, tmp0, ror #8 orr tmp0, tmp0, tmp2 uadd8 d, d, tmp0 .endm .macro RGBtoRGBPixelAlpha_1pixel_opaque s, d and d, d, #0xff000000 bic s, s, #0xff000000 orr d, d, s .endm .macro RGBtoRGBPixelAlpha_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload .if numbytes == 16 ldm SRC!, {WK0, WK1} ldm SRC!, {STRIDE_S, STRIDE_M} ldrd WK2, WK3, [DST], #16 orr SCRATCH, WK0, WK1 and ORIG_W, WK0, WK1 orr SCRATCH, SCRATCH, STRIDE_S and ORIG_W, ORIG_W, STRIDE_S orr SCRATCH, SCRATCH, STRIDE_M and ORIG_W, ORIG_W, STRIDE_M tst SCRATCH, #0xff000000 .elseif numbytes == 8 ldm SRC!, {WK0, WK1} ldm DST!, {WK2, WK3} orr SCRATCH, WK0, WK1 and ORIG_W, WK0, WK1 tst SCRATCH, #0xff000000 .else // numbytes == 4 ldr WK0, [SRC], #4 ldr WK2, [DST], #4 tst WK0, #0xff000000 .endif .endm .macro RGBtoRGBPixelAlpha_process_tail cond, numbytes, firstreg beq 20f @ all transparent .if numbytes == 16 cmp ORIG_W, #0xff000000 bhs 10f @ all opaque RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK strd WK2, WK3, [DST, #-16] ldrd WK0, WK1, [SRC, #-8] ldrd WK2, WK3, [DST, #-8] RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK b 19f 10: RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2 RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3 strd WK2, WK3, [DST, #-16] ldrd WK0, WK1, [SRC, #-8] ldrd WK2, WK3, [DST, #-8] RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2 RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3 19: strd WK2, WK3, [DST, #-8] .elseif numbytes == 8 cmp ORIG_W, #0xff000000 bhs 10f @ all opaque RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK b 19f 10: RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2 RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3 19: strd WK2, WK3, [DST, #-8] .else // numbytes == 4 cmp WK0, #0xff000000 bhs 10f @ opaque RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK b 19f 10: RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2 19: str WK2, [DST, #-4] .endif 20: .endm generate_composite_function \ BlitRGBtoRGBPixelAlphaARMSIMDAsm, 32, 0, 32, \ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \ 2, /* prefetch distance */ \ RGBtoRGBPixelAlpha_init, \ nop_macro, /* newline */ \ nop_macro, /* cleanup */ \ RGBtoRGBPixelAlpha_process_head, \ RGBtoRGBPixelAlpha_process_tail /******************************************************************************/ .macro ARGBto565PixelAlpha_init line_saved_regs STRIDE_D, STRIDE_S, ORIG_W mov MASK, #0x001f mov STRIDE_M, #0x0010 orr MASK, MASK, MASK, lsl #16 orr STRIDE_M, STRIDE_M, STRIDE_M, lsl #16 .endm .macro ARGBto565PixelAlpha_newline mov STRIDE_S, #0x0200 .endm /* On entry: * s1 holds 1 32bpp source pixel * d holds 1 16bpp destination pixel * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively * other registers are temporaries * On exit: * Constant registers preserved */ .macro ARGBto565PixelAlpha_1pixel_translucent s, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc mov alpha, s, lsr #27 and misc, s, #0xfc00 and g, d, #0x07e0 pkhbt rb, d, d, lsl #5 rsb misc, g, misc, lsr #5 and s, rbmask, s, lsr #3 and rb, rbmask, rb sub s, s, rb smlabb misc, misc, alpha, ghalf mla s, s, alpha, rbhalf add misc, misc, misc, lsl #5 add g, g, misc, asr #10 add s, s, s, lsl #5 and g, g, #0x07e0 add rb, rb, s, asr #10 and rb, rb, rbmask pkhbt rb, rb, rb, lsl #11 orr d, rb, g orr d, d, rb, lsr #16 .endm /* On entry: * s1 holds 1 32bpp source pixel * d holds 1 16bpp destination pixel * rbmask holds 0x001f001f * On exit: * Constant registers preserved */ .macro ARGBto565PixelAlpha_1pixel_opaque s, d, rbmask and d, rbmask, s, lsr #3 and s, s, #0xfc00 orr d, d, d, lsr #5 orr d, d, s, lsr #5 .endm /* On entry: * s1, s2 hold 2 32bpp source pixels * d holds 2 16bpp destination pixels * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively * other registers are temporaries * On exit: * Constant registers preserved * Blended results have been written through destination pointer */ .macro ARGBto565PixelAlpha_2pixels_translucent s1, s2, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc mov alpha, s1, lsr #27 and misc, s1, #0xfc00 and g, d, #0x07e0 pkhbt rb, d, d, lsl #5 rsb misc, g, misc, lsr #5 and s1, rbmask, s1, lsr #3 and rb, rbmask, rb sub s1, s1, rb smlabb misc, misc, alpha, ghalf mla s1, s1, alpha, rbhalf uxth d, d, ror #16 add misc, misc, misc, lsl #5 mov alpha, s2, lsr #27 add g, g, misc, asr #10 add s1, s1, s1, lsl #5 and g, g, #0x07e0 add rb, rb, s1, asr #10 and rb, rb, rbmask and misc, s2, #0xfc00 pkhbt rb, rb, rb, lsl #11 and s1, d, #0x07e0 pkhbt d, d, d, lsl #5 rsb misc, s1, misc, lsr #5 and s2, rbmask, s2, lsr #3 and d, rbmask, d sub s2, s2, d smlabb misc, misc, alpha, ghalf mla s2, s2, alpha, rbhalf orr alpha, rb, g add misc, misc, misc, lsl #5 orr alpha, alpha, rb, lsr #16 add s1, s1, misc, asr #10 add s2, s2, s2, lsl #5 and s1, s1, #0x07e0 add d, d, s2, asr #10 and d, d, rbmask strh alpha, [DST, #-4] pkhbt d, d, d, lsl #11 orr alpha, d, s1 orr alpha, alpha, d, lsr #16 strh alpha, [DST, #-2] .endm /* On entry: * s1, s2 hold 2 32bpp source pixels * rbmask holds 0x001f001f * other registers are temporaries * On exit: * Constant registers preserved * Blended results have been written through destination pointer */ .macro ARGBto565PixelAlpha_2pixels_opaque s1, s2, d, rbmask, g and g, s1, #0xfc00 and d, rbmask, s1, lsr #3 and s1, rbmask, s2, lsr #3 orr d, d, d, lsr #5 orr d, d, g, lsr #5 and g, s2, #0xfc00 strh d, [DST, #-4] orr s1, s1, s1, lsr #5 orr s1, s1, g, lsr #5 strh s1, [DST, #-2] .endm .macro ARGBto565PixelAlpha_2pixels_head ldrd WK0, WK1, [SRC], #8 ldr WK2, [DST], #4 orr SCRATCH, WK0, WK1 and ORIG_W, WK0, WK1 tst SCRATCH, #0xff000000 .endm .macro ARGBto565PixelAlpha_2pixels_tail beq 20f @ all transparent cmp ORIG_W, #0xff000000 bhs 10f @ all opaque ARGBto565PixelAlpha_2pixels_translucent WK0, WK1, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W b 20f 10: ARGBto565PixelAlpha_2pixels_opaque WK0, WK1, WK2, MASK, SCRATCH 20: .endm .macro ARGBto565PixelAlpha_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload .if numbytes == 16 ARGBto565PixelAlpha_2pixels_head ARGBto565PixelAlpha_2pixels_tail ARGBto565PixelAlpha_2pixels_head ARGBto565PixelAlpha_2pixels_tail .endif .if numbytes >= 8 ARGBto565PixelAlpha_2pixels_head ARGBto565PixelAlpha_2pixels_tail .endif .if numbytes >= 4 ARGBto565PixelAlpha_2pixels_head .else // numbytes == 2 ldr WK0, [SRC], #4 ldrh WK2, [DST], #2 tst WK0, #0xff000000 .endif .endm .macro ARGBto565PixelAlpha_process_tail cond, numbytes, firstreg .if numbytes >= 4 ARGBto565PixelAlpha_2pixels_tail .else // numbytes == 2 beq 20f @ all transparent cmp WK0, #0xff000000 bhs 10f @ opaque ARGBto565PixelAlpha_1pixel_translucent WK0, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W b 19f 10: ARGBto565PixelAlpha_1pixel_opaque WK0, WK2, MASK 19: strh WK2, [DST, #-2] 20: .endif .endm generate_composite_function \ BlitARGBto565PixelAlphaARMSIMDAsm, 32, 0, 16, \ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \ 2, /* prefetch distance */ \ ARGBto565PixelAlpha_init, \ ARGBto565PixelAlpha_newline, \ nop_macro, /* cleanup */ \ ARGBto565PixelAlpha_process_head, \ ARGBto565PixelAlpha_process_tail /******************************************************************************/ .macro BGR888toRGB888_1pixel cond, reg, tmp uxtb16&cond tmp, WK®, ror #8 uxtb16&cond WK®, WK®, ror #16 orr&cond WK®, WK®, tmp, lsl #8 .endm .macro BGR888toRGB888_2pixels cond, reg1, reg2, tmp1, tmp2 uxtb16&cond tmp1, WK®1, ror #8 uxtb16&cond WK®1, WK®1, ror #16 uxtb16&cond tmp2, WK®2, ror #8 uxtb16&cond WK®2, WK®2, ror #16 orr&cond WK®1, WK®1, tmp1, lsl #8 orr&cond WK®2, WK®2, tmp2, lsl #8 .endm .macro BGR888toRGB888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload pixld cond, numbytes, firstreg, SRC, unaligned_src .endm .macro BGR888toRGB888_process_tail cond, numbytes, firstreg .if numbytes >= 8 BGR888toRGB888_2pixels cond, %(firstreg+0), %(firstreg+1), MASK, STRIDE_M .if numbytes == 16 BGR888toRGB888_2pixels cond, %(firstreg+2), %(firstreg+3), MASK, STRIDE_M .endif .else @ numbytes == 4 BGR888toRGB888_1pixel cond, %(firstreg+0), MASK .endif .endm generate_composite_function \ Blit_BGR888_RGB888ARMSIMDAsm, 32, 0, 32, \ FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \ 2, /* prefetch distance */ \ nop_macro, /* init */ \ nop_macro, /* newline */ \ nop_macro, /* cleanup */ \ BGR888toRGB888_process_head, \ BGR888toRGB888_process_tail