# HG changeset patch # User mru # Date 1229475294 0 # Node ID 9281a8a9387acfe51c878dbe922906bfcbf4b62c # Parent c30b92cf446b62ad8e635d6c0c145f711585aea0 ARM: replace "armv4l" with "arm" diff -r c30b92cf446b -r 9281a8a9387a Makefile --- a/Makefile Wed Dec 17 00:39:45 2008 +0000 +++ b/Makefile Wed Dec 17 00:54:54 2008 +0000 @@ -429,29 +429,29 @@ alpha/mpegvideo_alpha.o \ alpha/simple_idct_alpha.o \ -OBJS-$(ARCH_ARMV4L) += armv4l/dsputil_arm.o \ - armv4l/dsputil_arm_s.o \ - armv4l/jrevdct_arm.o \ - armv4l/mpegvideo_arm.o \ - armv4l/simple_idct_arm.o \ +OBJS-$(ARCH_ARM) += arm/dsputil_arm.o \ + arm/dsputil_arm_s.o \ + arm/jrevdct_arm.o \ + arm/mpegvideo_arm.o \ + arm/simple_idct_arm.o \ -OBJS-$(HAVE_ARMV5TE) += armv4l/mpegvideo_armv5te.o \ - armv4l/mpegvideo_armv5te_s.o \ - armv4l/simple_idct_armv5te.o \ +OBJS-$(HAVE_ARMV5TE) += arm/mpegvideo_armv5te.o \ + arm/mpegvideo_armv5te_s.o \ + arm/simple_idct_armv5te.o \ -OBJS-$(HAVE_ARMV6) += armv4l/simple_idct_armv6.o \ +OBJS-$(HAVE_ARMV6) += arm/simple_idct_armv6.o \ -OBJS-$(HAVE_ARMVFP) += armv4l/dsputil_vfp.o \ - armv4l/float_arm_vfp.o \ +OBJS-$(HAVE_ARMVFP) += arm/dsputil_vfp.o \ + arm/float_arm_vfp.o \ -OBJS-$(HAVE_IWMMXT) += armv4l/dsputil_iwmmxt.o \ - armv4l/mpegvideo_iwmmxt.o \ +OBJS-$(HAVE_IWMMXT) += arm/dsputil_iwmmxt.o \ + arm/mpegvideo_iwmmxt.o \ -OBJS-$(HAVE_NEON) += armv4l/dsputil_neon.o \ - armv4l/dsputil_neon_s.o \ - armv4l/h264dsp_neon.o \ - armv4l/h264idct_neon.o \ - armv4l/simple_idct_neon.o \ +OBJS-$(HAVE_NEON) += arm/dsputil_neon.o \ + arm/dsputil_neon_s.o \ + arm/h264dsp_neon.o \ + arm/h264idct_neon.o \ + arm/simple_idct_neon.o \ OBJS-$(ARCH_BFIN) += bfin/dsputil_bfin.o \ bfin/fdct_bfin.o \ @@ -499,7 +499,7 @@ TESTS-$(ARCH_X86) += i386/cpuid-test$(EXESUF) motion-test$(EXESUF) CLEANFILES = apiexample$(EXESUF) -DIRS = alpha armv4l bfin i386 mlib ppc ps2 sh4 sparc +DIRS = alpha arm bfin i386 mlib ppc ps2 sh4 sparc include $(SUBDIR)../subdir.mak diff -r c30b92cf446b -r 9281a8a9387a arm/asm.S --- /dev/null 
Thu Jan 01 00:00:00 1970 +0000 +++ b/arm/asm.S Wed Dec 17 00:54:54 2008 +0000 @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + + .macro require8, val=1 + .eabi_attribute 24, \val + .endm + + .macro preserve8, val=1 + .eabi_attribute 25, \val + .endm + + .macro function name, export=0 +.if \export + .global \name +.endif + .type \name, %function + .func \name +\name: + .endm diff -r c30b92cf446b -r 9281a8a9387a arm/dsputil_arm.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arm/dsputil_arm.c Wed Dec 17 00:54:54 2008 +0000 @@ -0,0 +1,217 @@ +/* + * ARM optimized DSP utils + * Copyright (c) 2001 Lionel Ulmer. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/dsputil.h"
+#ifdef HAVE_IPP
+#include <ipp.h>
+#endif
+
+void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx);
+void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx);
+void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx);
+
+void j_rev_dct_ARM(DCTELEM *data);
+void simple_idct_ARM(DCTELEM *data);
+
+void simple_idct_armv5te(DCTELEM *data);
+void simple_idct_put_armv5te(uint8_t *dest, int line_size, DCTELEM *data);
+void simple_idct_add_armv5te(uint8_t *dest, int line_size, DCTELEM *data);
+
+void ff_simple_idct_armv6(DCTELEM *data);
+void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data);
+void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data);
+
+void ff_simple_idct_neon(DCTELEM *data);
+void ff_simple_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data);
+void ff_simple_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
+
+/* XXX: local hack */
+static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
+static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
+
+void put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+
+void put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size,
int h); + +void put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); + +void ff_prefetch_arm(void *mem, int stride, int h); + +CALL_2X_PIXELS(put_pixels16_x2_arm , put_pixels8_x2_arm , 8) +CALL_2X_PIXELS(put_pixels16_y2_arm , put_pixels8_y2_arm , 8) +CALL_2X_PIXELS(put_pixels16_xy2_arm, put_pixels8_xy2_arm, 8) +CALL_2X_PIXELS(put_no_rnd_pixels16_x2_arm , put_no_rnd_pixels8_x2_arm , 8) +CALL_2X_PIXELS(put_no_rnd_pixels16_y2_arm , put_no_rnd_pixels8_y2_arm , 8) +CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_arm, put_no_rnd_pixels8_xy2_arm, 8) + +void ff_add_pixels_clamped_ARM(short *block, unsigned char *dest, + int line_size); + +/* XXX: those functions should be suppressed ASAP when all IDCTs are + converted */ +static void j_rev_dct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block) +{ + j_rev_dct_ARM (block); + ff_put_pixels_clamped(block, dest, line_size); +} +static void j_rev_dct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block) +{ + j_rev_dct_ARM (block); + ff_add_pixels_clamped(block, dest, line_size); +} +static void simple_idct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block) +{ + simple_idct_ARM (block); + ff_put_pixels_clamped(block, dest, line_size); +} +static void simple_idct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block) +{ + simple_idct_ARM (block); + ff_add_pixels_clamped(block, dest, line_size); +} + +#ifdef HAVE_IPP +static void simple_idct_ipp(DCTELEM *block) +{ + ippiDCT8x8Inv_Video_16s_C1I(block); +} +static void simple_idct_ipp_put(uint8_t *dest, int line_size, DCTELEM *block) +{ + ippiDCT8x8Inv_Video_16s8u_C1R(block, dest, line_size); +} + +void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size); + +static void simple_idct_ipp_add(uint8_t *dest, int line_size, DCTELEM *block) +{ + ippiDCT8x8Inv_Video_16s_C1I(block); +#ifdef HAVE_IWMMXT + add_pixels_clamped_iwmmxt(block, dest, line_size); +#else + ff_add_pixels_clamped_ARM(block, dest, line_size); +#endif +} +#endif + +int 
mm_support(void) +{ + return ENABLE_IWMMXT * FF_MM_IWMMXT; +} + +void dsputil_init_arm(DSPContext* c, AVCodecContext *avctx) +{ + int idct_algo= avctx->idct_algo; + + ff_put_pixels_clamped = c->put_pixels_clamped; + ff_add_pixels_clamped = c->add_pixels_clamped; + + if (avctx->lowres == 0) { + if(idct_algo == FF_IDCT_AUTO){ +#if defined(HAVE_IPP) + idct_algo = FF_IDCT_IPP; +#elif defined(HAVE_NEON) + idct_algo = FF_IDCT_SIMPLENEON; +#elif defined(HAVE_ARMV6) + idct_algo = FF_IDCT_SIMPLEARMV6; +#elif defined(HAVE_ARMV5TE) + idct_algo = FF_IDCT_SIMPLEARMV5TE; +#else + idct_algo = FF_IDCT_ARM; +#endif + } + + if(idct_algo==FF_IDCT_ARM){ + c->idct_put= j_rev_dct_ARM_put; + c->idct_add= j_rev_dct_ARM_add; + c->idct = j_rev_dct_ARM; + c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; + } else if (idct_algo==FF_IDCT_SIMPLEARM){ + c->idct_put= simple_idct_ARM_put; + c->idct_add= simple_idct_ARM_add; + c->idct = simple_idct_ARM; + c->idct_permutation_type= FF_NO_IDCT_PERM; +#ifdef HAVE_ARMV6 + } else if (idct_algo==FF_IDCT_SIMPLEARMV6){ + c->idct_put= ff_simple_idct_put_armv6; + c->idct_add= ff_simple_idct_add_armv6; + c->idct = ff_simple_idct_armv6; + c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; +#endif +#ifdef HAVE_ARMV5TE + } else if (idct_algo==FF_IDCT_SIMPLEARMV5TE){ + c->idct_put= simple_idct_put_armv5te; + c->idct_add= simple_idct_add_armv5te; + c->idct = simple_idct_armv5te; + c->idct_permutation_type = FF_NO_IDCT_PERM; +#endif +#ifdef HAVE_IPP + } else if (idct_algo==FF_IDCT_IPP){ + c->idct_put= simple_idct_ipp_put; + c->idct_add= simple_idct_ipp_add; + c->idct = simple_idct_ipp; + c->idct_permutation_type= FF_NO_IDCT_PERM; +#endif +#ifdef HAVE_NEON + } else if (idct_algo==FF_IDCT_SIMPLENEON){ + c->idct_put= ff_simple_idct_put_neon; + c->idct_add= ff_simple_idct_add_neon; + c->idct = ff_simple_idct_neon; + c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM; +#endif + } + } + + c->put_pixels_tab[0][0] = put_pixels16_arm; + c->put_pixels_tab[0][1] = 
put_pixels16_x2_arm; + c->put_pixels_tab[0][2] = put_pixels16_y2_arm; + c->put_pixels_tab[0][3] = put_pixels16_xy2_arm; + c->put_no_rnd_pixels_tab[0][0] = put_pixels16_arm; + c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_arm; + c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_arm; + c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_arm; + c->put_pixels_tab[1][0] = put_pixels8_arm; + c->put_pixels_tab[1][1] = put_pixels8_x2_arm; + c->put_pixels_tab[1][2] = put_pixels8_y2_arm; + c->put_pixels_tab[1][3] = put_pixels8_xy2_arm; + c->put_no_rnd_pixels_tab[1][0] = put_pixels8_arm; + c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_arm; + c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_arm; + c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_arm; + +#ifdef HAVE_ARMV5TE + c->prefetch = ff_prefetch_arm; +#endif + +#ifdef HAVE_IWMMXT + dsputil_init_iwmmxt(c, avctx); +#endif +#ifdef HAVE_ARMVFP + ff_float_init_arm_vfp(c, avctx); +#endif +#ifdef HAVE_NEON + ff_dsputil_init_neon(c, avctx); +#endif +} diff -r c30b92cf446b -r 9281a8a9387a arm/dsputil_arm_s.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arm/dsputil_arm_s.S Wed Dec 17 00:54:54 2008 +0000 @@ -0,0 +1,799 @@ +@ +@ ARMv4 optimized DSP utils +@ Copyright (c) 2004 AGAWA Koji +@ +@ This file is part of FFmpeg. +@ +@ FFmpeg is free software; you can redistribute it and/or +@ modify it under the terms of the GNU Lesser General Public +@ License as published by the Free Software Foundation; either +@ version 2.1 of the License, or (at your option) any later version. +@ +@ FFmpeg is distributed in the hope that it will be useful, +@ but WITHOUT ANY WARRANTY; without even the implied warranty of +@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +@ Lesser General Public License for more details. 
+@ +@ You should have received a copy of the GNU Lesser General Public +@ License along with FFmpeg; if not, write to the Free Software +@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +@ + +#include "config.h" +#include "asm.S" + + preserve8 + +#ifndef HAVE_PLD +.macro pld reg +.endm +#endif + +#ifdef HAVE_ARMV5TE +function ff_prefetch_arm, export=1 + subs r2, r2, #1 + pld [r0] + add r0, r0, r1 + bne ff_prefetch_arm + bx lr + .endfunc +#endif + +.macro ADJ_ALIGN_QUADWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4 + mov \Rd0, \Rn0, lsr #(\shift * 8) + mov \Rd1, \Rn1, lsr #(\shift * 8) + mov \Rd2, \Rn2, lsr #(\shift * 8) + mov \Rd3, \Rn3, lsr #(\shift * 8) + orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8) + orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8) + orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8) + orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8) +.endm +.macro ADJ_ALIGN_DOUBLEWORD shift, R0, R1, R2 + mov \R0, \R0, lsr #(\shift * 8) + orr \R0, \R0, \R1, lsl #(32 - \shift * 8) + mov \R1, \R1, lsr #(\shift * 8) + orr \R1, \R1, \R2, lsl #(32 - \shift * 8) +.endm +.macro ADJ_ALIGN_DOUBLEWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2 + mov \Rdst0, \Rsrc0, lsr #(\shift * 8) + mov \Rdst1, \Rsrc1, lsr #(\shift * 8) + orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8)) + orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8)) +.endm + +.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask + @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1) + @ Rmask = 0xFEFEFEFE + @ Rn = destroy + eor \Rd0, \Rn0, \Rm0 + eor \Rd1, \Rn1, \Rm1 + orr \Rn0, \Rn0, \Rm0 + orr \Rn1, \Rn1, \Rm1 + and \Rd0, \Rd0, \Rmask + and \Rd1, \Rd1, \Rmask + sub \Rd0, \Rn0, \Rd0, lsr #1 + sub \Rd1, \Rn1, \Rd1, lsr #1 +.endm + +.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask + @ Rd = (Rn & Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1) + @ Rmask = 0xFEFEFEFE + @ Rn = destroy + eor \Rd0, \Rn0, \Rm0 + eor \Rd1, \Rn1, \Rm1 + and \Rn0, \Rn0, \Rm0 + and \Rn1, \Rn1, \Rm1 + and \Rd0, 
\Rd0, \Rmask + and \Rd1, \Rd1, \Rmask + add \Rd0, \Rn0, \Rd0, lsr #1 + add \Rd1, \Rn1, \Rd1, lsr #1 +.endm + +@ ---------------------------------------------------------------- + .align 8 +function put_pixels16_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + stmfd sp!, {r4-r11, lr} @ R14 is also called LR + adr r5, 5f + ands r4, r1, #3 + bic r1, r1, #3 + add r5, r5, r4, lsl #2 + ldrne pc, [r5] +1: + ldmia r1, {r4-r7} + add r1, r1, r2 + stmia r0, {r4-r7} + pld [r1] + subs r3, r3, #1 + add r0, r0, r2 + bne 1b + ldmfd sp!, {r4-r11, pc} + .align 8 +2: + ldmia r1, {r4-r8} + add r1, r1, r2 + ADJ_ALIGN_QUADWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8 + pld [r1] + subs r3, r3, #1 + stmia r0, {r9-r12} + add r0, r0, r2 + bne 2b + ldmfd sp!, {r4-r11, pc} + .align 8 +3: + ldmia r1, {r4-r8} + add r1, r1, r2 + ADJ_ALIGN_QUADWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8 + pld [r1] + subs r3, r3, #1 + stmia r0, {r9-r12} + add r0, r0, r2 + bne 3b + ldmfd sp!, {r4-r11, pc} + .align 8 +4: + ldmia r1, {r4-r8} + add r1, r1, r2 + ADJ_ALIGN_QUADWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8 + pld [r1] + subs r3, r3, #1 + stmia r0, {r9-r12} + add r0, r0, r2 + bne 4b + ldmfd sp!, {r4-r11,pc} + .align 8 +5: + .word 1b + .word 2b + .word 3b + .word 4b + .endfunc + +@ ---------------------------------------------------------------- + .align 8 +function put_pixels8_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + stmfd sp!, {r4-r5,lr} @ R14 is also called LR + adr r5, 5f + ands r4, r1, #3 + bic r1, r1, #3 + add r5, r5, r4, lsl #2 + ldrne pc, [r5] +1: + ldmia r1, {r4-r5} + add r1, r1, r2 + subs r3, r3, #1 + pld [r1] + stmia r0, {r4-r5} + add r0, r0, r2 + bne 1b + ldmfd sp!, {r4-r5,pc} + .align 8 +2: + ldmia r1, {r4-r5, r12} + add r1, r1, r2 + ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r12 + pld [r1] + subs 
r3, r3, #1 + stmia r0, {r4-r5} + add r0, r0, r2 + bne 2b + ldmfd sp!, {r4-r5,pc} + .align 8 +3: + ldmia r1, {r4-r5, r12} + add r1, r1, r2 + ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r12 + pld [r1] + subs r3, r3, #1 + stmia r0, {r4-r5} + add r0, r0, r2 + bne 3b + ldmfd sp!, {r4-r5,pc} + .align 8 +4: + ldmia r1, {r4-r5, r12} + add r1, r1, r2 + ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r12 + pld [r1] + subs r3, r3, #1 + stmia r0, {r4-r5} + add r0, r0, r2 + bne 4b + ldmfd sp!, {r4-r5,pc} + .align 8 +5: + .word 1b + .word 2b + .word 3b + .word 4b + .endfunc + +@ ---------------------------------------------------------------- + .align 8 +function put_pixels8_x2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + stmfd sp!, {r4-r10,lr} @ R14 is also called LR + adr r5, 5f + ands r4, r1, #3 + ldr r12, [r5] + add r5, r5, r4, lsl #2 + bic r1, r1, #3 + ldrne pc, [r5] +1: + ldmia r1, {r4-r5, r10} + add r1, r1, r2 + ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 + pld [r1] + RND_AVG32 r8, r9, r4, r5, r6, r7, r12 + subs r3, r3, #1 + stmia r0, {r8-r9} + add r0, r0, r2 + bne 1b + ldmfd sp!, {r4-r10,pc} + .align 8 +2: + ldmia r1, {r4-r5, r10} + add r1, r1, r2 + ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 + ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10 + pld [r1] + RND_AVG32 r4, r5, r6, r7, r8, r9, r12 + subs r3, r3, #1 + stmia r0, {r4-r5} + add r0, r0, r2 + bne 2b + ldmfd sp!, {r4-r10,pc} + .align 8 +3: + ldmia r1, {r4-r5, r10} + add r1, r1, r2 + ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10 + ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10 + pld [r1] + RND_AVG32 r4, r5, r6, r7, r8, r9, r12 + subs r3, r3, #1 + stmia r0, {r4-r5} + add r0, r0, r2 + bne 3b + ldmfd sp!, {r4-r10,pc} + .align 8 +4: + ldmia r1, {r4-r5, r10} + add r1, r1, r2 + ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10 + pld [r1] + RND_AVG32 r8, r9, r6, r7, r5, r10, r12 + subs r3, r3, #1 + stmia r0, {r8-r9} + add r0, r0, r2 + bne 4b + ldmfd sp!, 
{r4-r10,pc} @@ update PC with LR content. + .align 8 +5: + .word 0xFEFEFEFE + .word 2b + .word 3b + .word 4b + .endfunc + + .align 8 +function put_no_rnd_pixels8_x2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + stmfd sp!, {r4-r10,lr} @ R14 is also called LR + adr r5, 5f + ands r4, r1, #3 + ldr r12, [r5] + add r5, r5, r4, lsl #2 + bic r1, r1, #3 + ldrne pc, [r5] +1: + ldmia r1, {r4-r5, r10} + add r1, r1, r2 + ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 + pld [r1] + NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 + subs r3, r3, #1 + stmia r0, {r8-r9} + add r0, r0, r2 + bne 1b + ldmfd sp!, {r4-r10,pc} + .align 8 +2: + ldmia r1, {r4-r5, r10} + add r1, r1, r2 + ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 + ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10 + pld [r1] + NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 + subs r3, r3, #1 + stmia r0, {r4-r5} + add r0, r0, r2 + bne 2b + ldmfd sp!, {r4-r10,pc} + .align 8 +3: + ldmia r1, {r4-r5, r10} + add r1, r1, r2 + ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10 + ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10 + pld [r1] + NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 + subs r3, r3, #1 + stmia r0, {r4-r5} + add r0, r0, r2 + bne 3b + ldmfd sp!, {r4-r10,pc} + .align 8 +4: + ldmia r1, {r4-r5, r10} + add r1, r1, r2 + ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10 + pld [r1] + NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12 + subs r3, r3, #1 + stmia r0, {r8-r9} + add r0, r0, r2 + bne 4b + ldmfd sp!, {r4-r10,pc} @@ update PC with LR content. 
+ .align 8 +5: + .word 0xFEFEFEFE + .word 2b + .word 3b + .word 4b + .endfunc + + +@ ---------------------------------------------------------------- + .align 8 +function put_pixels8_y2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + stmfd sp!, {r4-r11,lr} @ R14 is also called LR + adr r5, 5f + ands r4, r1, #3 + mov r3, r3, lsr #1 + ldr r12, [r5] + add r5, r5, r4, lsl #2 + bic r1, r1, #3 + ldrne pc, [r5] +1: + ldmia r1, {r4-r5} + add r1, r1, r2 +6: ldmia r1, {r6-r7} + add r1, r1, r2 + pld [r1] + RND_AVG32 r8, r9, r4, r5, r6, r7, r12 + ldmia r1, {r4-r5} + add r1, r1, r2 + stmia r0, {r8-r9} + add r0, r0, r2 + pld [r1] + RND_AVG32 r8, r9, r6, r7, r4, r5, r12 + subs r3, r3, #1 + stmia r0, {r8-r9} + add r0, r0, r2 + bne 6b + ldmfd sp!, {r4-r11,pc} + .align 8 +2: + ldmia r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6 +6: ldmia r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9 + RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stmia r0, {r10-r11} + add r0, r0, r2 + ldmia r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6 + subs r3, r3, #1 + RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stmia r0, {r10-r11} + add r0, r0, r2 + bne 6b + ldmfd sp!, {r4-r11,pc} + .align 8 +3: + ldmia r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6 +6: ldmia r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9 + RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stmia r0, {r10-r11} + add r0, r0, r2 + ldmia r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6 + subs r3, r3, #1 + RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stmia r0, {r10-r11} + add r0, r0, r2 + bne 6b + ldmfd sp!, {r4-r11,pc} + .align 8 +4: + ldmia r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6 +6: ldmia r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 3, 
r7, r8, r9 + RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stmia r0, {r10-r11} + add r0, r0, r2 + ldmia r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6 + subs r3, r3, #1 + RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stmia r0, {r10-r11} + add r0, r0, r2 + bne 6b + ldmfd sp!, {r4-r11,pc} + + .align 8 +5: + .word 0xFEFEFEFE + .word 2b + .word 3b + .word 4b + .endfunc + + .align 8 +function put_no_rnd_pixels8_y2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + stmfd sp!, {r4-r11,lr} @ R14 is also called LR + adr r5, 5f + ands r4, r1, #3 + mov r3, r3, lsr #1 + ldr r12, [r5] + add r5, r5, r4, lsl #2 + bic r1, r1, #3 + ldrne pc, [r5] +1: + ldmia r1, {r4-r5} + add r1, r1, r2 +6: ldmia r1, {r6-r7} + add r1, r1, r2 + pld [r1] + NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 + ldmia r1, {r4-r5} + add r1, r1, r2 + stmia r0, {r8-r9} + add r0, r0, r2 + pld [r1] + NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12 + subs r3, r3, #1 + stmia r0, {r8-r9} + add r0, r0, r2 + bne 6b + ldmfd sp!, {r4-r11,pc} + .align 8 +2: + ldmia r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6 +6: ldmia r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9 + NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stmia r0, {r10-r11} + add r0, r0, r2 + ldmia r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6 + subs r3, r3, #1 + NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stmia r0, {r10-r11} + add r0, r0, r2 + bne 6b + ldmfd sp!, {r4-r11,pc} + .align 8 +3: + ldmia r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6 +6: ldmia r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9 + NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stmia r0, {r10-r11} + add r0, r0, r2 + ldmia r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6 + subs r3, r3, #1 + NO_RND_AVG32 r10, r11, 
r7, r8, r4, r5, r12 + stmia r0, {r10-r11} + add r0, r0, r2 + bne 6b + ldmfd sp!, {r4-r11,pc} + .align 8 +4: + ldmia r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6 +6: ldmia r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9 + NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stmia r0, {r10-r11} + add r0, r0, r2 + ldmia r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6 + subs r3, r3, #1 + NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stmia r0, {r10-r11} + add r0, r0, r2 + bne 6b + ldmfd sp!, {r4-r11,pc} + .align 8 +5: + .word 0xFEFEFEFE + .word 2b + .word 3b + .word 4b + .endfunc + +@ ---------------------------------------------------------------- +.macro RND_XY2_IT align + @ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202) + @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2) +.if \align == 0 + ldmia r1, {r6-r8} +.elseif \align == 3 + ldmia r1, {r5-r7} +.else + ldmia r1, {r8-r10} +.endif + add r1, r1, r2 + pld [r1] +.if \align == 0 + ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r6, r7, r8 +.elseif \align == 1 + ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r8, r9, r10 + ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r8, r9, r10 +.elseif \align == 2 + ADJ_ALIGN_DOUBLEWORD_D 2, r4, r5, r8, r9, r10 + ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r8, r9, r10 +.elseif \align == 3 + ADJ_ALIGN_DOUBLEWORD_D 3, r4, r5, r5, r6, r7 +.endif + ldr r14, [r12, #0] @ 0x03030303 + tst r3, #1 + and r8, r4, r14 + and r9, r5, r14 + and r10, r6, r14 + and r11, r7, r14 + ldreq r14, [r12, #16] @ 0x02020202/0x01010101 + add r8, r8, r10 + add r9, r9, r11 + addeq r8, r8, r14 + addeq r9, r9, r14 + ldr r14, [r12, #20] @ 0xFCFCFCFC >> 2 + and r4, r14, r4, lsr #2 + and r5, r14, r5, lsr #2 + and r6, r14, r6, lsr #2 + and r7, r14, r7, lsr #2 + add r10, r4, r6 + add r11, r5, r7 + subs r3, r3, #1 +.endm + +.macro RND_XY2_EXPAND align + RND_XY2_IT \align +6: stmfd sp!, {r8-r11} + RND_XY2_IT \align + ldmfd sp!, {r4-r7} + add r4, r4, r8 + add r5, r5, r9 + 
add r6, r6, r10 + add r7, r7, r11 + ldr r14, [r12, #24] @ 0x0F0F0F0F + and r4, r14, r4, lsr #2 + and r5, r14, r5, lsr #2 + add r4, r4, r6 + add r5, r5, r7 + stmia r0, {r4-r5} + add r0, r0, r2 + bge 6b + ldmfd sp!, {r4-r11,pc} +.endm + + .align 8 +function put_pixels8_xy2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + stmfd sp!, {r4-r11,lr} @ R14 is also called LR + adrl r12, 5f + ands r4, r1, #3 + add r5, r12, r4, lsl #2 + bic r1, r1, #3 + ldrne pc, [r5] +1: + RND_XY2_EXPAND 0 + + .align 8 +2: + RND_XY2_EXPAND 1 + + .align 8 +3: + RND_XY2_EXPAND 2 + + .align 8 +4: + RND_XY2_EXPAND 3 + +5: + .word 0x03030303 + .word 2b + .word 3b + .word 4b + .word 0x02020202 + .word 0xFCFCFCFC >> 2 + .word 0x0F0F0F0F + .endfunc + + .align 8 +function put_no_rnd_pixels8_xy2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + stmfd sp!, {r4-r11,lr} @ R14 is also called LR + adrl r12, 5f + ands r4, r1, #3 + add r5, r12, r4, lsl #2 + bic r1, r1, #3 + ldrne pc, [r5] +1: + RND_XY2_EXPAND 0 + + .align 8 +2: + RND_XY2_EXPAND 1 + + .align 8 +3: + RND_XY2_EXPAND 2 + + .align 8 +4: + RND_XY2_EXPAND 3 + +5: + .word 0x03030303 + .word 2b + .word 3b + .word 4b + .word 0x01010101 + .word 0xFCFCFCFC >> 2 + .word 0x0F0F0F0F + .endfunc + +@ void ff_add_pixels_clamped_ARM(int16_t *block, uint8_t *dest, int stride) +function ff_add_pixels_clamped_ARM, export=1 + push {r4-r10} + mov r10, #8 +1: + ldr r4, [r1] /* load dest */ + /* block[0] and block[1]*/ + ldrsh r5, [r0] + ldrsh r7, [r0, #2] + and r6, r4, #0xFF + and r8, r4, #0xFF00 + add r6, r5, r6 + add r8, r7, r8, lsr #8 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + movne r6, r5, lsr #24 + tst r8, #0x100 + movne r8, r7, lsr #24 + mov r9, r6 + ldrsh r5, [r0, #4] /* moved form [A] */ + orr r9, r9, r8, lsl #8 + /* block[2] and block[3] */ + /* [A] */ + ldrsh r7, 
[r0, #6] + and r6, r4, #0xFF0000 + and r8, r4, #0xFF000000 + add r6, r5, r6, lsr #16 + add r8, r7, r8, lsr #24 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + movne r6, r5, lsr #24 + tst r8, #0x100 + movne r8, r7, lsr #24 + orr r9, r9, r6, lsl #16 + ldr r4, [r1, #4] /* moved form [B] */ + orr r9, r9, r8, lsl #24 + /* store dest */ + ldrsh r5, [r0, #8] /* moved form [C] */ + str r9, [r1] + + /* load dest */ + /* [B] */ + /* block[4] and block[5] */ + /* [C] */ + ldrsh r7, [r0, #10] + and r6, r4, #0xFF + and r8, r4, #0xFF00 + add r6, r5, r6 + add r8, r7, r8, lsr #8 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + movne r6, r5, lsr #24 + tst r8, #0x100 + movne r8, r7, lsr #24 + mov r9, r6 + ldrsh r5, [r0, #12] /* moved from [D] */ + orr r9, r9, r8, lsl #8 + /* block[6] and block[7] */ + /* [D] */ + ldrsh r7, [r0, #14] + and r6, r4, #0xFF0000 + and r8, r4, #0xFF000000 + add r6, r5, r6, lsr #16 + add r8, r7, r8, lsr #24 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + movne r6, r5, lsr #24 + tst r8, #0x100 + movne r8, r7, lsr #24 + orr r9, r9, r6, lsl #16 + add r0, r0, #16 /* moved from [E] */ + orr r9, r9, r8, lsl #24 + subs r10, r10, #1 /* moved from [F] */ + /* store dest */ + str r9, [r1, #4] + + /* [E] */ + /* [F] */ + add r1, r1, r2 + bne 1b + + pop {r4-r10} + bx lr + .endfunc diff -r c30b92cf446b -r 9281a8a9387a arm/dsputil_iwmmxt.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arm/dsputil_iwmmxt.c Wed Dec 17 00:54:54 2008 +0000 @@ -0,0 +1,205 @@ +/* + * iWMMXt optimized DSP utils + * Copyright (c) 2004 AGAWA Koji + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/dsputil.h"
+
+#define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt
+#define SET_RND(regd) __asm__ volatile ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12");
+#define WAVG2B "wavg2b"
+#include "dsputil_iwmmxt_rnd_template.c"
+#undef DEF
+#undef SET_RND
+#undef WAVG2B
+
+#define DEF(x, y) x ## _ ## y ##_iwmmxt
+#define SET_RND(regd) __asm__ volatile ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12");
+#define WAVG2B "wavg2br"
+#include "dsputil_iwmmxt_rnd_template.c"
+#undef DEF
+#undef SET_RND
+#undef WAVG2B
+
+// need scheduling
+#define OP(AVG) \
+    __asm__ volatile ( \
+        /* alignment */ \
+        "and r12, %[pixels], #7 \n\t" \
+        "bic %[pixels], %[pixels], #7 \n\t" \
+        "tmcr wcgr1, r12 \n\t" \
+        \
+        "wldrd wr0, [%[pixels]] \n\t" \
+        "wldrd wr1, [%[pixels], #8] \n\t" \
+        "add %[pixels], %[pixels], %[line_size] \n\t" \
+        "walignr1 wr4, wr0, wr1 \n\t" \
+        \
+        "1: \n\t" \
+        \
+        "wldrd wr2, [%[pixels]] \n\t" \
+        "wldrd wr3, [%[pixels], #8] \n\t" \
+        "add %[pixels], %[pixels], %[line_size] \n\t" \
+        "pld [%[pixels]] \n\t" \
+        "walignr1 wr5, wr2, wr3 \n\t" \
+        AVG " wr6, wr4, wr5 \n\t" \
+        "wstrd wr6, [%[block]] \n\t" \
+        "add %[block], %[block], %[line_size] \n\t" \
+        \
+        "wldrd wr0, [%[pixels]] \n\t" \
+        "wldrd wr1, [%[pixels], #8] \n\t" \
+        "add %[pixels], %[pixels], %[line_size] \n\t" \
+        "walignr1 wr4, wr0, wr1 \n\t" \
+        "pld [%[pixels]] \n\t" \
+        AVG " wr6, wr4, wr5 \n\t" \
+        "wstrd wr6, [%[block]] \n\t" \
+        "add %[block], %[block], %[line_size] \n\t" \
+        \
+        "subs %[h], %[h], #2 \n\t" \
+        "bne 1b \n\t" \
+        : [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h) \
+        : [line_size]"r"(line_size) \
+        : "cc", "memory", "r12"); /* "cc": the subs above rewrites the condition flags */
+void put_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+    OP("wavg2br");
+}
+void put_no_rnd_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+    OP("wavg2b");
+}
+#undef OP
+
+void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size)
+{
+    uint8_t *pixels2 = pixels + line_size;
+
+    __asm__ volatile (
+        "mov r12, #4 \n\t" /* 4 iterations; each handles one row at pixels and one at pixels2 */
+        "1: \n\t"
+        "pld [%[pixels], %[line_size2]] \n\t"
+        "pld [%[pixels2], %[line_size2]] \n\t"
+        "wldrd wr4, [%[pixels]] \n\t"
+        "wldrd wr5, [%[pixels2]] \n\t"
+        "pld [%[block], #32] \n\t"
+        "wunpckelub wr6, wr4 \n\t"
+        "wldrd wr0, [%[block]] \n\t"
+        "wunpckehub wr7, wr4 \n\t"
+        "wldrd wr1, [%[block], #8] \n\t"
+        "wunpckelub wr8, wr5 \n\t"
+        "wldrd wr2, [%[block], #16] \n\t"
+        "wunpckehub wr9, wr5 \n\t"
+        "wldrd wr3, [%[block], #24] \n\t"
+        "add %[block], %[block], #32 \n\t"
+        "waddhss wr10, wr0, wr6 \n\t"
+        "waddhss wr11, wr1, wr7 \n\t"
+        "waddhss wr12, wr2, wr8 \n\t"
+        "waddhss wr13, wr3, wr9 \n\t"
+        "wpackhus wr14, wr10, wr11 \n\t"
+        "wpackhus wr15, wr12, wr13 \n\t"
+        "wstrd wr14, [%[pixels]] \n\t"
+        "add %[pixels], %[pixels], %[line_size2] \n\t"
+        "subs r12, r12, #1 \n\t"
+        "wstrd wr15, [%[pixels2]] \n\t"
+        "add %[pixels2], %[pixels2], %[line_size2] \n\t"
+        "bne 1b \n\t"
+        : [block]"+r"(block), [pixels]"+r"(pixels), [pixels2]"+r"(pixels2)
+        : [line_size2]"r"(line_size << 1)
+        : "cc", "memory", "r12");
+}
+
+static void clear_blocks_iwmmxt(DCTELEM *blocks) /* zeroes 768 bytes at blocks: 24 iterations x 32 bytes */
+{
+    __asm__ volatile(
+        "wzero wr0 \n\t"
+        "mov r1, #(128 * 6 / 32) \n\t"
+        "1: \n\t"
+        "wstrd wr0, [%0] \n\t"
+        "wstrd wr0, [%0, #8] \n\t"
+        "wstrd wr0, [%0, #16] \n\t"
+        "wstrd wr0, [%0, #24] \n\t"
+        "subs r1, r1, #1 \n\t"
+        "add %0, %0, #32 \n\t"
+        "bne 1b \n\t"
+        : "+r"(blocks)
+        :
+        : "cc", "memory", "r1" /* subs rewrites the flags; wstrd stores through %0 */
+        );
+}
+
+static void nop(uint8_t *block, const uint8_t
*pixels, int line_size, int h) +{ + return; +} + +/* A run time test is not simple. If this file is compiled in + * then we should install the functions + */ +int mm_flags = FF_MM_IWMMXT; /* multimedia extension flags */ + +void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx) +{ + if (avctx->dsp_mask) { + if (avctx->dsp_mask & FF_MM_FORCE) + mm_flags |= (avctx->dsp_mask & 0xffff); + else + mm_flags &= ~(avctx->dsp_mask & 0xffff); + } + + if (!(mm_flags & FF_MM_IWMMXT)) return; + + c->add_pixels_clamped = add_pixels_clamped_iwmmxt; + + c->clear_blocks = clear_blocks_iwmmxt; + + c->put_pixels_tab[0][0] = put_pixels16_iwmmxt; + c->put_pixels_tab[0][1] = put_pixels16_x2_iwmmxt; + c->put_pixels_tab[0][2] = put_pixels16_y2_iwmmxt; + c->put_pixels_tab[0][3] = put_pixels16_xy2_iwmmxt; + c->put_no_rnd_pixels_tab[0][0] = put_pixels16_iwmmxt; + c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_iwmmxt; + c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_iwmmxt; + c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_iwmmxt; + + c->put_pixels_tab[1][0] = put_pixels8_iwmmxt; + c->put_pixels_tab[1][1] = put_pixels8_x2_iwmmxt; + c->put_pixels_tab[1][2] = put_pixels8_y2_iwmmxt; + c->put_pixels_tab[1][3] = put_pixels8_xy2_iwmmxt; + c->put_no_rnd_pixels_tab[1][0] = put_pixels8_iwmmxt; + c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_iwmmxt; + c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_iwmmxt; + c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_iwmmxt; + + c->avg_pixels_tab[0][0] = avg_pixels16_iwmmxt; + c->avg_pixels_tab[0][1] = avg_pixels16_x2_iwmmxt; + c->avg_pixels_tab[0][2] = avg_pixels16_y2_iwmmxt; + c->avg_pixels_tab[0][3] = avg_pixels16_xy2_iwmmxt; + c->avg_no_rnd_pixels_tab[0][0] = avg_pixels16_iwmmxt; + c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_iwmmxt; + c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_iwmmxt; + c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_iwmmxt; + + c->avg_pixels_tab[1][0] 
= avg_pixels8_iwmmxt; + c->avg_pixels_tab[1][1] = avg_pixels8_x2_iwmmxt; + c->avg_pixels_tab[1][2] = avg_pixels8_y2_iwmmxt; + c->avg_pixels_tab[1][3] = avg_pixels8_xy2_iwmmxt; + c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_iwmmxt; + c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_iwmmxt; + c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_iwmmxt; + c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_iwmmxt; +} diff -r c30b92cf446b -r 9281a8a9387a arm/dsputil_iwmmxt_rnd_template.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arm/dsputil_iwmmxt_rnd_template.c Wed Dec 17 00:54:54 2008 +0000 @@ -0,0 +1,1114 @@ +/* + * iWMMXt optimized DSP utils + * copyright (c) 2004 AGAWA Koji + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + __asm__ volatile ( + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r4, %[pixels], %[line_size] \n\t" + "add r5, %[block], %[line_size] \n\t" + "mov %[line_size], %[line_size], lsl #1 \n\t" + "1: \n\t" + "wldrd wr0, [%[pixels]] \n\t" + "subs %[h], %[h], #2 \n\t" + "wldrd wr1, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "wldrd wr3, [r4] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "wldrd wr4, [r4, #8] \n\t" + "add r4, r4, %[line_size] \n\t" + "walignr1 wr8, wr0, wr1 \n\t" + "pld [r4] \n\t" + "pld [r4, #32] \n\t" + "walignr1 wr10, wr3, wr4 \n\t" + "wstrd wr8, [%[block]] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "wstrd wr10, [r5] \n\t" + "add r5, r5, %[line_size] \n\t" + "bne 1b \n\t" + : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) + : + : "memory", "r4", "r5", "r12"); +} + +void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + __asm__ volatile ( + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r4, %[pixels], %[line_size] \n\t" + "add r5, %[block], %[line_size] \n\t" + "mov %[line_size], %[line_size], lsl #1 \n\t" + "1: \n\t" + "wldrd wr0, [%[pixels]] \n\t" + "subs %[h], %[h], #2 \n\t" + "wldrd wr1, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "wldrd wr3, [r4] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "wldrd wr4, [r4, #8] \n\t" + "add r4, r4, %[line_size] \n\t" + "walignr1 wr8, wr0, wr1 \n\t" + "wldrd wr0, [%[block]] 
\n\t" + "wldrd wr2, [r5] \n\t" + "pld [r4] \n\t" + "pld [r4, #32] \n\t" + "walignr1 wr10, wr3, wr4 \n\t" + WAVG2B" wr8, wr8, wr0 \n\t" + WAVG2B" wr10, wr10, wr2 \n\t" + "wstrd wr8, [%[block]] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "wstrd wr10, [r5] \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "add r5, r5, %[line_size] \n\t" + "pld [r5] \n\t" + "pld [r5, #32] \n\t" + "bne 1b \n\t" + : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) + : + : "memory", "r4", "r5", "r12"); +} + +void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + __asm__ volatile ( + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r4, %[pixels], %[line_size] \n\t" + "add r5, %[block], %[line_size] \n\t" + "mov %[line_size], %[line_size], lsl #1 \n\t" + "1: \n\t" + "wldrd wr0, [%[pixels]] \n\t" + "wldrd wr1, [%[pixels], #8] \n\t" + "subs %[h], %[h], #2 \n\t" + "wldrd wr2, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "wldrd wr3, [r4] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr8, wr0, wr1 \n\t" + "wldrd wr4, [r4, #8] \n\t" + "walignr1 wr9, wr1, wr2 \n\t" + "wldrd wr5, [r4, #16] \n\t" + "add r4, r4, %[line_size] \n\t" + "pld [r4] \n\t" + "pld [r4, #32] \n\t" + "walignr1 wr10, wr3, wr4 \n\t" + "wstrd wr8, [%[block]] \n\t" + "walignr1 wr11, wr4, wr5 \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "wstrd wr10, [r5] \n\t" + "wstrd wr11, [r5, #8] \n\t" + "add r5, r5, %[line_size] \n\t" + "bne 1b \n\t" + : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) + : + : "memory", "r4", "r5", "r12"); +} + +void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + __asm__ volatile ( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "pld 
[%[block]] \n\t" + "pld [%[block], #32] \n\t" + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r4, %[pixels], %[line_size]\n\t" + "add r5, %[block], %[line_size] \n\t" + "mov %[line_size], %[line_size], lsl #1 \n\t" + "1: \n\t" + "wldrd wr0, [%[pixels]] \n\t" + "wldrd wr1, [%[pixels], #8] \n\t" + "subs %[h], %[h], #2 \n\t" + "wldrd wr2, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "wldrd wr3, [r4] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr8, wr0, wr1 \n\t" + "wldrd wr4, [r4, #8] \n\t" + "walignr1 wr9, wr1, wr2 \n\t" + "wldrd wr5, [r4, #16] \n\t" + "add r4, r4, %[line_size] \n\t" + "wldrd wr0, [%[block]] \n\t" + "pld [r4] \n\t" + "wldrd wr1, [%[block], #8] \n\t" + "pld [r4, #32] \n\t" + "wldrd wr2, [r5] \n\t" + "walignr1 wr10, wr3, wr4 \n\t" + "wldrd wr3, [r5, #8] \n\t" + WAVG2B" wr8, wr8, wr0 \n\t" + WAVG2B" wr9, wr9, wr1 \n\t" + WAVG2B" wr10, wr10, wr2 \n\t" + "wstrd wr8, [%[block]] \n\t" + "walignr1 wr11, wr4, wr5 \n\t" + WAVG2B" wr11, wr11, wr3 \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "wstrd wr10, [r5] \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "wstrd wr11, [r5, #8] \n\t" + "add r5, r5, %[line_size] \n\t" + "pld [r5] \n\t" + "pld [r5, #32] \n\t" + "bne 1b \n\t" + : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) + : + : "memory", "r4", "r5", "r12"); +} + +void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version + __asm__ volatile( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r12, r12, #1 \n\t" + "add r4, %[pixels], %[line_size]\n\t" + "tmcr wcgr2, 
r12 \n\t" + "add r5, %[block], %[line_size] \n\t" + "mov %[line_size], %[line_size], lsl #1 \n\t" + + "1: \n\t" + "wldrd wr10, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "wldrd wr13, [r4] \n\t" + "pld [%[pixels]] \n\t" + "wldrd wr14, [r4, #8] \n\t" + "pld [%[pixels], #32] \n\t" + "add r4, r4, %[line_size] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" + "pld [r4] \n\t" + "pld [r4, #32] \n\t" + "walignr1 wr2, wr13, wr14 \n\t" + "wmoveq wr4, wr11 \n\t" + "wmoveq wr6, wr14 \n\t" + "walignr2ne wr4, wr10, wr11 \n\t" + "walignr2ne wr6, wr13, wr14 \n\t" + WAVG2B" wr0, wr0, wr4 \n\t" + WAVG2B" wr2, wr2, wr6 \n\t" + "wstrd wr0, [%[block]] \n\t" + "subs %[h], %[h], #2 \n\t" + "wstrd wr2, [r5] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "add r5, r5, %[line_size] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) + : + : "r4", "r5", "r12", "memory"); +} + +void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version + __asm__ volatile( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r12, r12, #1 \n\t" + "add r4, %[pixels], %[line_size]\n\t" + "tmcr wcgr2, r12 \n\t" + "add r5, %[block], %[line_size] \n\t" + "mov %[line_size], %[line_size], lsl #1 \n\t" + + "1: \n\t" + "wldrd wr10, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "wldrd wr12, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "wldrd wr13, [r4] \n\t" + "pld [%[pixels]] \n\t" + "wldrd wr14, [r4, #8] \n\t" + "pld [%[pixels], #32] \n\t" + "wldrd wr15, [r4, #16] \n\t" + "add r4, r4, %[line_size] \n\t" + "walignr1 wr0, wr10, 
wr11 \n\t" + "pld [r4] \n\t" + "pld [r4, #32] \n\t" + "walignr1 wr1, wr11, wr12 \n\t" + "walignr1 wr2, wr13, wr14 \n\t" + "walignr1 wr3, wr14, wr15 \n\t" + "wmoveq wr4, wr11 \n\t" + "wmoveq wr5, wr12 \n\t" + "wmoveq wr6, wr14 \n\t" + "wmoveq wr7, wr15 \n\t" + "walignr2ne wr4, wr10, wr11 \n\t" + "walignr2ne wr5, wr11, wr12 \n\t" + "walignr2ne wr6, wr13, wr14 \n\t" + "walignr2ne wr7, wr14, wr15 \n\t" + WAVG2B" wr0, wr0, wr4 \n\t" + WAVG2B" wr1, wr1, wr5 \n\t" + "wstrd wr0, [%[block]] \n\t" + WAVG2B" wr2, wr2, wr6 \n\t" + "wstrd wr1, [%[block], #8] \n\t" + WAVG2B" wr3, wr3, wr7 \n\t" + "add %[block], %[block], %[line_size] \n\t" + "wstrd wr2, [r5] \n\t" + "subs %[h], %[h], #2 \n\t" + "wstrd wr3, [r5, #8] \n\t" + "add r5, r5, %[line_size] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) + : + : "r4", "r5", "r12", "memory"); +} + +void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version + __asm__ volatile( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r12, r12, #1 \n\t" + "add r4, %[pixels], %[line_size]\n\t" + "tmcr wcgr2, r12 \n\t" + "add r5, %[block], %[line_size] \n\t" + "mov %[line_size], %[line_size], lsl #1 \n\t" + "pld [r5] \n\t" + "pld [r5, #32] \n\t" + + "1: \n\t" + "wldrd wr10, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "wldrd wr13, [r4] \n\t" + "pld [%[pixels]] \n\t" + "wldrd wr14, [r4, #8] \n\t" + "pld [%[pixels], #32] \n\t" + "add r4, r4, %[line_size] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" + "pld [r4] \n\t" + "pld [r4, #32] \n\t" + "walignr1 wr2, wr13, wr14 
\n\t" + "wmoveq wr4, wr11 \n\t" + "wmoveq wr6, wr14 \n\t" + "walignr2ne wr4, wr10, wr11 \n\t" + "wldrd wr10, [%[block]] \n\t" + "walignr2ne wr6, wr13, wr14 \n\t" + "wldrd wr12, [r5] \n\t" + WAVG2B" wr0, wr0, wr4 \n\t" + WAVG2B" wr2, wr2, wr6 \n\t" + WAVG2B" wr0, wr0, wr10 \n\t" + WAVG2B" wr2, wr2, wr12 \n\t" + "wstrd wr0, [%[block]] \n\t" + "subs %[h], %[h], #2 \n\t" + "wstrd wr2, [r5] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "add r5, r5, %[line_size] \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "pld [r5] \n\t" + "pld [r5, #32] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) + : + : "r4", "r5", "r12", "memory"); +} + +void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version + __asm__ volatile( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r12, r12, #1 \n\t" + "add r4, %[pixels], %[line_size]\n\t" + "tmcr wcgr2, r12 \n\t" + "add r5, %[block], %[line_size] \n\t" + "mov %[line_size], %[line_size], lsl #1 \n\t" + "pld [r5] \n\t" + "pld [r5, #32] \n\t" + + "1: \n\t" + "wldrd wr10, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "wldrd wr12, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "wldrd wr13, [r4] \n\t" + "pld [%[pixels]] \n\t" + "wldrd wr14, [r4, #8] \n\t" + "pld [%[pixels], #32] \n\t" + "wldrd wr15, [r4, #16] \n\t" + "add r4, r4, %[line_size] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" + "pld [r4] \n\t" + "pld [r4, #32] \n\t" + "walignr1 wr1, wr11, wr12 \n\t" + "walignr1 wr2, wr13, wr14 \n\t" + "walignr1 wr3, wr14, wr15 \n\t" + "wmoveq wr4, wr11 \n\t" + "wmoveq wr5, 
wr12 \n\t" + "wmoveq wr6, wr14 \n\t" + "wmoveq wr7, wr15 \n\t" + "walignr2ne wr4, wr10, wr11 \n\t" + "walignr2ne wr5, wr11, wr12 \n\t" + "walignr2ne wr6, wr13, wr14 \n\t" + "walignr2ne wr7, wr14, wr15 \n\t" + "wldrd wr10, [%[block]] \n\t" + WAVG2B" wr0, wr0, wr4 \n\t" + "wldrd wr11, [%[block], #8] \n\t" + WAVG2B" wr1, wr1, wr5 \n\t" + "wldrd wr12, [r5] \n\t" + WAVG2B" wr2, wr2, wr6 \n\t" + "wldrd wr13, [r5, #8] \n\t" + WAVG2B" wr3, wr3, wr7 \n\t" + WAVG2B" wr0, wr0, wr10 \n\t" + WAVG2B" wr1, wr1, wr11 \n\t" + WAVG2B" wr2, wr2, wr12 \n\t" + WAVG2B" wr3, wr3, wr13 \n\t" + "wstrd wr0, [%[block]] \n\t" + "subs %[h], %[h], #2 \n\t" + "wstrd wr1, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "wstrd wr2, [r5] \n\t" + "pld [%[block]] \n\t" + "wstrd wr3, [r5, #8] \n\t" + "add r5, r5, %[line_size] \n\t" + "pld [%[block], #32] \n\t" + "pld [r5] \n\t" + "pld [r5, #32] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) + : + :"r4", "r5", "r12", "memory"); +} + +void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + __asm__ volatile( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "and r12, %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "pld [%[block]] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + + "1: \n\t" + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr4, wr10, wr11 \n\t" + "wldrd wr10, [%[block]] \n\t" + WAVG2B" wr8, wr0, wr4 \n\t" + WAVG2B" wr8, wr8, wr10 \n\t" + "wstrd wr8, [%[block]] \n\t" + "add 
%[block], %[block], %[line_size] \n\t" + + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "pld [%[block]] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" + "wldrd wr10, [%[block]] \n\t" + WAVG2B" wr8, wr0, wr4 \n\t" + WAVG2B" wr8, wr8, wr10 \n\t" + "wstrd wr8, [%[block]] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + "subs %[h], %[h], #2 \n\t" + "pld [%[block]] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) + : + : "cc", "memory", "r12"); +} + +void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + __asm__ volatile( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "and r12, %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "wldrd wr12, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" + "walignr1 wr1, wr11, wr12 \n\t" + + "1: \n\t" + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "wldrd wr12, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr4, wr10, wr11 \n\t" + "walignr1 wr5, wr11, wr12 \n\t" + WAVG2B" wr8, wr0, wr4 \n\t" + WAVG2B" wr9, wr1, wr5 \n\t" + "wstrd wr8, [%[block]] \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "wldrd wr12, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" 
+ "walignr1 wr1, wr11, wr12 \n\t" + WAVG2B" wr8, wr0, wr4 \n\t" + WAVG2B" wr9, wr1, wr5 \n\t" + "wstrd wr8, [%[block]] \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + "subs %[h], %[h], #2 \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) + : + : "r4", "r5", "r12", "memory"); +} + +void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + __asm__ volatile( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "and r12, %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "pld [%[block]] \n\t" + "wldrd wr12, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" + "walignr1 wr1, wr11, wr12 \n\t" + + "1: \n\t" + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "wldrd wr12, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr4, wr10, wr11 \n\t" + "walignr1 wr5, wr11, wr12 \n\t" + "wldrd wr10, [%[block]] \n\t" + "wldrd wr11, [%[block], #8] \n\t" + WAVG2B" wr8, wr0, wr4 \n\t" + WAVG2B" wr9, wr1, wr5 \n\t" + WAVG2B" wr8, wr8, wr10 \n\t" + WAVG2B" wr9, wr9, wr11 \n\t" + "wstrd wr8, [%[block]] \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "pld [%[block]] \n\t" + "wldrd wr12, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" + "walignr1 wr1, wr11, wr12 \n\t" + "wldrd wr10, [%[block]] \n\t" + "wldrd 
wr11, [%[block], #8] \n\t" + WAVG2B" wr8, wr0, wr4 \n\t" + WAVG2B" wr9, wr1, wr5 \n\t" + WAVG2B" wr8, wr8, wr10 \n\t" + WAVG2B" wr9, wr9, wr11 \n\t" + "wstrd wr8, [%[block]] \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + "subs %[h], %[h], #2 \n\t" + "pld [%[block]] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) + : + : "r4", "r5", "r12", "memory"); +} + +void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version + __asm__ volatile( + "pld [%[pixels]] \n\t" + "mov r12, #2 \n\t" + "pld [%[pixels], #32] \n\t" + "tmcr wcgr0, r12 \n\t" /* for shift value */ + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr12, [%[pixels]] \n\t" + "add r12, r12, #1 \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "tmcr wcgr2, r12 \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "cmp r12, #8 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "wmoveq wr10, wr13 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + + "1: \n\t" + // [wr0 wr1 wr2 wr3] + // [wr4 wr5 wr6 wr7] <= * + "wldrd wr12, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr6, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "wmoveq wr10, wr13 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "wunpckelub wr4, wr6 \n\t" + "wunpckehub wr5, wr6 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "waddhus wr4, 
wr4, wr8 \n\t" + "waddhus wr5, wr5, wr9 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + "wstrd wr8, [%[block]] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr12, [%[pixels]] \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "wmoveq wr10, wr13 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + "subs %[h], %[h], #2 \n\t" + "wstrd wr8, [%[block]] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) + : [line_size]"r"(line_size) + : "r12", "memory"); +} + +void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version + __asm__ volatile( + "pld [%[pixels]] \n\t" + "mov r12, #2 \n\t" + "pld [%[pixels], #32] \n\t" + "tmcr wcgr0, r12 \n\t" /* for shift value */ + /* alignment */ + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r12, r12, #1 \n\t" + "tmcr wcgr2, r12 \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr12, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + 
"wldrd wr14, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr3, wr13, wr14 \n\t" + "wmoveq wr10, wr13 \n\t" + "wmoveq wr11, wr14 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "walignr2ne wr11, wr13, wr14 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr2, wr3 \n\t" + "wunpckehub wr3, wr3 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "wunpckelub wr10, wr11 \n\t" + "wunpckehub wr11, wr11 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + "waddhus wr2, wr2, wr10 \n\t" + "waddhus wr3, wr3, wr11 \n\t" + + "1: \n\t" + // [wr0 wr1 wr2 wr3] + // [wr4 wr5 wr6 wr7] <= * + "wldrd wr12, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "wldrd wr14, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr6, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr7, wr13, wr14 \n\t" + "wmoveq wr10, wr13 \n\t" + "wmoveq wr11, wr14 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "walignr2ne wr11, wr13, wr14 \n\t" + "wunpckelub wr4, wr6 \n\t" + "wunpckehub wr5, wr6 \n\t" + "wunpckelub wr6, wr7 \n\t" + "wunpckehub wr7, wr7 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "wunpckelub wr10, wr11 \n\t" + "wunpckehub wr11, wr11 \n\t" + "waddhus wr4, wr4, wr8 \n\t" + "waddhus wr5, wr5, wr9 \n\t" + "waddhus wr6, wr6, wr10 \n\t" + "waddhus wr7, wr7, wr11 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr10, wr2, wr6 \n\t" + "waddhus wr11, wr3, wr7 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "waddhus wr10, wr10, wr15 \n\t" + "waddhus wr11, wr11, wr15 \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wsrlhg wr10, wr10, wcgr0 \n\t" + "wsrlhg wr11, wr11, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + "wpackhus wr9, 
wr10, wr11 \n\t" + "wstrd wr8, [%[block]] \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr12, [%[pixels]] \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "wldrd wr14, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr3, wr13, wr14 \n\t" + "wmoveq wr10, wr13 \n\t" + "wmoveq wr11, wr14 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "walignr2ne wr11, wr13, wr14 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr2, wr3 \n\t" + "wunpckehub wr3, wr3 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "wunpckelub wr10, wr11 \n\t" + "wunpckehub wr11, wr11 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + "waddhus wr2, wr2, wr10 \n\t" + "waddhus wr3, wr3, wr11 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr10, wr2, wr6 \n\t" + "waddhus wr11, wr3, wr7 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "waddhus wr10, wr10, wr15 \n\t" + "waddhus wr11, wr11, wr15 \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wsrlhg wr10, wr10, wcgr0 \n\t" + "wsrlhg wr11, wr11, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + "wpackhus wr9, wr10, wr11 \n\t" + "wstrd wr8, [%[block]] \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + "subs %[h], %[h], #2 \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) + : [line_size]"r"(line_size) + : "r12", "memory"); +} + +void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version + __asm__ volatile( + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "pld 
[%[pixels]] \n\t" + "mov r12, #2 \n\t" + "pld [%[pixels], #32] \n\t" + "tmcr wcgr0, r12 \n\t" /* for shift value */ + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr12, [%[pixels]] \n\t" + "add r12, r12, #1 \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "tmcr wcgr2, r12 \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "cmp r12, #8 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "wmoveq wr10, wr13 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + + "1: \n\t" + // [wr0 wr1 wr2 wr3] + // [wr4 wr5 wr6 wr7] <= * + "wldrd wr12, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr6, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "wmoveq wr10, wr13 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "wunpckelub wr4, wr6 \n\t" + "wunpckehub wr5, wr6 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "waddhus wr4, wr4, wr8 \n\t" + "waddhus wr5, wr5, wr9 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "wldrd wr12, [%[block]] \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + WAVG2B" wr8, wr8, wr12 \n\t" + "wstrd wr8, [%[block]] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "wldrd wr12, [%[pixels]] \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr13, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" 
+ "wmoveq wr10, wr13 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "wldrd wr12, [%[block]] \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + "subs %[h], %[h], #2 \n\t" + WAVG2B" wr8, wr8, wr12 \n\t" + "wstrd wr8, [%[block]] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) + : [line_size]"r"(line_size) + : "r12", "memory"); +} + +void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version + __asm__ volatile( + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "pld [%[pixels]] \n\t" + "mov r12, #2 \n\t" + "pld [%[pixels], #32] \n\t" + "tmcr wcgr0, r12 \n\t" /* for shift value */ + /* alignment */ + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r12, r12, #1 \n\t" + "tmcr wcgr2, r12 \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr12, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "wldrd wr14, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr3, wr13, wr14 \n\t" + "wmoveq wr10, wr13 \n\t" + "wmoveq wr11, wr14 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "walignr2ne wr11, wr13, wr14 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr2, wr3 \n\t" + "wunpckehub 
wr3, wr3 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "wunpckelub wr10, wr11 \n\t" + "wunpckehub wr11, wr11 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + "waddhus wr2, wr2, wr10 \n\t" + "waddhus wr3, wr3, wr11 \n\t" + + "1: \n\t" + // [wr0 wr1 wr2 wr3] + // [wr4 wr5 wr6 wr7] <= * + "wldrd wr12, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "wldrd wr14, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr6, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr7, wr13, wr14 \n\t" + "wmoveq wr10, wr13 \n\t" + "wmoveq wr11, wr14 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "walignr2ne wr11, wr13, wr14 \n\t" + "wunpckelub wr4, wr6 \n\t" + "wunpckehub wr5, wr6 \n\t" + "wunpckelub wr6, wr7 \n\t" + "wunpckehub wr7, wr7 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "wunpckelub wr10, wr11 \n\t" + "wunpckehub wr11, wr11 \n\t" + "waddhus wr4, wr4, wr8 \n\t" + "waddhus wr5, wr5, wr9 \n\t" + "waddhus wr6, wr6, wr10 \n\t" + "waddhus wr7, wr7, wr11 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr10, wr2, wr6 \n\t" + "waddhus wr11, wr3, wr7 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "waddhus wr10, wr10, wr15 \n\t" + "waddhus wr11, wr11, wr15 \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wldrd wr12, [%[block]] \n\t" + "wldrd wr13, [%[block], #8] \n\t" + "wsrlhg wr10, wr10, wcgr0 \n\t" + "wsrlhg wr11, wr11, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + "wpackhus wr9, wr10, wr11 \n\t" + WAVG2B" wr8, wr8, wr12 \n\t" + WAVG2B" wr9, wr9, wr13 \n\t" + "wstrd wr8, [%[block]] \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr12, [%[pixels]] \n\t" + "pld [%[block]] \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "pld [%[block], #32] \n\t" + 
"wldrd wr14, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr3, wr13, wr14 \n\t" + "wmoveq wr10, wr13 \n\t" + "wmoveq wr11, wr14 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "walignr2ne wr11, wr13, wr14 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr2, wr3 \n\t" + "wunpckehub wr3, wr3 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "wunpckelub wr10, wr11 \n\t" + "wunpckehub wr11, wr11 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + "waddhus wr2, wr2, wr10 \n\t" + "waddhus wr3, wr3, wr11 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr10, wr2, wr6 \n\t" + "waddhus wr11, wr3, wr7 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "waddhus wr10, wr10, wr15 \n\t" + "waddhus wr11, wr11, wr15 \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wldrd wr12, [%[block]] \n\t" + "wldrd wr13, [%[block], #8] \n\t" + "wsrlhg wr10, wr10, wcgr0 \n\t" + "wsrlhg wr11, wr11, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + "wpackhus wr9, wr10, wr11 \n\t" + WAVG2B" wr8, wr8, wr12 \n\t" + WAVG2B" wr9, wr9, wr13 \n\t" + "wstrd wr8, [%[block]] \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "subs %[h], %[h], #2 \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) + : [line_size]"r"(line_size) + : "r12", "memory"); +} diff -r c30b92cf446b -r 9281a8a9387a arm/dsputil_neon.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arm/dsputil_neon.c Wed Dec 17 00:54:54 2008 +0000 @@ -0,0 +1,169 @@ +/* + * ARM NEON optimised DSP functions + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> /* uint8_t and int8_t used by the prototypes in this file */ + +#include "libavcodec/avcodec.h" +#include "libavcodec/dsputil.h" + +void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int); /* the ff_*_pixels*_neon routines are defined in dsputil_neon_s.S */ +void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels8_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); + +void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, int, int); + +void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc10_neon(uint8_t 
*, uint8_t *, int); +void ff_put_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, int); + +void ff_put_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, int); + +void ff_avg_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int); + +void 
ff_put_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); +void ff_put_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); + +void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); +void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); + +void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, + int beta, int8_t *tc0); + +void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride); +void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride); + +void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) /* points the DSPContext entries below at the NEON routines declared above; avctx is not used */ +{ + c->put_pixels_tab[0][0] = ff_put_pixels16_neon; + c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon; + c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon; + c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon; + c->put_pixels_tab[1][0] = ff_put_pixels8_neon; + c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon; + c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon; + c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon; + + c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon; /* same routine as put_pixels_tab[0][0]; there is no separate no_rnd variant of it */ + c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon; + c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon; + c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon; + c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon; /* same routine as put_pixels_tab[1][0] */ + c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon; + c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon; + c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon; + + c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon; /* the only avg_pixels_tab entry assigned in this function */ + + c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon; + 
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon; + + c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon; + c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon; + + c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon; /* the mcXY routine sits at index X + 4*Y */ + c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon; + c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon; + c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon; + c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon; + c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon; + c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon; + c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon; + c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon; + c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon; + c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon; + c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon; + c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon; + c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon; + c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon; + c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon; + + c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon; + c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon; + c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon; + c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon; + c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon; + c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon; + c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon; + c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon; + c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon; + c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon; + 
c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon; + c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon; + c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon; + c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon; + c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_neon; + c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon; + + c->avg_h264_qpel_pixels_tab[0][ 0] = ff_avg_h264_qpel16_mc00_neon; /* the only avg qpel entry assigned in this function */ + + c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon; + c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon; + c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; + c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; + + c->h264_idct_add = ff_h264_idct_add_neon; + c->h264_idct_dc_add = ff_h264_idct_dc_add_neon; +} diff -r c30b92cf446b -r 9281a8a9387a arm/dsputil_neon_s.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arm/dsputil_neon_s.S Wed Dec 17 00:54:54 2008 +0000 @@ -0,0 +1,274 @@ +/* + * ARM NEON optimised DSP functions + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + + preserve8 + .fpu neon + .text + + .macro pixels16 avg=0 +.if \avg + mov ip, r0 +.endif +1: vld1.64 {d0, d1}, [r1], r2 + vld1.64 {d2, d3}, [r1], r2 + vld1.64 {d4, d5}, [r1], r2 + pld [r1, r2, lsl #2] + vld1.64 {d6, d7}, [r1], r2 + pld [r1] + pld [r1, r2] + pld [r1, r2, lsl #1] +.if \avg + vld1.64 {d16,d17}, [ip], r2 + vrhadd.u8 q0, q0, q8 + vld1.64 {d18,d19}, [ip], r2 + vrhadd.u8 q1, q1, q9 + vld1.64 {d20,d21}, [ip], r2 + vrhadd.u8 q2, q2, q10 + vld1.64 {d22,d23}, [ip], r2 + vrhadd.u8 q3, q3, q11 +.endif + subs r3, r3, #4 + vst1.64 {d0, d1}, [r0,:128], r2 + vst1.64 {d2, d3}, [r0,:128], r2 + vst1.64 {d4, d5}, [r0,:128], r2 + vst1.64 {d6, d7}, [r0,:128], r2 + bne 1b + bx lr + .endm + + .macro pixels16_x2 vhadd=vrhadd.u8 +1: vld1.64 {d0-d2}, [r1], r2 + vld1.64 {d4-d6}, [r1], r2 + pld [r1] + pld [r1, r2] + subs r3, r3, #2 + vext.8 q1, q0, q1, #1 + \vhadd q0, q0, q1 + vext.8 q3, q2, q3, #1 + \vhadd q2, q2, q3 + vst1.64 {d0, d1}, [r0,:128], r2 + vst1.64 {d4, d5}, [r0,:128], r2 + bne 1b + bx lr + .endm + + .macro pixels16_y2 vhadd=vrhadd.u8 + push {lr} + add ip, r1, r2 + lsl lr, r2, #1 + vld1.64 {d0, d1}, [r1], lr + vld1.64 {d2, d3}, [ip], lr +1: subs r3, r3, #2 + \vhadd q2, q0, q1 + vld1.64 {d0, d1}, [r1], lr + \vhadd q3, q0, q1 + vld1.64 {d2, d3}, [ip], lr + pld [r1] + pld [ip] + vst1.64 {d4, d5}, [r0,:128], r2 + vst1.64 {d6, d7}, [r0,:128], r2 + bne 1b + pop {pc} + .endm + + .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0 + push {lr} + lsl lr, r2, #1 + add ip, r1, r2 + vld1.64 {d0-d2}, [r1], lr + vld1.64 {d4-d6}, [ip], lr +.if \no_rnd + vmov.i16 q13, #1 +.endif + pld [r1] + pld [ip] + vext.8 q1, q0, q1, #1 + vext.8 q3, q2, q3, #1 + vaddl.u8 q8, d0, d2 + vaddl.u8 q10, d1, d3 + vaddl.u8 q9, d4, d6 + vaddl.u8 q11, d5, d7 +1: subs 
r3, r3, #2 + vld1.64 {d0-d2}, [r1], lr + vadd.u16 q12, q8, q9 + pld [r1] +.if \no_rnd + vadd.u16 q12, q12, q13 +.endif + vext.8 q15, q0, q1, #1 + vadd.u16 q1 , q10, q11 + \vshrn d28, q12, #2 +.if \no_rnd + vadd.u16 q1, q1, q13 +.endif + \vshrn d29, q1, #2 + vaddl.u8 q8, d0, d30 + vld1.64 {d2-d4}, [ip], lr + vaddl.u8 q10, d1, d31 + vst1.64 {d28,d29}, [r0,:128], r2 + vadd.u16 q12, q8, q9 + pld [ip] +.if \no_rnd + vadd.u16 q12, q12, q13 +.endif + vext.8 q2, q1, q2, #1 + vadd.u16 q0, q10, q11 + \vshrn d30, q12, #2 +.if \no_rnd + vadd.u16 q0, q0, q13 +.endif + \vshrn d31, q0, #2 + vaddl.u8 q9, d2, d4 + vaddl.u8 q11, d3, d5 + vst1.64 {d30,d31}, [r0,:128], r2 + bgt 1b + pop {pc} + .endm + + .macro pixels8 +1: vld1.64 {d0}, [r1], r2 + vld1.64 {d1}, [r1], r2 + vld1.64 {d2}, [r1], r2 + pld [r1, r2, lsl #2] + vld1.64 {d3}, [r1], r2 + pld [r1] + pld [r1, r2] + pld [r1, r2, lsl #1] + subs r3, r3, #4 + vst1.64 {d0}, [r0,:64], r2 + vst1.64 {d1}, [r0,:64], r2 + vst1.64 {d2}, [r0,:64], r2 + vst1.64 {d3}, [r0,:64], r2 + bne 1b + bx lr + .endm + + .macro pixels8_x2 vhadd=vrhadd.u8 +1: vld1.64 {d0, d1}, [r1], r2 + vext.8 d1, d0, d1, #1 + vld1.64 {d2, d3}, [r1], r2 + vext.8 d3, d2, d3, #1 + pld [r1] + pld [r1, r2] + subs r3, r3, #2 + vswp d1, d2 + \vhadd q0, q0, q1 + vst1.64 {d0}, [r0,:64], r2 + vst1.64 {d1}, [r0,:64], r2 + bne 1b + bx lr + .endm + + .macro pixels8_y2 vhadd=vrhadd.u8 + push {lr} + add ip, r1, r2 + lsl lr, r2, #1 + vld1.64 {d0}, [r1], lr + vld1.64 {d1}, [ip], lr +1: subs r3, r3, #2 + \vhadd d4, d0, d1 + vld1.64 {d0}, [r1], lr + \vhadd d5, d0, d1 + vld1.64 {d1}, [ip], lr + pld [r1] + pld [ip] + vst1.64 {d4}, [r0,:64], r2 + vst1.64 {d5}, [r0,:64], r2 + bne 1b + pop {pc} + .endm + + .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0 + push {lr} + lsl lr, r2, #1 + add ip, r1, r2 + vld1.64 {d0, d1}, [r1], lr + vld1.64 {d2, d3}, [ip], lr +.if \no_rnd + vmov.i16 q11, #1 +.endif + pld [r1] + pld [ip] + vext.8 d4, d0, d1, #1 + vext.8 d6, d2, d3, #1 + vaddl.u8 q8, d0, d4 + vaddl.u8 q9, 
d2, d6 +1: subs r3, r3, #2 + vld1.64 {d0, d1}, [r1], lr + pld [r1] + vadd.u16 q10, q8, q9 + vext.8 d4, d0, d1, #1 +.if \no_rnd + vadd.u16 q10, q10, q11 +.endif + vaddl.u8 q8, d0, d4 + \vshrn d5, q10, #2 + vld1.64 {d2, d3}, [ip], lr + vadd.u16 q10, q8, q9 + pld [ip] +.if \no_rnd + vadd.u16 q10, q10, q11 +.endif + vst1.64 {d5}, [r0,:64], r2 + \vshrn d7, q10, #2 + vext.8 d6, d2, d3, #1 + vaddl.u8 q9, d2, d6 + vst1.64 {d7}, [r0,:64], r2 + bgt 1b + pop {pc} + .endm + + .macro pixfunc pfx name suf rnd_op args:vararg +function ff_\pfx\name\suf\()_neon, export=1 + \name \rnd_op \args + .endfunc + .endm + + .macro pixfunc2 pfx name args:vararg + pixfunc \pfx \name + pixfunc \pfx \name \args + .endm + +function ff_put_h264_qpel16_mc00_neon, export=1 + mov r3, #16 + .endfunc + + pixfunc put_ pixels16 + pixfunc2 put_ pixels16_x2, _no_rnd, vhadd.u8 + pixfunc2 put_ pixels16_y2, _no_rnd, vhadd.u8 + pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1 + +function ff_avg_h264_qpel16_mc00_neon, export=1 + mov r3, #16 + .endfunc + + pixfunc avg_ pixels16,, 1 + +function ff_put_h264_qpel8_mc00_neon, export=1 + mov r3, #8 + .endfunc + + pixfunc put_ pixels8 + pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8 + pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8 + pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1 diff -r c30b92cf446b -r 9281a8a9387a arm/dsputil_vfp.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arm/dsputil_vfp.S Wed Dec 17 00:54:54 2008 +0000 @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2008 Siarhei Siamashka + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "asm.S" + + .fpu neon @ required for gas to accept UAL syntax +/* + * VFP is a floating point coprocessor used in some ARM cores. VFP11 has 1 cycle + * throughput for almost all the instructions (except for double precision + * arithmetic), but rather high latency. Latency is 4 cycles for loads and 8 cycles + * for arithmetic operations. Scheduling code to avoid pipeline stalls is very + * important for performance. One more interesting feature is that VFP has + * independent load/store and arithmetic pipelines, so it is possible to make + * them work simultaneously and get more than 1 operation per cycle. The load/store + * pipeline can process 2 single precision floating point values per cycle and + * supports bulk loads and stores for large sets of registers. Arithmetic operations + * can be done on vectors, which makes it possible to keep the arithmetic pipeline busy + * while the processor issues and executes other instructions. Detailed + * optimization manuals can be found at http://www.arm.com + */ + +/** + * ARM VFP optimized implementation of 'vector_fmul_c' function. 
+ * Assume that len is a positive number and is multiple of 8 + */ +@ void ff_vector_fmul_vfp(float *dst, const float *src, int len) +function ff_vector_fmul_vfp, export=1 + vpush {d8-d15} /* save d8-d15: s16-s31 hold the second group of operands in the loop */ + mov r3, r0 /* r3 reads dst while r0 writes it, so dst is multiplied by src in place */ + fmrx r12, fpscr + orr r12, r12, #(3 << 16) /* set vector size to 4 */ + fmxr fpscr, r12 + + vldmia r3!, {s0-s3} /* first 8 elements of dst go to s0-s7, of src to s8-s15 */ + vldmia r1!, {s8-s11} + vldmia r3!, {s4-s7} + vldmia r1!, {s12-s15} + vmul.f32 s8, s0, s8 /* vector size 4: this one instruction yields the four products later stored from s8-s11 */ +1: + subs r2, r2, #16 /* ge: 8 more elements exist beyond the 8 in flight; gt: yet another 8 follow */ + vmul.f32 s12, s4, s12 + vldmiage r3!, {s16-s19} + vldmiage r1!, {s24-s27} + vldmiage r3!, {s20-s23} + vldmiage r1!, {s28-s31} + vmulge.f32 s24, s16, s24 + vstmia r0!, {s8-s11} /* store the 8 products of the group loaded before this pass */ + vstmia r0!, {s12-s15} + vmulge.f32 s28, s20, s28 + vldmiagt r3!, {s0-s3} + vldmiagt r1!, {s8-s11} + vldmiagt r3!, {s4-s7} + vldmiagt r1!, {s12-s15} + vmulge.f32 s8, s0, s8 /* condition is ge, not gt: when r2 reaches 0 the result is computed but never stored */ + vstmiage r0!, {s24-s27} + vstmiage r0!, {s28-s31} + bgt 1b + + bic r12, r12, #(7 << 16) /* set vector size back to 1 */ + fmxr fpscr, r12 + vpop {d8-d15} /* restore the registers saved on entry */ + bx lr + .endfunc + +/** + * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function. 
+ * Assume that len is a positive number and is multiple of 8 + */ +@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, +@ const float *src1, int len) +function ff_vector_fmul_reverse_vfp, export=1 + vpush {d8-d15} /* save d8-d15: s16-s31 hold the second group of operands in the loop */ + add r2, r2, r3, lsl #2 /* r2 = src1 + len floats, one past the end; src1 is then read backwards with vldmdb */ + vldmdb r2!, {s0-s3} + vldmia r1!, {s8-s11} + vldmdb r2!, {s4-s7} + vldmia r1!, {s12-s15} + vmul.f32 s8, s3, s8 /* one multiply per element: src0 ascending (s8, s9, ...) times src1 descending (s3, s2, ...) */ + vmul.f32 s9, s2, s9 + vmul.f32 s10, s1, s10 + vmul.f32 s11, s0, s11 +1: + subs r3, r3, #16 /* ge: 8 more elements exist beyond the 8 in flight; gt: yet another 8 follow */ + vldmdbge r2!, {s16-s19} + vmul.f32 s12, s7, s12 + vldmiage r1!, {s24-s27} + vmul.f32 s13, s6, s13 + vldmdbge r2!, {s20-s23} + vmul.f32 s14, s5, s14 + vldmiage r1!, {s28-s31} + vmul.f32 s15, s4, s15 + vmulge.f32 s24, s19, s24 + vldmdbgt r2!, {s0-s3} + vmulge.f32 s25, s18, s25 + vstmia r0!, {s8-s13} /* store 6 of the 8 finished products; s14-s15 are stored further down */ + vmulge.f32 s26, s17, s26 + vldmiagt r1!, {s8-s11} /* s8-s11 are reloaded only after the store above has read them */ + vmulge.f32 s27, s16, s27 + vmulge.f32 s28, s23, s28 + vldmdbgt r2!, {s4-s7} + vmulge.f32 s29, s22, s29 + vstmia r0!, {s14-s15} + vmulge.f32 s30, s21, s30 + vmulge.f32 s31, s20, s31 + vmulge.f32 s8, s3, s8 + vldmiagt r1!, {s12-s15} + vmulge.f32 s9, s2, s9 + vmulge.f32 s10, s1, s10 + vstmiage r0!, {s24-s27} + vmulge.f32 s11, s0, s11 + vstmiage r0!, {s28-s31} + bgt 1b + + vpop {d8-d15} /* restore the registers saved on entry */ + bx lr + .endfunc + +#ifdef HAVE_ARMV6 +/** + * ARM VFP optimized float to int16 conversion. 
+ * Assume that len is a positive number and is multiple of 8, destination + * buffer is at least 4 bytes aligned (8 bytes alignment is better for + * performance), little endian byte sex + */ +@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len) +function ff_float_to_int16_vfp, export=1 + push {r4-r8,lr} + vpush {d8-d11} /* save d8-d11: s16-s23 receive the loaded floats */ + vldmia r1!, {s16-s23} /* load the first 8 floats from src */ + vcvt.s32.f32 s0, s16 /* convert each float to a 32-bit integer, results in s0-s7 */ + vcvt.s32.f32 s1, s17 + vcvt.s32.f32 s2, s18 + vcvt.s32.f32 s3, s19 + vcvt.s32.f32 s4, s20 + vcvt.s32.f32 s5, s21 + vcvt.s32.f32 s6, s22 + vcvt.s32.f32 s7, s23 +1: + subs r2, r2, #8 /* 8 samples per pass; gt: another group of 8 follows */ + vmov r3, r4, s0, s1 /* move the 8 converted integers into core registers */ + vmov r5, r6, s2, s3 + vmov r7, r8, s4, s5 + vmov ip, lr, s6, s7 + vldmiagt r1!, {s16-s23} /* start loading the next group while this one is packed */ + ssat r4, #16, r4 /* saturate to the signed 16-bit range */ + ssat r3, #16, r3 + ssat r6, #16, r6 + ssat r5, #16, r5 + pkhbt r3, r3, r4, lsl #16 /* pack two samples per word: earlier sample in the low half, later one in the high half */ + pkhbt r4, r5, r6, lsl #16 + vcvtgt.s32.f32 s0, s16 + vcvtgt.s32.f32 s1, s17 + vcvtgt.s32.f32 s2, s18 + vcvtgt.s32.f32 s3, s19 + vcvtgt.s32.f32 s4, s20 + vcvtgt.s32.f32 s5, s21 + vcvtgt.s32.f32 s6, s22 + vcvtgt.s32.f32 s7, s23 + ssat r8, #16, r8 + ssat r7, #16, r7 + ssat lr, #16, lr + ssat ip, #16, ip + pkhbt r5, r7, r8, lsl #16 + pkhbt r6, ip, lr, lsl #16 + stmia r0!, {r3-r6} /* write 4 words, that is 8 packed 16-bit samples */ + bgt 1b + + vpop {d8-d11} + pop {r4-r8,pc} /* restore core registers and return by loading pc */ + .endfunc +#endif diff -r c30b92cf446b -r 9281a8a9387a arm/float_arm_vfp.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arm/float_arm_vfp.c Wed Dec 17 00:54:54 2008 +0000 @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2008 Siarhei Siamashka + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/dsputil.h" + +void ff_vector_fmul_vfp(float *dst, const float *src, int len); /* these three routines are implemented in dsputil_vfp.S */ +void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, + const float *src1, int len); +void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len); /* NOTE(review): long len here, while the header comment in dsputil_vfp.S says int len; presumably this matches the DSPContext float_to_int16 pointer type - confirm */ + +void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx) /* points the float entries of c at the VFP routines; avctx is not used */ +{ + c->vector_fmul = ff_vector_fmul_vfp; + c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp; +#ifdef HAVE_ARMV6 /* ff_float_to_int16_vfp is assembled under the same guard in dsputil_vfp.S */ + c->float_to_int16 = ff_float_to_int16_vfp; +#endif +} diff -r c30b92cf446b -r 9281a8a9387a arm/h264dsp_neon.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arm/h264dsp_neon.S Wed Dec 17 00:54:54 2008 +0000 @@ -0,0 +1,1377 @@ +/* + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + + .fpu neon + + .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7 + vtrn.32 \r0, \r4 + vtrn.32 \r1, \r5 + vtrn.32 \r2, \r6 + vtrn.32 \r3, \r7 + vtrn.16 \r0, \r2 + vtrn.16 \r1, \r3 + vtrn.16 \r4, \r6 + vtrn.16 \r5, \r7 + vtrn.8 \r0, \r1 + vtrn.8 \r2, \r3 + vtrn.8 \r4, \r5 + vtrn.8 \r6, \r7 + .endm + + .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7 + vswp \r0, \r4 + vswp \r1, \r5 + vswp \r2, \r6 + vswp \r3, \r7 + .endm + + .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7 + vtrn.32 \r0, \r2 + vtrn.32 \r1, \r3 + vtrn.32 \r4, \r6 + vtrn.32 \r5, \r7 + vtrn.16 \r0, \r1 + vtrn.16 \r2, \r3 + vtrn.16 \r4, \r5 + vtrn.16 \r6, \r7 + .endm + +/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ + .macro h264_chroma_mc8 avg=0 + push {r4-r7, lr} + ldrd r4, [sp, #20] +.if \avg + mov lr, r0 +.endif + pld [r1] + pld [r1, r2] + + muls r7, r4, r5 + rsb r6, r7, r5, lsl #3 + rsb ip, r7, r4, lsl #3 + sub r4, r7, r4, lsl #3 + sub r4, r4, r5, lsl #3 + add r4, r4, #64 + + beq 2f + + add r5, r1, r2 + + vdup.8 d0, r4 + lsl r4, r2, #1 + vdup.8 d1, ip + vld1.64 {d4, d5}, [r1], r4 + vdup.8 d2, r6 + vld1.64 {d6, d7}, [r5], r4 + vdup.8 d3, r7 + + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + +1: pld [r5] + vmull.u8 q8, d4, d0 + vmlal.u8 q8, d5, d1 + vld1.64 {d4, d5}, [r1], r4 + vmlal.u8 q8, d6, d2 + vext.8 d5, d4, d5, #1 + vmlal.u8 q8, d7, d3 + vmull.u8 q9, d6, d0 + subs r3, r3, #2 + vmlal.u8 q9, d7, d1 + vmlal.u8 q9, d4, d2 + vmlal.u8 q9, d5, d3 + vrshrn.u16 d16, q8, #6 + vld1.64 {d6, d7}, [r5], r4 + pld [r1] + vrshrn.u16 d17, q9, #6 +.if \avg + vld1.64 {d20}, [lr,:64], r2 + vld1.64 {d21}, [lr,:64], r2 + vrhadd.u8 q8, q8, q10 +.endif + vext.8 d7, d6, d7, #1 + vst1.64 {d16}, [r0,:64], r2 + vst1.64 {d17}, [r0,:64], r2 + bgt 1b + + pop {r4-r7, pc} 
+ +2: tst r6, r6 + add ip, ip, r6 + vdup.8 d0, r4 + vdup.8 d1, ip + + beq 4f + + add r5, r1, r2 + lsl r4, r2, #1 + vld1.64 {d4}, [r1], r4 + vld1.64 {d6}, [r5], r4 + +3: pld [r5] + vmull.u8 q8, d4, d0 + vmlal.u8 q8, d6, d1 + vld1.64 {d4}, [r1], r4 + vmull.u8 q9, d6, d0 + vmlal.u8 q9, d4, d1 + vld1.64 {d6}, [r5], r4 + vrshrn.u16 d16, q8, #6 + vrshrn.u16 d17, q9, #6 +.if \avg + vld1.64 {d20}, [lr,:64], r2 + vld1.64 {d21}, [lr,:64], r2 + vrhadd.u8 q8, q8, q10 +.endif + subs r3, r3, #2 + pld [r1] + vst1.64 {d16}, [r0,:64], r2 + vst1.64 {d17}, [r0,:64], r2 + bgt 3b + + pop {r4-r7, pc} + +4: vld1.64 {d4, d5}, [r1], r2 + vld1.64 {d6, d7}, [r1], r2 + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + +5: pld [r1] + subs r3, r3, #2 + vmull.u8 q8, d4, d0 + vmlal.u8 q8, d5, d1 + vld1.64 {d4, d5}, [r1], r2 + vmull.u8 q9, d6, d0 + vmlal.u8 q9, d7, d1 + pld [r1] + vext.8 d5, d4, d5, #1 + vrshrn.u16 d16, q8, #6 + vrshrn.u16 d17, q9, #6 +.if \avg + vld1.64 {d20}, [lr,:64], r2 + vld1.64 {d21}, [lr,:64], r2 + vrhadd.u8 q8, q8, q10 +.endif + vld1.64 {d6, d7}, [r1], r2 + vext.8 d7, d6, d7, #1 + vst1.64 {d16}, [r0,:64], r2 + vst1.64 {d17}, [r0,:64], r2 + bgt 5b + + pop {r4-r7, pc} + .endm + +/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ + .macro h264_chroma_mc4 avg=0 + push {r4-r7, lr} + ldrd r4, [sp, #20] +.if \avg + mov lr, r0 +.endif + pld [r1] + pld [r1, r2] + + muls r7, r4, r5 + rsb r6, r7, r5, lsl #3 + rsb ip, r7, r4, lsl #3 + sub r4, r7, r4, lsl #3 + sub r4, r4, r5, lsl #3 + add r4, r4, #64 + + beq 2f + + add r5, r1, r2 + + vdup.8 d0, r4 + lsl r4, r2, #1 + vdup.8 d1, ip + vld1.64 {d4}, [r1], r4 + vdup.8 d2, r6 + vld1.64 {d6}, [r5], r4 + vdup.8 d3, r7 + + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + vtrn.32 d4, d5 + vtrn.32 d6, d7 + + vtrn.32 d0, d1 + vtrn.32 d2, d3 + +1: pld [r5] + vmull.u8 q8, d4, d0 + vmlal.u8 q8, d6, d2 + vld1.64 {d4}, [r1], r4 + vext.8 d5, d4, d5, #1 + vtrn.32 d4, d5 + vmull.u8 q9, d6, d0 + vmlal.u8 q9, d4, d2 + vld1.64 {d6}, 
[r5], r4 + vadd.i16 d16, d16, d17 + vadd.i16 d17, d18, d19 + vrshrn.u16 d16, q8, #6 + subs r3, r3, #2 + pld [r1] +.if \avg + vld1.32 {d20[0]}, [lr,:32], r2 + vld1.32 {d20[1]}, [lr,:32], r2 + vrhadd.u8 d16, d16, d20 +.endif + vext.8 d7, d6, d7, #1 + vtrn.32 d6, d7 + vst1.32 {d16[0]}, [r0,:32], r2 + vst1.32 {d16[1]}, [r0,:32], r2 + bgt 1b + + pop {r4-r7, pc} + +2: tst r6, r6 + add ip, ip, r6 + vdup.8 d0, r4 + vdup.8 d1, ip + vtrn.32 d0, d1 + + beq 4f + + vext.32 d1, d0, d1, #1 + add r5, r1, r2 + lsl r4, r2, #1 + vld1.32 {d4[0]}, [r1], r4 + vld1.32 {d4[1]}, [r5], r4 + +3: pld [r5] + vmull.u8 q8, d4, d0 + vld1.32 {d4[0]}, [r1], r4 + vmull.u8 q9, d4, d1 + vld1.32 {d4[1]}, [r5], r4 + vadd.i16 d16, d16, d17 + vadd.i16 d17, d18, d19 + vrshrn.u16 d16, q8, #6 +.if \avg + vld1.32 {d20[0]}, [lr,:32], r2 + vld1.32 {d20[1]}, [lr,:32], r2 + vrhadd.u8 d16, d16, d20 +.endif + subs r3, r3, #2 + pld [r1] + vst1.32 {d16[0]}, [r0,:32], r2 + vst1.32 {d16[1]}, [r0,:32], r2 + bgt 3b + + pop {r4-r7, pc} + +4: vld1.64 {d4}, [r1], r2 + vld1.64 {d6}, [r1], r2 + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + vtrn.32 d4, d5 + vtrn.32 d6, d7 + +5: vmull.u8 q8, d4, d0 + vmull.u8 q9, d6, d0 + subs r3, r3, #2 + vld1.64 {d4}, [r1], r2 + vext.8 d5, d4, d5, #1 + vtrn.32 d4, d5 + vadd.i16 d16, d16, d17 + vadd.i16 d17, d18, d19 + pld [r1] + vrshrn.u16 d16, q8, #6 +.if \avg + vld1.32 {d20[0]}, [lr,:32], r2 + vld1.32 {d20[1]}, [lr,:32], r2 + vrhadd.u8 d16, d16, d20 +.endif + vld1.64 {d6}, [r1], r2 + vext.8 d7, d6, d7, #1 + vtrn.32 d6, d7 + pld [r1] + vst1.32 {d16[0]}, [r0,:32], r2 + vst1.32 {d16[1]}, [r0,:32], r2 + bgt 5b + + pop {r4-r7, pc} + .endm + + .text + .align + +function ff_put_h264_chroma_mc8_neon, export=1 + h264_chroma_mc8 + .endfunc + +function ff_avg_h264_chroma_mc8_neon, export=1 + h264_chroma_mc8 avg=1 + .endfunc + +function ff_put_h264_chroma_mc4_neon, export=1 + h264_chroma_mc4 + .endfunc + +function ff_avg_h264_chroma_mc4_neon, export=1 + h264_chroma_mc4 avg=1 + .endfunc + + /* H.264 
loop filter */ + + .macro h264_loop_filter_start + ldr ip, [sp] + tst r2, r2 + ldr ip, [ip] + tstne r3, r3 + vmov.32 d24[0], ip + and ip, ip, ip, lsl #16 + bxeq lr + ands ip, ip, ip, lsl #8 + bxlt lr + .endm + + .macro align_push_regs + and ip, sp, #15 + add ip, ip, #32 + sub sp, sp, ip + vst1.64 {d12-d15}, [sp,:128] + sub sp, sp, #32 + vst1.64 {d8-d11}, [sp,:128] + .endm + + .macro align_pop_regs + vld1.64 {d8-d11}, [sp,:128]! + vld1.64 {d12-d15}, [sp,:128], ip + .endm + + .macro h264_loop_filter_luma + vdup.8 q11, r2 @ alpha + vmovl.u8 q12, d24 + vabd.u8 q6, q8, q0 @ abs(p0 - q0) + vmovl.u16 q12, d24 + vabd.u8 q14, q9, q8 @ abs(p1 - p0) + vsli.16 q12, q12, #8 + vabd.u8 q15, q1, q0 @ abs(q1 - q0) + vsli.32 q12, q12, #16 + vclt.u8 q6, q6, q11 @ < alpha + vdup.8 q11, r3 @ beta + vclt.s8 q7, q12, #0 + vclt.u8 q14, q14, q11 @ < beta + vclt.u8 q15, q15, q11 @ < beta + vbic q6, q6, q7 + vabd.u8 q4, q10, q8 @ abs(p2 - p0) + vand q6, q6, q14 + vabd.u8 q5, q2, q0 @ abs(q2 - q0) + vclt.u8 q4, q4, q11 @ < beta + vand q6, q6, q15 + vclt.u8 q5, q5, q11 @ < beta + vand q4, q4, q6 + vand q5, q5, q6 + vand q12, q12, q6 + vrhadd.u8 q14, q8, q0 + vsub.i8 q6, q12, q4 + vqadd.u8 q7, q9, q12 + vhadd.u8 q10, q10, q14 + vsub.i8 q6, q6, q5 + vhadd.u8 q14, q2, q14 + vmin.u8 q7, q7, q10 + vqsub.u8 q11, q9, q12 + vqadd.u8 q2, q1, q12 + vmax.u8 q7, q7, q11 + vqsub.u8 q11, q1, q12 + vmin.u8 q14, q2, q14 + vmovl.u8 q2, d0 + vmax.u8 q14, q14, q11 + vmovl.u8 q10, d1 + vsubw.u8 q2, q2, d16 + vsubw.u8 q10, q10, d17 + vshl.i16 q2, q2, #2 + vshl.i16 q10, q10, #2 + vaddw.u8 q2, q2, d18 + vaddw.u8 q10, q10, d19 + vsubw.u8 q2, q2, d2 + vsubw.u8 q10, q10, d3 + vrshrn.i16 d4, q2, #3 + vrshrn.i16 d5, q10, #3 + vbsl q4, q7, q9 + vbsl q5, q14, q1 + vneg.s8 q7, q6 + vmovl.u8 q14, d16 + vmin.s8 q2, q2, q6 + vmovl.u8 q6, d17 + vmax.s8 q2, q2, q7 + vmovl.u8 q11, d0 + vmovl.u8 q12, d1 + vaddw.s8 q14, q14, d4 + vaddw.s8 q6, q6, d5 + vsubw.s8 q11, q11, d4 + vsubw.s8 q12, q12, d5 + vqmovun.s16 d16, q14 + 
vqmovun.s16 d17, q6 + vqmovun.s16 d0, q11 + vqmovun.s16 d1, q12 + .endm + +function ff_h264_v_loop_filter_luma_neon, export=1 + h264_loop_filter_start + + vld1.64 {d0, d1}, [r0,:128], r1 + vld1.64 {d2, d3}, [r0,:128], r1 + vld1.64 {d4, d5}, [r0,:128], r1 + sub r0, r0, r1, lsl #2 + sub r0, r0, r1, lsl #1 + vld1.64 {d20,d21}, [r0,:128], r1 + vld1.64 {d18,d19}, [r0,:128], r1 + vld1.64 {d16,d17}, [r0,:128], r1 + + align_push_regs + + h264_loop_filter_luma + + sub r0, r0, r1, lsl #1 + vst1.64 {d8, d9}, [r0,:128], r1 + vst1.64 {d16,d17}, [r0,:128], r1 + vst1.64 {d0, d1}, [r0,:128], r1 + vst1.64 {d10,d11}, [r0,:128] + + align_pop_regs + bx lr + .endfunc + +function ff_h264_h_loop_filter_luma_neon, export=1 + h264_loop_filter_start + + sub r0, r0, #4 + vld1.64 {d6}, [r0], r1 + vld1.64 {d20}, [r0], r1 + vld1.64 {d18}, [r0], r1 + vld1.64 {d16}, [r0], r1 + vld1.64 {d0}, [r0], r1 + vld1.64 {d2}, [r0], r1 + vld1.64 {d4}, [r0], r1 + vld1.64 {d26}, [r0], r1 + vld1.64 {d7}, [r0], r1 + vld1.64 {d21}, [r0], r1 + vld1.64 {d19}, [r0], r1 + vld1.64 {d17}, [r0], r1 + vld1.64 {d1}, [r0], r1 + vld1.64 {d3}, [r0], r1 + vld1.64 {d5}, [r0], r1 + vld1.64 {d27}, [r0], r1 + + transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13 + + align_push_regs + sub sp, sp, #16 + vst1.64 {d4, d5}, [sp,:128] + sub sp, sp, #16 + vst1.64 {d20,d21}, [sp,:128] + + h264_loop_filter_luma + + vld1.64 {d20,d21}, [sp,:128]! + vld1.64 {d4, d5}, [sp,:128]! 
+ + transpose_8x8 q3, q10, q4, q8, q0, q5, q2, q13 + + sub r0, r0, r1, lsl #4 + vst1.64 {d6}, [r0], r1 + vst1.64 {d20}, [r0], r1 + vst1.64 {d8}, [r0], r1 + vst1.64 {d16}, [r0], r1 + vst1.64 {d0}, [r0], r1 + vst1.64 {d10}, [r0], r1 + vst1.64 {d4}, [r0], r1 + vst1.64 {d26}, [r0], r1 + vst1.64 {d7}, [r0], r1 + vst1.64 {d21}, [r0], r1 + vst1.64 {d9}, [r0], r1 + vst1.64 {d17}, [r0], r1 + vst1.64 {d1}, [r0], r1 + vst1.64 {d11}, [r0], r1 + vst1.64 {d5}, [r0], r1 + vst1.64 {d27}, [r0], r1 + + align_pop_regs + bx lr + .endfunc + + .macro h264_loop_filter_chroma + vdup.8 d22, r2 @ alpha + vmovl.u8 q12, d24 + vabd.u8 d26, d16, d0 @ abs(p0 - q0) + vmovl.u8 q2, d0 + vabd.u8 d28, d18, d16 @ abs(p1 - p0) + vsubw.u8 q2, q2, d16 + vsli.16 d24, d24, #8 + vshl.i16 q2, q2, #2 + vabd.u8 d30, d2, d0 @ abs(q1 - q0) + vaddw.u8 q2, q2, d18 + vclt.u8 d26, d26, d22 @ < alpha + vsubw.u8 q2, q2, d2 + vdup.8 d22, r3 @ beta + vclt.s8 d25, d24, #0 + vrshrn.i16 d4, q2, #3 + vclt.u8 d28, d28, d22 @ < beta + vbic d26, d26, d25 + vclt.u8 d30, d30, d22 @ < beta + vand d26, d26, d28 + vneg.s8 d25, d24 + vand d26, d26, d30 + vmin.s8 d4, d4, d24 + vmovl.u8 q14, d16 + vand d4, d4, d26 + vmax.s8 d4, d4, d25 + vmovl.u8 q11, d0 + vaddw.s8 q14, q14, d4 + vsubw.s8 q11, q11, d4 + vqmovun.s16 d16, q14 + vqmovun.s16 d0, q11 + .endm + +function ff_h264_v_loop_filter_chroma_neon, export=1 + h264_loop_filter_start + + sub r0, r0, r1, lsl #1 + vld1.64 {d18}, [r0,:64], r1 + vld1.64 {d16}, [r0,:64], r1 + vld1.64 {d0}, [r0,:64], r1 + vld1.64 {d2}, [r0,:64] + + h264_loop_filter_chroma + + sub r0, r0, r1, lsl #1 + vst1.64 {d16}, [r0,:64], r1 + vst1.64 {d0}, [r0,:64], r1 + + bx lr + .endfunc + +function ff_h264_h_loop_filter_chroma_neon, export=1 + h264_loop_filter_start + + sub r0, r0, #2 + vld1.32 {d18[0]}, [r0], r1 + vld1.32 {d16[0]}, [r0], r1 + vld1.32 {d0[0]}, [r0], r1 + vld1.32 {d2[0]}, [r0], r1 + vld1.32 {d18[1]}, [r0], r1 + vld1.32 {d16[1]}, [r0], r1 + vld1.32 {d0[1]}, [r0], r1 + vld1.32 {d2[1]}, [r0], r1 + + 
vtrn.16 d18, d0 + vtrn.16 d16, d2 + vtrn.8 d18, d16 + vtrn.8 d0, d2 + + h264_loop_filter_chroma + + vtrn.16 d18, d0 + vtrn.16 d16, d2 + vtrn.8 d18, d16 + vtrn.8 d0, d2 + + sub r0, r0, r1, lsl #3 + vst1.32 {d18[0]}, [r0], r1 + vst1.32 {d16[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d2[0]}, [r0], r1 + vst1.32 {d18[1]}, [r0], r1 + vst1.32 {d16[1]}, [r0], r1 + vst1.32 {d0[1]}, [r0], r1 + vst1.32 {d2[1]}, [r0], r1 + + bx lr + .endfunc + + /* H.264 qpel MC */ + + .macro lowpass_const r + movw \r, #5 + movt \r, #20 + vmov.32 d6[0], \r + .endm + + .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1 +.if \narrow + t0 .req q0 + t1 .req q8 +.else + t0 .req \d0 + t1 .req \d1 +.endif + vext.8 d2, \r0, \r1, #2 + vext.8 d3, \r0, \r1, #3 + vaddl.u8 q1, d2, d3 + vext.8 d4, \r0, \r1, #1 + vext.8 d5, \r0, \r1, #4 + vaddl.u8 q2, d4, d5 + vext.8 d30, \r0, \r1, #5 + vaddl.u8 t0, \r0, d30 + vext.8 d18, \r2, \r3, #2 + vmla.i16 t0, q1, d6[1] + vext.8 d19, \r2, \r3, #3 + vaddl.u8 q9, d18, d19 + vext.8 d20, \r2, \r3, #1 + vmls.i16 t0, q2, d6[0] + vext.8 d21, \r2, \r3, #4 + vaddl.u8 q10, d20, d21 + vext.8 d31, \r2, \r3, #5 + vaddl.u8 t1, \r2, d31 + vmla.i16 t1, q9, d6[1] + vmls.i16 t1, q10, d6[0] +.if \narrow + vqrshrun.s16 \d0, t0, #5 + vqrshrun.s16 \d1, t1, #5 +.endif + .unreq t0 + .unreq t1 + .endm + + .macro lowpass_8_1 r0, r1, d0, narrow=1 +.if \narrow + t0 .req q0 +.else + t0 .req \d0 +.endif + vext.8 d2, \r0, \r1, #2 + vext.8 d3, \r0, \r1, #3 + vaddl.u8 q1, d2, d3 + vext.8 d4, \r0, \r1, #1 + vext.8 d5, \r0, \r1, #4 + vaddl.u8 q2, d4, d5 + vext.8 d30, \r0, \r1, #5 + vaddl.u8 t0, \r0, d30 + vmla.i16 t0, q1, d6[1] + vmls.i16 t0, q2, d6[0] +.if \narrow + vqrshrun.s16 \d0, t0, #5 +.endif + .unreq t0 + .endm + + .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d + vext.16 q1, \r0, \r1, #2 + vext.16 q0, \r0, \r1, #3 + vaddl.s16 q9, d2, d0 + vext.16 q2, \r0, \r1, #1 + vaddl.s16 q1, d3, d1 + vext.16 q3, \r0, \r1, #4 + vaddl.s16 q10, d4, d6 + vext.16 \r1, \r0, \r1, #5 + vaddl.s16 q2, d5, d7 + 
vaddl.s16 q0, \h0, \h1 + vaddl.s16 q8, \l0, \l1 + + vshl.i32 q3, q9, #4 + vshl.i32 q9, q9, #2 + vshl.i32 q15, q10, #2 + vadd.i32 q9, q9, q3 + vadd.i32 q10, q10, q15 + + vshl.i32 q3, q1, #4 + vshl.i32 q1, q1, #2 + vshl.i32 q15, q2, #2 + vadd.i32 q1, q1, q3 + vadd.i32 q2, q2, q15 + + vadd.i32 q9, q9, q8 + vsub.i32 q9, q9, q10 + + vadd.i32 q1, q1, q0 + vsub.i32 q1, q1, q2 + + vrshrn.s32 d18, q9, #10 + vrshrn.s32 d19, q1, #10 + + vqmovun.s16 \d, q9 + .endm + +function put_h264_qpel16_h_lowpass_neon_packed + mov r4, lr + mov ip, #16 + mov r3, #8 + bl put_h264_qpel8_h_lowpass_neon + sub r1, r1, r2, lsl #4 + add r1, r1, #8 + mov ip, #16 + mov lr, r4 + b put_h264_qpel8_h_lowpass_neon + .endfunc + +function put_h264_qpel16_h_lowpass_neon + push {lr} + mov ip, #16 + bl put_h264_qpel8_h_lowpass_neon + sub r0, r0, r3, lsl #4 + sub r1, r1, r2, lsl #4 + add r0, r0, #8 + add r1, r1, #8 + mov ip, #16 + pop {lr} + .endfunc + +function put_h264_qpel8_h_lowpass_neon +1: vld1.64 {d0, d1}, [r1], r2 + vld1.64 {d16,d17}, [r1], r2 + subs ip, ip, #2 + lowpass_8 d0, d1, d16, d17, d0, d16 + vst1.64 {d0}, [r0,:64], r3 + vst1.64 {d16}, [r0,:64], r3 + bne 1b + bx lr + .endfunc + +function put_h264_qpel16_h_lowpass_l2_neon + push {lr} + mov ip, #16 + bl put_h264_qpel8_h_lowpass_l2_neon + sub r0, r0, r2, lsl #4 + sub r1, r1, r2, lsl #4 + sub r3, r3, r2, lsl #4 + add r0, r0, #8 + add r1, r1, #8 + add r3, r3, #8 + mov ip, #16 + pop {lr} + .endfunc + +function put_h264_qpel8_h_lowpass_l2_neon +1: vld1.64 {d0, d1}, [r1], r2 + vld1.64 {d16,d17}, [r1], r2 + vld1.64 {d28}, [r3], r2 + vld1.64 {d29}, [r3], r2 + subs ip, ip, #2 + lowpass_8 d0, d1, d16, d17, d0, d1 + vrhadd.u8 q0, q0, q14 + vst1.64 {d0}, [r0,:64], r2 + vst1.64 {d1}, [r0,:64], r2 + bne 1b + bx lr + .endfunc + +function put_h264_qpel16_v_lowpass_neon_packed + mov r4, lr + mov r2, #8 + bl put_h264_qpel8_v_lowpass_neon + sub r1, r1, r3, lsl #2 + bl put_h264_qpel8_v_lowpass_neon + sub r1, r1, r3, lsl #4 + sub r1, r1, r3, lsl #2 + add r1, r1, #8 
+ bl put_h264_qpel8_v_lowpass_neon + sub r1, r1, r3, lsl #2 + mov lr, r4 + b put_h264_qpel8_v_lowpass_neon + .endfunc + +function put_h264_qpel16_v_lowpass_neon + mov r4, lr + bl put_h264_qpel8_v_lowpass_neon + sub r1, r1, r3, lsl #2 + bl put_h264_qpel8_v_lowpass_neon + sub r0, r0, r2, lsl #4 + add r0, r0, #8 + sub r1, r1, r3, lsl #4 + sub r1, r1, r3, lsl #2 + add r1, r1, #8 + bl put_h264_qpel8_v_lowpass_neon + sub r1, r1, r3, lsl #2 + mov lr, r4 + .endfunc + +function put_h264_qpel8_v_lowpass_neon + vld1.64 {d8}, [r1], r3 + vld1.64 {d10}, [r1], r3 + vld1.64 {d12}, [r1], r3 + vld1.64 {d14}, [r1], r3 + vld1.64 {d22}, [r1], r3 + vld1.64 {d24}, [r1], r3 + vld1.64 {d26}, [r1], r3 + vld1.64 {d28}, [r1], r3 + vld1.64 {d9}, [r1], r3 + vld1.64 {d11}, [r1], r3 + vld1.64 {d13}, [r1], r3 + vld1.64 {d15}, [r1], r3 + vld1.64 {d23}, [r1] + + transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 + lowpass_8 d8, d9, d10, d11, d8, d10 + lowpass_8 d12, d13, d14, d15, d12, d14 + lowpass_8 d22, d23, d24, d25, d22, d24 + lowpass_8 d26, d27, d28, d29, d26, d28 + transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28 + + vst1.64 {d8}, [r0,:64], r2 + vst1.64 {d10}, [r0,:64], r2 + vst1.64 {d12}, [r0,:64], r2 + vst1.64 {d14}, [r0,:64], r2 + vst1.64 {d22}, [r0,:64], r2 + vst1.64 {d24}, [r0,:64], r2 + vst1.64 {d26}, [r0,:64], r2 + vst1.64 {d28}, [r0,:64], r2 + + bx lr + .endfunc + +function put_h264_qpel16_v_lowpass_l2_neon + mov r4, lr + bl put_h264_qpel8_v_lowpass_l2_neon + sub r1, r1, r3, lsl #2 + bl put_h264_qpel8_v_lowpass_l2_neon + sub r0, r0, r3, lsl #4 + sub ip, ip, r2, lsl #4 + add r0, r0, #8 + add ip, ip, #8 + sub r1, r1, r3, lsl #4 + sub r1, r1, r3, lsl #2 + add r1, r1, #8 + bl put_h264_qpel8_v_lowpass_l2_neon + sub r1, r1, r3, lsl #2 + mov lr, r4 + .endfunc + +function put_h264_qpel8_v_lowpass_l2_neon + vld1.64 {d8}, [r1], r3 + vld1.64 {d10}, [r1], r3 + vld1.64 {d12}, [r1], r3 + vld1.64 {d14}, [r1], r3 + vld1.64 {d22}, [r1], r3 + vld1.64 {d24}, [r1], r3 + vld1.64 {d26}, [r1], r3 + vld1.64 
{d28}, [r1], r3 + vld1.64 {d9}, [r1], r3 + vld1.64 {d11}, [r1], r3 + vld1.64 {d13}, [r1], r3 + vld1.64 {d15}, [r1], r3 + vld1.64 {d23}, [r1] + + transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 + lowpass_8 d8, d9, d10, d11, d8, d9 + lowpass_8 d12, d13, d14, d15, d12, d13 + lowpass_8 d22, d23, d24, d25, d22, d23 + lowpass_8 d26, d27, d28, d29, d26, d27 + transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27 + + vld1.64 {d0}, [ip], r2 + vld1.64 {d1}, [ip], r2 + vld1.64 {d2}, [ip], r2 + vld1.64 {d3}, [ip], r2 + vld1.64 {d4}, [ip], r2 + vrhadd.u8 q0, q0, q4 + vld1.64 {d5}, [ip], r2 + vrhadd.u8 q1, q1, q6 + vld1.64 {d10}, [ip], r2 + vrhadd.u8 q2, q2, q11 + vld1.64 {d11}, [ip], r2 + + vst1.64 {d0}, [r0,:64], r3 + vst1.64 {d1}, [r0,:64], r3 + vrhadd.u8 q5, q5, q13 + vst1.64 {d2}, [r0,:64], r3 + vst1.64 {d3}, [r0,:64], r3 + vst1.64 {d4}, [r0,:64], r3 + vst1.64 {d5}, [r0,:64], r3 + vst1.64 {d10}, [r0,:64], r3 + vst1.64 {d11}, [r0,:64], r3 + + bx lr + .endfunc + +function put_h264_qpel8_hv_lowpass_neon_top + lowpass_const ip + mov ip, #12 +1: vld1.64 {d0, d1}, [r1], r3 + vld1.64 {d16,d17}, [r1], r3 + subs ip, ip, #2 + lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0 + vst1.64 {d22-d25}, [r4,:128]! + bne 1b + + vld1.64 {d0, d1}, [r1] + lowpass_8_1 d0, d1, q12, narrow=0 + + mov ip, #-16 + add r4, r4, ip + vld1.64 {d30,d31}, [r4,:128], ip + vld1.64 {d20,d21}, [r4,:128], ip + vld1.64 {d18,d19}, [r4,:128], ip + vld1.64 {d16,d17}, [r4,:128], ip + vld1.64 {d14,d15}, [r4,:128], ip + vld1.64 {d12,d13}, [r4,:128], ip + vld1.64 {d10,d11}, [r4,:128], ip + vld1.64 {d8, d9}, [r4,:128], ip + vld1.64 {d6, d7}, [r4,:128], ip + vld1.64 {d4, d5}, [r4,:128], ip + vld1.64 {d2, d3}, [r4,:128], ip + vld1.64 {d0, d1}, [r4,:128] + + swap4 d1, d3, d5, d7, d8, d10, d12, d14 + transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7 + + swap4 d17, d19, d21, d31, d24, d26, d28, d22 + transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11 + + vst1.64 {d30,d31}, [r4,:128]! + vst1.64 {d6, d7}, [r4,:128]! 
+ vst1.64 {d20,d21}, [r4,:128]! + vst1.64 {d4, d5}, [r4,:128]! + vst1.64 {d18,d19}, [r4,:128]! + vst1.64 {d2, d3}, [r4,:128]! + vst1.64 {d16,d17}, [r4,:128]! + vst1.64 {d0, d1}, [r4,:128] + + lowpass_8.16 q4, q12, d8, d9, d24, d25, d8 + lowpass_8.16 q5, q13, d10, d11, d26, d27, d9 + lowpass_8.16 q6, q14, d12, d13, d28, d29, d10 + lowpass_8.16 q7, q11, d14, d15, d22, d23, d11 + + vld1.64 {d16,d17}, [r4,:128], ip + vld1.64 {d30,d31}, [r4,:128], ip + lowpass_8.16 q8, q15, d16, d17, d30, d31, d12 + vld1.64 {d16,d17}, [r4,:128], ip + vld1.64 {d30,d31}, [r4,:128], ip + lowpass_8.16 q8, q15, d16, d17, d30, d31, d13 + vld1.64 {d16,d17}, [r4,:128], ip + vld1.64 {d30,d31}, [r4,:128], ip + lowpass_8.16 q8, q15, d16, d17, d30, d31, d14 + vld1.64 {d16,d17}, [r4,:128], ip + vld1.64 {d30,d31}, [r4,:128] + lowpass_8.16 q8, q15, d16, d17, d30, d31, d15 + + transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11 + + bx lr + .endfunc + +function put_h264_qpel8_hv_lowpass_neon + mov r10, lr + bl put_h264_qpel8_hv_lowpass_neon_top + vst1.64 {d12}, [r0,:64], r2 + vst1.64 {d13}, [r0,:64], r2 + vst1.64 {d14}, [r0,:64], r2 + vst1.64 {d15}, [r0,:64], r2 + vst1.64 {d8}, [r0,:64], r2 + vst1.64 {d9}, [r0,:64], r2 + vst1.64 {d10}, [r0,:64], r2 + vst1.64 {d11}, [r0,:64], r2 + + mov lr, r10 + bx lr + .endfunc + +function put_h264_qpel8_hv_lowpass_l2_neon + mov r10, lr + bl put_h264_qpel8_hv_lowpass_neon_top + + vld1.64 {d0, d1}, [r2,:128]! + vld1.64 {d2, d3}, [r2,:128]! + vrhadd.u8 q0, q0, q6 + vld1.64 {d4, d5}, [r2,:128]! + vrhadd.u8 q1, q1, q7 + vld1.64 {d6, d7}, [r2,:128]! 
+ vrhadd.u8 q2, q2, q4 + + vst1.64 {d0}, [r0,:64], r3 + vrhadd.u8 q3, q3, q5 + vst1.64 {d1}, [r0,:64], r3 + vst1.64 {d2}, [r0,:64], r3 + vst1.64 {d3}, [r0,:64], r3 + vst1.64 {d4}, [r0,:64], r3 + vst1.64 {d5}, [r0,:64], r3 + vst1.64 {d6}, [r0,:64], r3 + vst1.64 {d7}, [r0,:64], r3 + + mov lr, r10 + bx lr + .endfunc + +function put_h264_qpel16_hv_lowpass_neon + mov r9, lr + bl put_h264_qpel8_hv_lowpass_neon + sub r1, r1, r3, lsl #2 + bl put_h264_qpel8_hv_lowpass_neon + sub r1, r1, r3, lsl #4 + sub r1, r1, r3, lsl #2 + add r1, r1, #8 + sub r0, r0, r2, lsl #4 + add r0, r0, #8 + bl put_h264_qpel8_hv_lowpass_neon + sub r1, r1, r3, lsl #2 + mov lr, r9 + b put_h264_qpel8_hv_lowpass_neon + .endfunc + +function put_h264_qpel16_hv_lowpass_l2_neon + mov r9, lr + sub r2, r4, #256 + bl put_h264_qpel8_hv_lowpass_l2_neon + sub r1, r1, r3, lsl #2 + bl put_h264_qpel8_hv_lowpass_l2_neon + sub r1, r1, r3, lsl #4 + sub r1, r1, r3, lsl #2 + add r1, r1, #8 + sub r0, r0, r3, lsl #4 + add r0, r0, #8 + bl put_h264_qpel8_hv_lowpass_l2_neon + sub r1, r1, r3, lsl #2 + mov lr, r9 + b put_h264_qpel8_hv_lowpass_l2_neon + .endfunc + +function ff_put_h264_qpel8_mc10_neon, export=1 + lowpass_const r3 + mov r3, r1 + sub r1, r1, #2 + mov ip, #8 + b put_h264_qpel8_h_lowpass_l2_neon + .endfunc + +function ff_put_h264_qpel8_mc20_neon, export=1 + lowpass_const r3 + sub r1, r1, #2 + mov r3, r2 + mov ip, #8 + b put_h264_qpel8_h_lowpass_neon + .endfunc + +function ff_put_h264_qpel8_mc30_neon, export=1 + lowpass_const r3 + add r3, r1, #1 + sub r1, r1, #2 + mov ip, #8 + b put_h264_qpel8_h_lowpass_l2_neon + .endfunc + +function ff_put_h264_qpel8_mc01_neon, export=1 + push {lr} + mov ip, r1 +put_h264_qpel8_mc01: + lowpass_const r3 + mov r3, r2 + sub r1, r1, r2, lsl #1 + vpush {d8-d15} + bl put_h264_qpel8_v_lowpass_l2_neon + vpop {d8-d15} + pop {pc} + .endfunc + +function ff_put_h264_qpel8_mc11_neon, export=1 + push {r0, r1, r2, lr} +put_h264_qpel8_mc11: + lowpass_const r3 + sub sp, sp, #64 + mov r0, sp + sub r1, 
r1, #2 + mov r3, #8 + mov ip, #8 + vpush {d8-d15} + bl put_h264_qpel8_h_lowpass_neon + ldrd r0, [sp, #128] + mov r3, r2 + add ip, sp, #64 + sub r1, r1, r2, lsl #1 + mov r2, #8 + bl put_h264_qpel8_v_lowpass_l2_neon + vpop {d8-d15} + add sp, sp, #76 + pop {pc} + .endfunc + +function ff_put_h264_qpel8_mc21_neon, export=1 + push {r0, r1, r4, r10, r11, lr} +put_h264_qpel8_mc21: + lowpass_const r3 + mov r11, sp + bic sp, sp, #15 + sub sp, sp, #(8*8+16*12) + sub r1, r1, #2 + mov r3, #8 + mov r0, sp + mov ip, #8 + vpush {d8-d15} + bl put_h264_qpel8_h_lowpass_neon + mov r4, r0 + ldrd r0, [r11] + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + mov r3, r2 + sub r2, r4, #64 + bl put_h264_qpel8_hv_lowpass_l2_neon + vpop {d8-d15} + add sp, r11, #8 + pop {r4, r10, r11, pc} + .endfunc + +function ff_put_h264_qpel8_mc31_neon, export=1 + add r1, r1, #1 + push {r0, r1, r2, lr} + sub r1, r1, #1 + b put_h264_qpel8_mc11 + .endfunc + +function ff_put_h264_qpel8_mc02_neon, export=1 + push {lr} + lowpass_const r3 + sub r1, r1, r2, lsl #1 + mov r3, r2 + vpush {d8-d15} + bl put_h264_qpel8_v_lowpass_neon + vpop {d8-d15} + pop {pc} + .endfunc + +function ff_put_h264_qpel8_mc12_neon, export=1 + push {r0, r1, r4, r10, r11, lr} +put_h264_qpel8_mc12: + lowpass_const r3 + mov r11, sp + bic sp, sp, #15 + sub sp, sp, #(8*8+16*12) + sub r1, r1, r2, lsl #1 + mov r3, r2 + mov r2, #8 + mov r0, sp + vpush {d8-d15} + bl put_h264_qpel8_v_lowpass_neon + mov r4, r0 + ldrd r0, [r11] + sub r1, r1, r3, lsl #1 + sub r1, r1, #2 + sub r2, r4, #64 + bl put_h264_qpel8_hv_lowpass_l2_neon + vpop {d8-d15} + add sp, r11, #8 + pop {r4, r10, r11, pc} + .endfunc + +function ff_put_h264_qpel8_mc22_neon, export=1 + push {r4, r10, r11, lr} + mov r11, sp + bic sp, sp, #15 + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + mov r3, r2 + sub sp, sp, #(16*12) + mov r4, sp + vpush {d8-d15} + bl put_h264_qpel8_hv_lowpass_neon + vpop {d8-d15} + mov sp, r11 + pop {r4, r10, r11, pc} + .endfunc + +function ff_put_h264_qpel8_mc32_neon, export=1 + push 
{r0, r1, r4, r10, r11, lr} + add r1, r1, #1 + b put_h264_qpel8_mc12 + .endfunc + +function ff_put_h264_qpel8_mc03_neon, export=1 + push {lr} + add ip, r1, r2 + b put_h264_qpel8_mc01 + .endfunc + +function ff_put_h264_qpel8_mc13_neon, export=1 + push {r0, r1, r2, lr} + add r1, r1, r2 + b put_h264_qpel8_mc11 + .endfunc + +function ff_put_h264_qpel8_mc23_neon, export=1 + push {r0, r1, r4, r10, r11, lr} + add r1, r1, r2 + b put_h264_qpel8_mc21 + .endfunc + +function ff_put_h264_qpel8_mc33_neon, export=1 + add r1, r1, #1 + push {r0, r1, r2, lr} + add r1, r1, r2 + sub r1, r1, #1 + b put_h264_qpel8_mc11 + .endfunc + +function ff_put_h264_qpel16_mc10_neon, export=1 + lowpass_const r3 + mov r3, r1 + sub r1, r1, #2 + b put_h264_qpel16_h_lowpass_l2_neon + .endfunc + +function ff_put_h264_qpel16_mc20_neon, export=1 + lowpass_const r3 + sub r1, r1, #2 + mov r3, r2 + b put_h264_qpel16_h_lowpass_neon + .endfunc + +function ff_put_h264_qpel16_mc30_neon, export=1 + lowpass_const r3 + add r3, r1, #1 + sub r1, r1, #2 + b put_h264_qpel16_h_lowpass_l2_neon + .endfunc + +function ff_put_h264_qpel16_mc01_neon, export=1 + push {r4, lr} + mov ip, r1 +put_h264_qpel16_mc01: + lowpass_const r3 + mov r3, r2 + sub r1, r1, r2, lsl #1 + vpush {d8-d15} + bl put_h264_qpel16_v_lowpass_l2_neon + vpop {d8-d15} + pop {r4, pc} + .endfunc + +function ff_put_h264_qpel16_mc11_neon, export=1 + push {r0, r1, r4, lr} +put_h264_qpel16_mc11: + lowpass_const r3 + sub sp, sp, #256 + mov r0, sp + sub r1, r1, #2 + mov r3, #16 + vpush {d8-d15} + bl put_h264_qpel16_h_lowpass_neon + add r0, sp, #256 + ldrd r0, [r0, #64] + mov r3, r2 + add ip, sp, #64 + sub r1, r1, r2, lsl #1 + mov r2, #16 + bl put_h264_qpel16_v_lowpass_l2_neon + vpop {d8-d15} + add sp, sp, #(256+8) + pop {r4, pc} + .endfunc + +function ff_put_h264_qpel16_mc21_neon, export=1 + push {r0, r1, r4-r5, r9-r11, lr} +put_h264_qpel16_mc21: + lowpass_const r3 + mov r11, sp + bic sp, sp, #15 + sub sp, sp, #(16*16+16*12) + sub r1, r1, #2 + mov r0, sp + vpush 
{d8-d15} + bl put_h264_qpel16_h_lowpass_neon_packed + mov r4, r0 + ldrd r0, [r11] + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + mov r3, r2 + bl put_h264_qpel16_hv_lowpass_l2_neon + vpop {d8-d15} + add sp, r11, #8 + pop {r4-r5, r9-r11, pc} + .endfunc + +function ff_put_h264_qpel16_mc31_neon, export=1 + add r1, r1, #1 + push {r0, r1, r4, lr} + sub r1, r1, #1 + b put_h264_qpel16_mc11 + .endfunc + +function ff_put_h264_qpel16_mc02_neon, export=1 + push {r4, lr} + lowpass_const r3 + sub r1, r1, r2, lsl #1 + mov r3, r2 + vpush {d8-d15} + bl put_h264_qpel16_v_lowpass_neon + vpop {d8-d15} + pop {r4, pc} + .endfunc + +function ff_put_h264_qpel16_mc12_neon, export=1 + push {r0, r1, r4-r5, r9-r11, lr} +put_h264_qpel16_mc12: + lowpass_const r3 + mov r11, sp + bic sp, sp, #15 + sub sp, sp, #(16*16+16*12) + sub r1, r1, r2, lsl #1 + mov r0, sp + mov r3, r2 + vpush {d8-d15} + bl put_h264_qpel16_v_lowpass_neon_packed + mov r4, r0 + ldrd r0, [r11] + sub r1, r1, r3, lsl #1 + sub r1, r1, #2 + mov r2, r3 + bl put_h264_qpel16_hv_lowpass_l2_neon + vpop {d8-d15} + add sp, r11, #8 + pop {r4-r5, r9-r11, pc} + .endfunc + +function ff_put_h264_qpel16_mc22_neon, export=1 + push {r4, r9-r11, lr} + lowpass_const r3 + mov r11, sp + bic sp, sp, #15 + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + mov r3, r2 + sub sp, sp, #(16*12) + mov r4, sp + vpush {d8-d15} + bl put_h264_qpel16_hv_lowpass_neon + vpop {d8-d15} + mov sp, r11 + pop {r4, r9-r11, pc} + .endfunc + +function ff_put_h264_qpel16_mc32_neon, export=1 + push {r0, r1, r4-r5, r9-r11, lr} + add r1, r1, #1 + b put_h264_qpel16_mc12 + .endfunc + +function ff_put_h264_qpel16_mc03_neon, export=1 + push {r4, lr} + add ip, r1, r2 + b put_h264_qpel16_mc01 + .endfunc + +function ff_put_h264_qpel16_mc13_neon, export=1 + push {r0, r1, r4, lr} + add r1, r1, r2 + b put_h264_qpel16_mc11 + .endfunc + +function ff_put_h264_qpel16_mc23_neon, export=1 + push {r0, r1, r4-r5, r9-r11, lr} + add r1, r1, r2 + b put_h264_qpel16_mc21 + .endfunc + +function 
ff_put_h264_qpel16_mc33_neon, export=1 + add r1, r1, #1 + push {r0, r1, r4, lr} + add r1, r1, r2 + sub r1, r1, #1 + b put_h264_qpel16_mc11 + .endfunc diff -r c30b92cf446b -r 9281a8a9387a arm/h264idct_neon.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arm/h264idct_neon.S Wed Dec 17 00:54:54 2008 +0000 @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + + .fpu neon + + .text + +function ff_h264_idct_add_neon, export=1 + mov r3, #(1<<5) + vmov.i16 d16, #0 + vmov.16 d16[0], r3 + vld1.64 {d0-d3}, [r1,:128] + vadd.i16 d0, d0, d16 + + vswp d1, d2 + vadd.i16 d4, d0, d1 + vshr.s16 q8, q1, #1 + vsub.i16 d5, d0, d1 + vadd.i16 d6, d2, d17 + vsub.i16 d7, d16, d3 + vadd.i16 q0, q2, q3 + vsub.i16 q1, q2, q3 + + vtrn.16 d0, d1 + vtrn.16 d3, d2 + vtrn.32 d0, d3 + vtrn.32 d1, d2 + + vadd.i16 d4, d0, d3 + vld1.32 {d18[0]}, [r0,:32], r2 + vswp d1, d3 + vshr.s16 q8, q1, #1 + vld1.32 {d19[1]}, [r0,:32], r2 + vsub.i16 d5, d0, d1 + vld1.32 {d18[1]}, [r0,:32], r2 + vadd.i16 d6, d16, d3 + vld1.32 {d19[0]}, [r0,:32], r2 + vsub.i16 d7, d2, d17 + sub r0, r0, r2, lsl #2 + vadd.i16 q0, q2, q3 + vsub.i16 q1, q2, q3 + + vshr.s16 q0, q0, #6 + vshr.s16 q1, q1, #6 + + vaddw.u8 q0, q0, d18 + vaddw.u8 
q1, q1, d19 + + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + + vst1.32 {d0[0]}, [r0,:32], r2 + vst1.32 {d1[1]}, [r0,:32], r2 + vst1.32 {d0[1]}, [r0,:32], r2 + vst1.32 {d1[0]}, [r0,:32], r2 + + bx lr + .endfunc + +function ff_h264_idct_dc_add_neon, export=1 + vld1.16 {d2[],d3[]}, [r1,:16] + vrshr.s16 q1, q1, #6 + vld1.32 {d0[0]}, [r0,:32], r2 + vld1.32 {d0[1]}, [r0,:32], r2 + vaddw.u8 q2, q1, d0 + vld1.32 {d1[0]}, [r0,:32], r2 + vld1.32 {d1[1]}, [r0,:32], r2 + vaddw.u8 q1, q1, d1 + vqmovun.s16 d0, q2 + vqmovun.s16 d1, q1 + sub r0, r0, r2, lsl #2 + vst1.32 {d0[0]}, [r0,:32], r2 + vst1.32 {d0[1]}, [r0,:32], r2 + vst1.32 {d1[0]}, [r0,:32], r2 + vst1.32 {d1[1]}, [r0,:32], r2 + bx lr + .endfunc diff -r c30b92cf446b -r 9281a8a9387a arm/jrevdct_arm.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arm/jrevdct_arm.S Wed Dec 17 00:54:54 2008 +0000 @@ -0,0 +1,388 @@ +/* + C-like prototype : + void j_rev_dct_ARM(DCTBLOCK data) + + With DCTBLOCK being a pointer to an array of 64 'signed shorts' + + Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org) + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +*/ + +#include "asm.S" + +#define FIX_0_298631336 2446 +#define FIX_0_541196100 4433 +#define FIX_0_765366865 6270 +#define FIX_1_175875602 9633 +#define FIX_1_501321110 12299 +#define FIX_2_053119869 16819 +#define FIX_3_072711026 25172 +#define FIX_M_0_390180644 -3196 +#define FIX_M_0_899976223 -7373 +#define FIX_M_1_847759065 -15137 +#define FIX_M_1_961570560 -16069 +#define FIX_M_2_562915447 -20995 +#define FIX_0xFFFF 0xFFFF + +#define FIX_0_298631336_ID 0 +#define FIX_0_541196100_ID 4 +#define FIX_0_765366865_ID 8 +#define FIX_1_175875602_ID 12 +#define FIX_1_501321110_ID 16 +#define FIX_2_053119869_ID 20 +#define FIX_3_072711026_ID 24 +#define FIX_M_0_390180644_ID 28 +#define FIX_M_0_899976223_ID 32 +#define FIX_M_1_847759065_ID 36 +#define FIX_M_1_961570560_ID 40 +#define FIX_M_2_562915447_ID 44 +#define FIX_0xFFFF_ID 48 + .text + .align + +function j_rev_dct_ARM, export=1 + stmdb sp!, { r4 - r12, lr } @ all callee saved regs + + sub sp, sp, #4 @ reserve some space on the stack + str r0, [ sp ] @ save the DCT pointer to the stack + + mov lr, r0 @ lr = pointer to the current row + mov r12, #8 @ r12 = row-counter + add r11, pc, #(const_array-.-8) @ r11 = base pointer to the constants array +row_loop: + ldrsh r0, [lr, # 0] @ r0 = 'd0' + ldrsh r2, [lr, # 2] @ r2 = 'd2' + + @ Optimization for row that have all items except the first set to 0 + @ (this works as the DCTELEMS are always 4-byte aligned) + ldr r5, [lr, # 0] + ldr r6, [lr, # 4] + ldr r3, [lr, # 8] + ldr r4, [lr, #12] + orr r3, r3, r4 + orr r3, r3, r6 + orrs r5, r3, r5 + beq end_of_row_loop @ nothing to be done as ALL of them are '0' + orrs r3, r3, r2 + beq empty_row + + ldrsh r1, [lr, # 8] @ r1 = 'd1' + ldrsh r4, [lr, # 4] @ r4 = 'd4' + ldrsh r6, 
[lr, # 6] @ r6 = 'd6' + + ldr r3, [r11, #FIX_0_541196100_ID] + add r7, r2, r6 + ldr r5, [r11, #FIX_M_1_847759065_ID] + mul r7, r3, r7 @ r7 = z1 + ldr r3, [r11, #FIX_0_765366865_ID] + mla r6, r5, r6, r7 @ r6 = tmp2 + add r5, r0, r4 @ r5 = tmp0 + mla r2, r3, r2, r7 @ r2 = tmp3 + sub r3, r0, r4 @ r3 = tmp1 + + add r0, r2, r5, lsl #13 @ r0 = tmp10 + rsb r2, r2, r5, lsl #13 @ r2 = tmp13 + add r4, r6, r3, lsl #13 @ r4 = tmp11 + rsb r3, r6, r3, lsl #13 @ r3 = tmp12 + + stmdb sp!, { r0, r2, r3, r4 } @ save on the stack tmp10, tmp13, tmp12, tmp11 + + ldrsh r3, [lr, #10] @ r3 = 'd3' + ldrsh r5, [lr, #12] @ r5 = 'd5' + ldrsh r7, [lr, #14] @ r7 = 'd7' + + add r0, r3, r5 @ r0 = 'z2' + add r2, r1, r7 @ r2 = 'z1' + add r4, r3, r7 @ r4 = 'z3' + add r6, r1, r5 @ r6 = 'z4' + ldr r9, [r11, #FIX_1_175875602_ID] + add r8, r4, r6 @ r8 = z3 + z4 + ldr r10, [r11, #FIX_M_0_899976223_ID] + mul r8, r9, r8 @ r8 = 'z5' + ldr r9, [r11, #FIX_M_2_562915447_ID] + mul r2, r10, r2 @ r2 = 'z1' + ldr r10, [r11, #FIX_M_1_961570560_ID] + mul r0, r9, r0 @ r0 = 'z2' + ldr r9, [r11, #FIX_M_0_390180644_ID] + mla r4, r10, r4, r8 @ r4 = 'z3' + ldr r10, [r11, #FIX_0_298631336_ID] + mla r6, r9, r6, r8 @ r6 = 'z4' + ldr r9, [r11, #FIX_2_053119869_ID] + mla r7, r10, r7, r2 @ r7 = tmp0 + z1 + ldr r10, [r11, #FIX_3_072711026_ID] + mla r5, r9, r5, r0 @ r5 = tmp1 + z2 + ldr r9, [r11, #FIX_1_501321110_ID] + mla r3, r10, r3, r0 @ r3 = tmp2 + z2 + add r7, r7, r4 @ r7 = tmp0 + mla r1, r9, r1, r2 @ r1 = tmp3 + z1 + add r5, r5, r6 @ r5 = tmp1 + add r3, r3, r4 @ r3 = tmp2 + add r1, r1, r6 @ r1 = tmp3 + + ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11 + @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0 + + @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) + add r8, r0, r1 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 0] + + @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) + sub r8, r0, r1 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, #14] + + @ 
Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) + add r8, r6, r3 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 2] + + @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) + sub r8, r6, r3 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, #12] + + @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) + add r8, r4, r5 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 4] + + @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) + sub r8, r4, r5 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, #10] + + @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) + add r8, r2, r7 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 6] + + @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) + sub r8, r2, r7 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 8] + + @ End of row loop + add lr, lr, #16 + subs r12, r12, #1 + bne row_loop + beq start_column_loop + +empty_row: + ldr r1, [r11, #FIX_0xFFFF_ID] + mov r0, r0, lsl #2 + and r0, r0, r1 + add r0, r0, r0, lsl #16 + str r0, [lr, # 0] + str r0, [lr, # 4] + str r0, [lr, # 8] + str r0, [lr, #12] + +end_of_row_loop: + @ End of loop + add lr, lr, #16 + subs r12, r12, #1 + bne row_loop + +start_column_loop: + @ Start of column loop + ldr lr, [ sp ] + mov r12, #8 +column_loop: + ldrsh r0, [lr, #( 0*8)] @ r0 = 'd0' + ldrsh r2, [lr, #( 4*8)] @ r2 = 'd2' + ldrsh r4, [lr, #( 8*8)] @ r4 = 'd4' + ldrsh r6, [lr, #(12*8)] @ r6 = 'd6' + + ldr r3, [r11, #FIX_0_541196100_ID] + add r1, r2, r6 + ldr r5, [r11, #FIX_M_1_847759065_ID] + mul r1, r3, r1 @ r1 = z1 + ldr r3, [r11, #FIX_0_765366865_ID] + mla r6, r5, r6, r1 @ r6 = tmp2 + add r5, r0, r4 @ r5 = tmp0 + mla r2, r3, r2, r1 @ r2 = tmp3 + sub r3, r0, r4 @ r3 = tmp1 + + add r0, r2, r5, lsl #13 @ r0 = tmp10 + rsb r2, r2, r5, lsl #13 @ r2 = tmp13 + add r4, r6, r3, lsl #13 @ r4 = tmp11 + rsb r6, r6, r3, lsl #13 @ r6 = tmp12 + + ldrsh r1, [lr, #( 2*8)] @ r1 = 'd1' + ldrsh r3, [lr, #( 6*8)] @ r3 = 'd3' + ldrsh r5, 
[lr, #(10*8)] @ r5 = 'd5' + ldrsh r7, [lr, #(14*8)] @ r7 = 'd7' + + @ Check for empty odd column (happens about 20 to 25 % of the time according to my stats) + orr r9, r1, r3 + orr r10, r5, r7 + orrs r10, r9, r10 + beq empty_odd_column + + stmdb sp!, { r0, r2, r4, r6 } @ save on the stack tmp10, tmp13, tmp12, tmp11 + + add r0, r3, r5 @ r0 = 'z2' + add r2, r1, r7 @ r2 = 'z1' + add r4, r3, r7 @ r4 = 'z3' + add r6, r1, r5 @ r6 = 'z4' + ldr r9, [r11, #FIX_1_175875602_ID] + add r8, r4, r6 + ldr r10, [r11, #FIX_M_0_899976223_ID] + mul r8, r9, r8 @ r8 = 'z5' + ldr r9, [r11, #FIX_M_2_562915447_ID] + mul r2, r10, r2 @ r2 = 'z1' + ldr r10, [r11, #FIX_M_1_961570560_ID] + mul r0, r9, r0 @ r0 = 'z2' + ldr r9, [r11, #FIX_M_0_390180644_ID] + mla r4, r10, r4, r8 @ r4 = 'z3' + ldr r10, [r11, #FIX_0_298631336_ID] + mla r6, r9, r6, r8 @ r6 = 'z4' + ldr r9, [r11, #FIX_2_053119869_ID] + mla r7, r10, r7, r2 @ r7 = tmp0 + z1 + ldr r10, [r11, #FIX_3_072711026_ID] + mla r5, r9, r5, r0 @ r5 = tmp1 + z2 + ldr r9, [r11, #FIX_1_501321110_ID] + mla r3, r10, r3, r0 @ r3 = tmp2 + z2 + add r7, r7, r4 @ r7 = tmp0 + mla r1, r9, r1, r2 @ r1 = tmp3 + z1 + add r5, r5, r6 @ r5 = tmp1 + add r3, r3, r4 @ r3 = tmp2 + add r1, r1, r6 @ r1 = tmp3 + + ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12 + @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0 + + @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) + add r8, r0, r1 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 0*8)] + + @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) + sub r8, r0, r1 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #(14*8)] + + @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) + add r8, r4, r3 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 2*8)] + + @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) + sub r8, r4, r3 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #(12*8)] + + @ Compute DESCALE(tmp12 + tmp1, 
CONST_BITS+PASS1_BITS+3) + add r8, r6, r5 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 4*8)] + + @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) + sub r8, r6, r5 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #(10*8)] + + @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) + add r8, r2, r7 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 6*8)] + + @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) + sub r8, r2, r7 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 8*8)] + + @ End of row loop + add lr, lr, #2 + subs r12, r12, #1 + bne column_loop + beq the_end + +empty_odd_column: + @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) + @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) + add r0, r0, #(1<<17) + mov r0, r0, asr #18 + strh r0, [lr, #( 0*8)] + strh r0, [lr, #(14*8)] + + @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) + @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) + add r4, r4, #(1<<17) + mov r4, r4, asr #18 + strh r4, [lr, #( 2*8)] + strh r4, [lr, #(12*8)] + + @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) + @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) + add r6, r6, #(1<<17) + mov r6, r6, asr #18 + strh r6, [lr, #( 4*8)] + strh r6, [lr, #(10*8)] + + @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) + @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) + add r2, r2, #(1<<17) + mov r2, r2, asr #18 + strh r2, [lr, #( 6*8)] + strh r2, [lr, #( 8*8)] + + @ End of row loop + add lr, lr, #2 + subs r12, r12, #1 + bne column_loop + +the_end: + @ The end.... 
+ add sp, sp, #4 + ldmia sp!, { r4 - r12, pc } @ restore callee saved regs and return + +const_array: + .align + .word FIX_0_298631336 + .word FIX_0_541196100 + .word FIX_0_765366865 + .word FIX_1_175875602 + .word FIX_1_501321110 + .word FIX_2_053119869 + .word FIX_3_072711026 + .word FIX_M_0_390180644 + .word FIX_M_0_899976223 + .word FIX_M_1_847759065 + .word FIX_M_1_961570560 + .word FIX_M_2_562915447 + .word FIX_0xFFFF diff -r c30b92cf446b -r 9281a8a9387a arm/mathops.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arm/mathops.h Wed Dec 17 00:54:54 2008 +0000 @@ -0,0 +1,93 @@ +/* + * simple math operations + * Copyright (c) 2006 Michael Niedermayer et al + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_MATHOPS_H
+#define AVCODEC_ARM_MATHOPS_H
+
+#include <stdint.h>
+#include "libavutil/common.h"
+
+#   define MULL MULL /* low 32 bits of (a * b) >> shift; shift must be a compile-time constant */
+static inline av_const int MULL(int a, int b, unsigned shift)
+{
+    int lo, hi;
+    __asm__("smull %0, %1, %2, %3     \n\t"
+            "mov   %0, %0,     lsr %4 \n\t"
+            "add   %1, %0, %1, lsl %5 \n\t"
+            : "=&r"(lo), "=&r"(hi)
+            : "r"(b), "r"(a), "i"(shift), "i"(32-shift));
+    return hi;
+}
+
+#define MULH MULH /* high 32 bits of the signed 64-bit product a * b */
+#ifdef HAVE_ARMV6
+static inline av_const int MULH(int a, int b)
+{
+    int r;
+    __asm__ ("smmul %0, %1, %2" : "=r"(r) : "r"(a), "r"(b));
+    return r;
+}
+#else
+static inline av_const int MULH(int a, int b)
+{
+    int lo, hi;
+    __asm__ ("smull %0, %1, %2, %3" : "=&r"(lo), "=&r"(hi) : "r"(b), "r"(a));
+    return hi;
+}
+#endif
+
+static inline av_const int64_t MUL64(int a, int b)
+{
+    union { uint64_t x; unsigned hl[2]; } x; /* NOTE(review): hl[0] receives the low word, which presumes little-endian word order */
+    __asm__ ("smull %0, %1, %2, %3" /* outputs are early-clobber, as in MULL/MULH, so they never share a register with a or b */
+             : "=&r"(x.hl[0]), "=&r"(x.hl[1]) : "r"(a), "r"(b));
+    return x.x;
+}
+#define MUL64 MUL64 /* full signed 64-bit product a * b */
+
+static inline av_const int64_t MAC64(int64_t d, int a, int b)
+{
+    union { uint64_t x; unsigned hl[2]; } x = { d };
+    __asm__ ("smlal %0, %1, %2, %3"
+             : "+r"(x.hl[0]), "+r"(x.hl[1]) : "r"(a), "r"(b));
+    return x.x;
+}
+#define MAC64(d, a, b) ((d) = MAC64(d, a, b)) /* d += a * b (64-bit accumulate); d must be an lvalue */
+#define MLS64(d, a, b) MAC64(d, -(a), b)      /* d -= a * b */
+
+#if defined(HAVE_ARMV5TE)
+
+/* signed 16x16 -> 32 multiply add accumulate; expands to one statement, the caller supplies the ';' */
+#   define MAC16(rt, ra, rb)                                            \
+    __asm__ ("smlabb %0, %1, %2, %0" : "+r"(rt) : "r"(ra), "r"(rb))
+
+/* signed 16x16 -> 32 multiply */
+#   define MUL16 MUL16
+static inline av_const int MUL16(int ra, int rb)
+{
+    int rt;
+    __asm__ ("smulbb %0, %1, %2" : "=r"(rt) : "r"(ra), "r"(rb));
+    return rt;
+}
+
+#endif
+
+#endif /* AVCODEC_ARM_MATHOPS_H */
diff -r c30b92cf446b -r 9281a8a9387a arm/mpegvideo_arm.c
--- /dev/null Thu
Jan 01 00:00:00 1970 +0000
+++ b/arm/mpegvideo_arm.c Wed Dec 17 00:54:54 2008 +0000
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2002 Michael Niedermayer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/mpegvideo.h"
+
+void MPV_common_init_iwmmxt(MpegEncContext *s);
+void MPV_common_init_armv5te(MpegEncContext *s);
+
+void MPV_common_init_arm(MpegEncContext *s)
+{
+    /* Order matters: the ARMv5TE routines are installed first and the
+     * IWMMXT ones second. IWMMXT is a superset of ARMv5TE, so where a
+     * better IWMMXT routine exists it takes the place of the ARMv5TE one.
+     */
+#if defined(HAVE_ARMV5TE)
+    MPV_common_init_armv5te(s);
+#endif
+#if defined(HAVE_IWMMXT)
+    MPV_common_init_iwmmxt(s);
+#endif
+}
diff -r c30b92cf446b -r 9281a8a9387a arm/mpegvideo_armv5te.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/arm/mpegvideo_armv5te.c Wed Dec 17 00:54:54 2008 +0000
@@ -0,0 +1,100 @@
+/*
+ * Optimization of some functions from mpegvideo.c for armv5te
+ * Copyright (c) 2007 Siarhei Siamashka
+ *
+ * This file is part of FFmpeg.
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/avcodec.h" +#include "libavcodec/dsputil.h" +#include "libavcodec/mpegvideo.h" + +void ff_dct_unquantize_h263_armv5te(DCTELEM *block, int qmul, int qadd, int count); + +#ifdef ENABLE_ARM_TESTS +/** + * h263 dequantizer supplementary function, it is performance critical and needs to + * have optimized implementations for each architecture. 
Is also used as a reference
+ * implementation in regression tests
+ */
+static inline void dct_unquantize_h263_helper_c(DCTELEM *block, int qmul, int qadd, int count)
+{
+    int idx = 0;
+
+    /* zero coefficients are left alone; every other one is scaled by
+     * qmul and then moved qadd further away from zero */
+    while (idx < count) {
+        DCTELEM *const coef = &block[idx++];
+        const int level = *coef;
+
+        if (level)
+            *coef = level < 0 ? level * qmul - qadd
+                              : level * qmul + qadd;
+    }
+}
+#endif
+
+static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s,
+                                  DCTELEM *block, int n, int qscale)
+{
+    const int qmul = qscale << 1;
+    int dc, qadd, last;
+
+    assert(s->block_last_index[n]>=0);
+
+    /* DC is worked out here and stored last; with h263_aic it is kept as is */
+    if (s->h263_aic) {
+        dc   = block[0];
+        qadd = 0;
+    } else {
+        qadd = (qscale - 1) | 1;
+        if (n < 4)
+            dc = block[0] * s->y_dc_scale;
+        else
+            dc = block[0] * s->c_dc_scale;
+    }
+
+    /* all 64 coefficients when ac_pred is set, else up to the raster_end[] entry */
+    last = s->ac_pred ? 63
+                      : s->inter_scantable.raster_end[ s->block_last_index[n] ];
+
+    /* block[0] lies inside the range given to the routine: write DC afterwards */
+    ff_dct_unquantize_h263_armv5te(block, qmul, qadd, last + 1);
+    block[0] = dc;
+}
+
+static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s,
+                                  DCTELEM *block, int n, int qscale)
+{
+    const int qmul = qscale << 1;
+    const int qadd = (qscale - 1) | 1;      /* the "| 1" makes it odd */
+    int count;
+
+    assert(s->block_last_index[n]>=0);
+
+    /* raster_end[] entry of the block's last index, plus one, is the
+     * number of coefficients handed to the assembly routine */
+    count = s->inter_scantable.raster_end[ s->block_last_index[n] ] + 1;
+
+    ff_dct_unquantize_h263_armv5te(block, qmul, qadd, count);
+}
+
+void MPV_common_init_armv5te(MpegEncContext *s)
+{
+    s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_armv5te;
+    s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_armv5te;
+}
diff -r c30b92cf446b -r 9281a8a9387a arm/mpegvideo_armv5te_s.S
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/arm/mpegvideo_armv5te_s.S Wed Dec 17 00:54:54 2008 +0000
@@ -0,0 +1,117 @@
+/*
+ * Optimization of some functions from mpegvideo.c for armv5te
+ * Copyright (c) 2007 Siarhei Siamashka
+ *
+ * This file is part of FFmpeg.
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "asm.S" + +/* + * Special optimized version of dct_unquantize_h263_helper_c, it + * requires the block to be at least 8 bytes aligned, and may process + * more elements than requested. But it is guaranteed to never + * process more than 64 elements provided that count argument is <= 64, + * so it is safe. This function is optimized for a common distribution + * of values for nCoeffs (they are mostly multiple of 8 plus one or + * two extra elements). So this function processes data as 8 elements + * per loop iteration and contains optional 2 elements processing in + * the end. 
+ * Arguments: r0 = block, r1 = qmul, r2 = qadd, r3 = count.
+ * Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770)
+ */
+function ff_dct_unquantize_h263_armv5te, export=1
+        push            {r4-r9,lr}
+        mov             ip, #0                  /* ip = 0, only subtracted below so that rsbs sets the flags */
+        subs            r3, r3, #2              /* r3 = count - 2 */
+        ble             2f                      /* count <= 2: only the two-element tail runs */
+        ldrd            r4, [r0, #0]            /* r4, r5 = first four elements, two halfwords per register */
+1:
+        ldrd            r6, [r0, #8]            /* r6, r7 = next four elements */
+
+        rsbs            r9, ip, r4, asr #16     /* r9 = top halfword of r4, flags follow its sign */
+        addgt           r9, r2, #0              /* positive: r9 = qadd */
+        rsblt           r9, r2, #0              /* negative: r9 = -qadd */
+        smlatbne        r9, r4, r1, r9          /* non-zero: r9 = element * qmul +/- qadd (a zero stays 0) */
+
+        rsbs            lr, ip, r5, asr #16     /* same for the top halfword of r5 */
+        addgt           lr, r2, #0
+        rsblt           lr, r2, #0
+        smlatbne        lr, r5, r1, lr
+
+        rsbs            r8, ip, r4, asl #16     /* flags follow the sign of the bottom halfword of r4 */
+        addgt           r8, r2, #0              /* positive: r8 = qadd */
+        rsblt           r8, r2, #0              /* negative: r8 = -qadd */
+        smlabbne        r4, r4, r1, r8          /* non-zero: r4 = element * qmul +/- qadd */
+
+        rsbs            r8, ip, r5, asl #16     /* same for the bottom halfword of r5 */
+        addgt           r8, r2, #0
+        rsblt           r8, r2, #0
+        smlabbne        r5, r5, r1, r8
+
+        strh            r4, [r0], #2            /* write the four results back, r0 advances by 2 each time */
+        strh            r9, [r0], #2
+        strh            r5, [r0], #2
+        strh            lr, [r0], #2
+
+        rsbs            r9, ip, r6, asr #16     /* second group of four: same steps on r6 and r7 */
+        addgt           r9, r2, #0
+        rsblt           r9, r2, #0
+        smlatbne        r9, r6, r1, r9
+
+        rsbs            lr, ip, r7, asr #16
+        addgt           lr, r2, #0
+        rsblt           lr, r2, #0
+        smlatbne        lr, r7, r1, lr
+
+        rsbs            r8, ip, r6, asl #16
+        addgt           r8, r2, #0
+        rsblt           r8, r2, #0
+        smlabbne        r6, r6, r1, r8
+
+        rsbs            r8, ip, r7, asl #16
+        addgt           r8, r2, #0
+        rsblt           r8, r2, #0
+        smlabbne        r7, r7, r1, r8
+
+        strh            r6, [r0], #2
+        strh            r9, [r0], #2
+        strh            r7, [r0], #2
+        strh            lr, [r0], #2
+
+        subs            r3, r3, #8              /* eight elements done */
+        ldrgtd          r4, [r0, #0] /* load data early to avoid load/use pipeline stall */
+        bgt             1b                      /* loop while r3 (count - 2 - elements done) is still positive */
+
+        adds            r3, r3, #2              /* r3 = elements still requested */
+        pople           {r4-r9,pc}              /* none left: return */
+2:
+        ldrsh           r9, [r0, #0]            /* tail: two elements are processed whatever the remaining count */
+        ldrsh           lr, [r0, #2]
+        mov             r8, r2                  /* r8 = qadd */
+        cmp             r9, #0
+        rsblt           r8, r2, #0              /* negative element: r8 = -qadd */
+        smlabbne        r9, r9, r1, r8          /* non-zero element: r9 = element * qmul +/- qadd */
+        mov             r8, r2                  /* same for the second element */
+        cmp             lr, #0
+        rsblt           r8, r2, #0
+        smlabbne        lr, lr, r1, r8
+        strh            r9, [r0], #2
+        strh            lr, [r0], #2
+        pop             {r4-r9,pc}
+        .endfunc
diff -r c30b92cf446b -r 9281a8a9387a arm/mpegvideo_iwmmxt.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/arm/mpegvideo_iwmmxt.c Wed Dec 17 00:54:54 2008 +0000
@@ -0,0 +1,119 @@
+/*
+ * copyright (c) 2004 AGAWA Koji
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/mpegvideo.h"
+
+static void dct_unquantize_h263_intra_iwmmxt(MpegEncContext *s,
+                                             DCTELEM *block, int n, int qscale)
+{
+    /* Dequantizes block[] in place, 16 bytes per loop pass; DC is computed in C and stored last. */
+    int level, qmul, qadd, nCoeffs, i;
+    DCTELEM *block_orig = block;
+
+    assert(s->block_last_index[n]>=0);
+
+    qmul = qscale << 1;
+
+    if (!s->h263_aic) {
+        if (n < 4)
+            level = block[0] * s->y_dc_scale;
+        else
+            level = block[0] * s->c_dc_scale;
+        qadd = (qscale - 1) | 1;
+    }else{
+        qadd = 0;
+        level = block[0];
+    }
+    if(s->ac_pred)
+        nCoeffs=63;
+    else
+        nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
+    i = (nCoeffs + 8) / 8; /* loop passes; counted down by the asm, hence an in/out operand */
+    __asm__ volatile (
+/*      "movd %1, %%mm6                 \n\t" //qmul */
+/*      "packssdw %%mm6, %%mm6          \n\t" */
+/*      "packssdw %%mm6, %%mm6          \n\t" */
+        "tbcsth wr6, %[qmul]            \n\t"
+/*      "movd %2, %%mm5                 \n\t" //qadd */
+/*      "packssdw %%mm5, %%mm5          \n\t" */
+/*      "packssdw %%mm5, %%mm5          \n\t" */
+        "tbcsth wr5, %[qadd]            \n\t"
+        "wzero wr7                      \n\t" /* "pxor %%mm7, %%mm7             \n\t" */
+        "wzero wr4                      \n\t" /* "pxor %%mm4, %%mm4             \n\t" */
+        "wsubh wr7, wr5, wr7            \n\t" /* "psubw %%mm5, %%mm7            \n\t" */
+        "1:                             \n\t"
+        "wldrd wr2, [%[block]]          \n\t" /* "movq (%0, %3), %%mm0          \n\t" */
+        "wldrd wr3, [%[block], #8]      \n\t" /* "movq 8(%0, %3), %%mm1         \n\t" */
+        "wmulsl wr0, wr6, wr2           \n\t" /* "pmullw %%mm6, %%mm0           \n\t" */
+        "wmulsl wr1, wr6, wr3           \n\t" /* "pmullw %%mm6, %%mm1           \n\t" */
+/*      "movq (%0, %3), %%mm2           \n\t" */
+/*      "movq 8(%0, %3), %%mm3          \n\t" */
+        "wcmpgtsh wr2, wr4, wr2         \n\t" /* "pcmpgtw %%mm4, %%mm2          \n\t" // block[i] < 0 ? -1 : 0 */
+        "wcmpgtsh wr3, wr4, wr3         \n\t" /* "pcmpgtw %%mm4, %%mm3          \n\t" // block[i] < 0 ? -1 : 0 */
+        "wxor wr0, wr2, wr0             \n\t" /* "pxor %%mm2, %%mm0             \n\t" */
+        "wxor wr1, wr3, wr1             \n\t" /* "pxor %%mm3, %%mm1             \n\t" */
+        "waddh wr0, wr7, wr0            \n\t" /* "paddw %%mm7, %%mm0            \n\t" */
+        "waddh wr1, wr7, wr1            \n\t" /* "paddw %%mm7, %%mm1            \n\t" */
+        "wxor wr2, wr0, wr2             \n\t" /* "pxor %%mm0, %%mm2             \n\t" */
+        "wxor wr3, wr1, wr3             \n\t" /* "pxor %%mm1, %%mm3             \n\t" */
+        "wcmpeqh wr0, wr7, wr0          \n\t" /* "pcmpeqw %%mm7, %%mm0          \n\t" // block[i] == 0 ? -1 : 0 */
+        "wcmpeqh wr1, wr7, wr1          \n\t" /* "pcmpeqw %%mm7, %%mm1          \n\t" // block[i] == 0 ? -1 : 0 */
+        "wandn wr0, wr2, wr0            \n\t" /* "pandn %%mm2, %%mm0            \n\t" */
+        "wandn wr1, wr3, wr1            \n\t" /* "pandn %%mm3, %%mm1            \n\t" */
+        "wstrd wr0, [%[block]]          \n\t" /* "movq %%mm0, (%0, %3)          \n\t" */
+        "wstrd wr1, [%[block], #8]      \n\t" /* "movq %%mm1, 8(%0, %3)         \n\t" */
+        "add %[block], %[block], #16    \n\t" /* "addl $16, %3                  \n\t" */
+        "subs %[i], %[i], #1            \n\t"
+        "bne 1b                         \n\t" /* "jng 1b                        \n\t" */
+        :[block]"+r"(block), [i]"+r"(i)
+        :[qmul]"r"(qmul), [qadd]"r"(qadd)
+        :"memory", "cc");
+
+    block_orig[0] = level;
+}
+
+#if 0
+static void dct_unquantize_h263_inter_iwmmxt(MpegEncContext *s,
+                                             DCTELEM *block, int n, int qscale)
+{
+    int nCoeffs;
+
+    assert(s->block_last_index[n]>=0);
+
+    if(s->ac_pred)
+        nCoeffs=63;
+    else
+        nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
+
+    ippiQuantInvInter_Compact_H263_16s_I(block, nCoeffs+1, qscale);
+}
+#endif
+
+void MPV_common_init_iwmmxt(MpegEncContext *s)
+{
+    if (!(mm_flags & FF_MM_IWMMXT)) return; /* nothing is installed unless mm_flags has FF_MM_IWMMXT set */
+
+    s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_iwmmxt;
+#if 0
+    s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_iwmmxt;
+#endif
+}
diff -r c30b92cf446b -r 9281a8a9387a arm/simple_idct_arm.S
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/arm/simple_idct_arm.S Wed Dec 17 00:54:54 2008 +0000
@@ -0,0 +1,486 @@
+/*
+ * simple_idct_arm.S
+ * Copyright (C) 2002 Frederic 'dilb' Boulay.
+ *
+ * Author: Frederic Boulay
+ *
+ * The function defined in this file is derived from the simple_idct function
+ * from the libavcodec library part of the FFmpeg project.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+/* useful constants for the algorithm, they are saved in __constant_ptr__ at */
+/* the end of the source code.*/
+#define W1  22725
+#define W2  21407
+#define W3  19266
+#define W4  16383
+#define W5  12873
+#define W6  8867
+#define W7  4520
+#define MASK_MSHW 0xFFFF0000
+
+/* offsets of the constants in the vector */
+#define offW1  0
+#define offW2  4
+#define offW3  8
+#define offW4  12
+#define offW5  16
+#define offW6  20
+#define offW7  24
+#define offMASK_MSHW 28
+
+#define ROW_SHIFT 11
+#define ROW_SHIFT2MSHW (16-11)
+#define COL_SHIFT 20
+#define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1) */
+#define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */
+
+
+        .text
+
+function simple_idct_ARM, export=1
+        @@ void simple_idct_ARM(int16_t *block): in-place 8x8 transform, all rows first, then all columns
+        @@ save stack for reg needed (take all of them),
+        @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block
+        @@ so it must not be overwritten, if it is not saved!!
+        @@ R12 is another scratch register, so it does not need to be saved either
+        @@ save all registers
+        stmfd sp!, {r4-r11, r14} @ R14 is also called LR
+        @@ at this point, R0=block, other registers are free.
+        add r14, r0, #112        @ R14=&block[8*7], better start from the last row, and decrease the value until row=0, i.e. R14=block.
+        add r12, pc, #(__constant_ptr__-.-8) @ R12=__constant_ptr__, the vector containing the constants, probably not necessary to reserve a register for it
+        @@ reserve 2 words on the stack; only sp[0] (block) is used
+        sub sp, sp, #8           @ allow 2 local variables
+        str r0, [sp, #0]         @ save block in sp[0]
+        @@ stack status
+        @@ sp+4   free
+        @@ sp+0   R0  (block)
+
+
+        @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free
+
+
+__row_loop:
+        @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimize ldr accesses (i.e. split 32bits in 2 16bits words), at least it gives more usable registers :)
+        ldr r1, [r14, #0]        @ R1=(int32)(R14)[0]=ROWr32[0] (relative row cast to a 32b pointer)
+        ldr r2, [r14, #4]        @ R2=(int32)(R14)[1]=ROWr32[1]
+        ldr r3, [r14, #8]        @ R3=ROWr32[2]
+        ldr r4, [r14, #12]       @ R4=ROWr32[3]
+        @@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop),
+        @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row)
+        @@ else follow the complete algorithm.
+        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
+        @@                R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
+        orr r5, r4, r3           @ R5=R4 | R3
+        orr r5, r5, r2           @ R5=R4 | R3 | R2
+        orrs r6, r5, r1          @ Test R5 | R1 (the aim is to check if everything is null)
+        beq __end_row_loop
+        mov r7, r1, asr #16      @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
+        ldrsh r6, [r14, #0]      @ R6=ROWr16[0]
+        orrs r5, r5, r7          @ R5=R4 | R3 | R2 | R7
+        beq __almost_empty_row
+
+__b_evaluation:
+        @@ at this point, R0=block (temp), R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
+        @@     R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
+        @@     R12=__const_ptr_, R14=&block[n]
+        @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3
+
+        @@ MUL16(b0, W1, row[1]);
+        @@ MUL16(b1, W3, row[1]);
+        @@ MUL16(b2, W5, row[1]);
+        @@ MUL16(b3, W7, row[1]);
+        @@ MAC16(b0, W3, row[3]);
+        @@ MAC16(b1, -W7, row[3]);
+        @@ MAC16(b2, -W1, row[3]);
+        @@ MAC16(b3, -W5, row[3]);
+        ldr r8, [r12, #offW1]    @ R8=W1
+        mov r2, r2, asr #16      @ R2=ROWr16[3]
+        mul r0, r8, r7           @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+        ldr r9, [r12, #offW3]    @ R9=W3
+        ldr r10, [r12, #offW5]   @ R10=W5
+        mul r1, r9, r7           @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+        ldr r11, [r12, #offW7]   @ R11=W7
+        mul r5, r10, r7          @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+        mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+        teq r2, #0               @ if null avoid muls
+        mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+        rsbne r2, r2, #0         @ R2=-ROWr16[3]
+        mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+        mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+        mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+
+        @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
+        @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
+        @@     R12=__const_ptr_, R14=&block[n]
+        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
+        @@ if (temp != 0) {}
+        orrs r2, r3, r4          @ R2=ROWr32[2] | ROWr32[3]
+        beq __end_b_evaluation
+
+        @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
+        @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
+        @@     R12=__const_ptr_, R14=&block[n]
+        @@ MAC16(b0, W5, row[5]);
+        @@ MAC16(b2, W7, row[5]);
+        @@ MAC16(b3, W3, row[5]);
+        @@ MAC16(b1, -W1, row[5]);
+        @@ MAC16(b0, W7, row[7]);
+        @@ MAC16(b2, W3, row[7]);
+        @@ MAC16(b3, -W1, row[7]);
+        @@ MAC16(b1, -W5, row[7]);
+        mov r3, r3, asr #16      @ R3=ROWr16[5]
+        teq r3, #0               @ if null avoid muls
+        mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5]=b0
+        mov r4, r4, asr #16      @ R4=ROWr16[7]
+        mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5]=b2
+        mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5]=b3
+        rsbne r3, r3, #0         @ R3=-ROWr16[5]
+        mlane r1, r8, r3, r1     @ R1-=W1*ROWr16[5]=b1
+        @@ R3 is free now
+        teq r4, #0               @ if null avoid muls
+        mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7]=b0
+        mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7]=b2
+        rsbne r4, r4, #0         @ R4=-ROWr16[7]
+        mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7]=b3
+        mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7]=b1
+        @@ R4 is free now
+__end_b_evaluation:
+        @@ at this point, R0=b0,  R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
+        @@     R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
+        @@     R12=__const_ptr_, R14=&block[n]
+
+__a_evaluation:
+        @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
+        @@ a1 = a0 + W6 * row[2];
+        @@ a2 = a0 - W6 * row[2];
+        @@ a3 = a0 - W2 * row[2];
+        @@ a0 = a0 + W2 * row[2];
+        ldr r9, [r12, #offW4]    @ R9=W4
+        mul r6, r9, r6           @ R6=W4*ROWr16[0]
+        ldr r10, [r12, #offW6]   @ R10=W6
+        ldrsh r4, [r14, #4]      @ R4=ROWr16[2] (a3 not defined yet)
+        add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)
+
+        mul r11, r10, r4         @ R11=W6*ROWr16[2]
+        ldr r8, [r12, #offW2]    @ R8=W2
+        sub r3, r6, r11          @ R3=a0-W6*ROWr16[2] (a2)
+        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
+        @@ if (temp != 0) {}
+        teq r2, #0
+        beq __end_bef_a_evaluation
+
+        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
+        mul r11, r8, r4          @ R11=W2*ROWr16[2]
+        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
+        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
+
+
+        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
+        @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
+        @@     R12=__const_ptr_, R14=&block[n]
+
+
+        @@ a0 += W4*row[4]
+        @@ a1 -= W4*row[4]
+        @@ a2 -= W4*row[4]
+        @@ a3 += W4*row[4]
+        ldrsh r11, [r14, #8]     @ R11=ROWr16[4]
+        teq r11, #0              @ if null avoid muls
+        mulne r11, r9, r11       @ R11=W4*ROWr16[4]
+        @@ R9 is free now
+        ldrsh r9, [r14, #12]     @ R9=ROWr16[6]
+        addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
+        subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
+        subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
+        addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
+        @@ W6 alone is no longer needed after the next mul, R10 is reused for W2*ROWr16[6]
+        teq r9, #0               @ if null avoid muls
+        mulne r11, r10, r9       @ R11=W6*ROWr16[6]
+        addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
+        mulne r10, r8, r9        @ R10=W2*ROWr16[6]
+        @@ a0 += W6*row[6];
+        @@ a3 -= W6*row[6];
+        @@ a1 -= W2*row[6];
+        @@ a2 += W2*row[6];
+        subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
+        subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
+        addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)
+
+__end_a_evaluation:
+        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
+        @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
+        @@     R12=__const_ptr_, R14=&block[n]
+        @@ row[0] = (a0 + b0) >> ROW_SHIFT;
+        @@ row[1] = (a1 + b1) >> ROW_SHIFT;
+        @@ row[2] = (a2 + b2) >> ROW_SHIFT;
+        @@ row[3] = (a3 + b3) >> ROW_SHIFT;
+        @@ row[4] = (a3 - b3) >> ROW_SHIFT;
+        @@ row[5] = (a2 - b2) >> ROW_SHIFT;
+        @@ row[6] = (a1 - b1) >> ROW_SHIFT;
+        @@ row[7] = (a0 - b0) >> ROW_SHIFT;
+        add r8, r6, r0           @ R8=a0+b0
+        add r9, r2, r1           @ R9=a1+b1
+        @@ put 2 16 bits half-words in a 32bits word
+        @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only Little Endian compliant then!!!)
+        ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000
+        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5)
+        mvn r11, r10             @ R11= NOT R10= 0x0000FFFF
+        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
+        orr r8, r8, r9
+        str r8, [r14, #0]
+
+        add r8, r3, r5           @ R8=a2+b2
+        add r9, r4, r7           @ R9=a3+b3
+        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
+        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
+        orr r8, r8, r9
+        str r8, [r14, #4]
+
+        sub r8, r4, r7           @ R8=a3-b3
+        sub r9, r3, r5           @ R9=a2-b2
+        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
+        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
+        orr r8, r8, r9
+        str r8, [r14, #8]
+
+        sub r8, r2, r1           @ R8=a1-b1
+        sub r9, r6, r0           @ R9=a0-b0
+        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
+        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
+        orr r8, r8, r9
+        str r8, [r14, #12]
+
+        bal __end_row_loop
+
+__almost_empty_row:
+        @@ the row was empty, except ROWr16[0], now, management of this special case
+        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
+        @@                R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
+        @@                R8=0xFFFF (temp), R9-R11 free
+        mov r8, #0x10000         @ R8=0x10000, first of the 2 steps that build 0xFFFF (this avoids a ldr)
+        sub r8, r8, #1           @ R8=0xFFFF, now ready.
+        and r5, r8, r6, lsl #3   @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
+        orr r5, r5, r5, lsl #16  @ R5=R5 | (R5<<16)
+        str r5, [r14, #0]        @ R14[0]=ROWr32[0]=R5
+        str r5, [r14, #4]        @ R14[4]=ROWr32[1]=R5
+        str r5, [r14, #8]        @ R14[8]=ROWr32[2]=R5
+        str r5, [r14, #12]       @ R14[12]=ROWr32[3]=R5
+
+__end_row_loop:
+        @@ at this point, R0-R11 (free)
+        @@     R12=__const_ptr_, R14=&block[n]
+        ldr r0, [sp, #0]         @ R0=block
+        teq r0, r14              @ compare current &block[8*n] to block, when block is reached, the loop is finished.
+        sub r14, r14, #16        @ previous row (sub leaves the flags set by teq untouched)
+        bne __row_loop
+
+
+
+        @@ at this point, R0=block, R1-R11 (free)
+        @@     R12=__const_ptr_, R14=&block[n]
+        add r14, r0, #14         @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block.
+__col_loop:
+
+__b_evaluation2:
+        @@ at this point, R0=block (temp), R1-R11 (free)
+        @@     R12=__const_ptr_, R14=&block[n]
+        @@ proceed with b0-b3 first, followed by a0-a3
+        @@ MUL16(b0, W1, col[8x1]);
+        @@ MUL16(b1, W3, col[8x1]);
+        @@ MUL16(b2, W5, col[8x1]);
+        @@ MUL16(b3, W7, col[8x1]);
+        @@ MAC16(b0, W3, col[8x3]);
+        @@ MAC16(b1, -W7, col[8x3]);
+        @@ MAC16(b2, -W1, col[8x3]);
+        @@ MAC16(b3, -W5, col[8x3]);
+        ldr r8, [r12, #offW1]    @ R8=W1
+        ldrsh r7, [r14, #16]     @ R7=COLr16[8x1]
+        mul r0, r8, r7           @ R0=W1*COLr16[8x1]=b0 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
+        ldr r9, [r12, #offW3]    @ R9=W3
+        ldr r10, [r12, #offW5]   @ R10=W5
+        mul r1, r9, r7           @ R1=W3*COLr16[8x1]=b1 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
+        ldr r11, [r12, #offW7]   @ R11=W7
+        mul r5, r10, r7          @ R5=W5*COLr16[8x1]=b2 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
+        ldrsh r2, [r14, #48]     @ R2=COLr16[8x3]
+        mul r7, r11, r7          @ R7=W7*COLr16[8x1]=b3 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
+        teq r2, #0               @ if 0, then avoid muls
+        mlane r0, r9, r2, r0     @ R0+=W3*COLr16[8x3]=b0 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
+        rsbne r2, r2, #0         @ R2=-COLr16[8x3]
+        mlane r1, r11, r2, r1    @ R1-=W7*COLr16[8x3]=b1 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
+        mlane r5, r8, r2, r5     @ R5-=W1*COLr16[8x3]=b2 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
+        mlane r7, r10, r2, r7    @ R7-=W5*COLr16[8x3]=b3 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
+
+        @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
+        @@     R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
+        @@     R12=__const_ptr_, R14=&block[n]
+        @@ MAC16(b0, W5, col[5x8]);
+        @@ MAC16(b2, W7, col[5x8]);
+        @@ MAC16(b3, W3, col[5x8]);
+        @@ MAC16(b1, -W1, col[5x8]);
+        @@ MAC16(b0, W7, col[7x8]);
+        @@ MAC16(b2, W3, col[7x8]);
+        @@ MAC16(b3, -W1, col[7x8]);
+        @@ MAC16(b1, -W5, col[7x8]);
+        ldrsh r3, [r14, #80]     @ R3=COLr16[5x8]
+        teq r3, #0               @ if 0 then avoid muls
+        mlane r0, r10, r3, r0    @ R0+=W5*COLr16[5x8]=b0
+        mlane r5, r11, r3, r5    @ R5+=W7*COLr16[5x8]=b2
+        mlane r7, r9, r3, r7     @ R7+=W3*COLr16[5x8]=b3
+        rsbne r3, r3, #0         @ R3=-COLr16[5x8]
+        ldrsh r4, [r14, #112]    @ R4=COLr16[7x8]
+        mlane r1, r8, r3, r1     @ R1-=W1*COLr16[5x8]=b1
+        @@ R3 is free now
+        teq r4, #0               @ if 0 then avoid muls
+        mlane r0, r11, r4, r0    @ R0+=W7*COLr16[7x8]=b0
+        mlane r5, r9, r4, r5     @ R5+=W3*COLr16[7x8]=b2
+        rsbne r4, r4, #0         @ R4=-COLr16[7x8]
+        mlane r7, r8, r4, r7     @ R7-=W1*COLr16[7x8]=b3
+        mlane r1, r10, r4, r1    @ R1-=W5*COLr16[7x8]=b1
+        @@ R4 is free now
+__end_b_evaluation2:
+        @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
+        @@     R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
+        @@     R12=__const_ptr_, R14=&block[n]
+
+__a_evaluation2:
+        @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
+        @@ a1 = a0 + W6 * col[8x2];
+        @@ a2 = a0 - W6 * col[8x2];
+        @@ a3 = a0 - W2 * col[8x2];
+        @@ a0 = a0 + W2 * col[8x2];
+        ldrsh r6, [r14, #0]      @ R6=COLr16[8x0]
+        ldr r9, [r12, #offW4]    @ R9=W4
+        mul r6, r9, r6           @ R6=W4*COLr16[8x0]
+        ldr r10, [r12, #offW6]   @ R10=W6
+        ldrsh r4, [r14, #32]     @ R4=COLr16[8x2] (a3 not defined yet)
+        add r6, r6, #COL_SHIFTED_1 @ R6=W4*COLr16[8x0] + 1<<(COL_SHIFT-1) (a0)
+        mul r11, r10, r4         @ R11=W6*COLr16[8x2]
+        ldr r8, [r12, #offW2]    @ R8=W2
+        add r2, r6, r11          @ R2=a0+W6*COLr16[8x2] (a1)
+        sub r3, r6, r11          @ R3=a0-W6*COLr16[8x2] (a2)
+        mul r11, r8, r4          @ R11=W2*COLr16[8x2]
+        sub r4, r6, r11          @ R4=a0-W2*COLr16[8x2] (a3)
+        add r6, r6, r11          @ R6=a0+W2*COLr16[8x2] (a0)
+
+        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
+        @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
+        @@     R12=__const_ptr_, R14=&block[n]
+        @@ a0 += W4*col[8x4]
+        @@ a1 -= W4*col[8x4]
+        @@ a2 -= W4*col[8x4]
+        @@ a3 += W4*col[8x4]
+        ldrsh r11, [r14, #64]    @ R11=COLr16[8x4]
+        teq r11, #0              @ if null avoid muls
+        mulne r11, r9, r11       @ R11=W4*COLr16[8x4]
+        @@ R9 is free now
+        addne r6, r6, r11        @ R6+=W4*COLr16[8x4] (a0)
+        subne r2, r2, r11        @ R2-=W4*COLr16[8x4] (a1)
+        subne r3, r3, r11        @ R3-=W4*COLr16[8x4] (a2)
+        ldrsh r9, [r14, #96]     @ R9=COLr16[8x6]
+        addne r4, r4, r11        @ R4+=W4*COLr16[8x4] (a3)
+        @@ W6 alone is no longer needed after the next mul, R10 is reused for W2*COLr16[8x6]
+        teq r9, #0               @ if null avoid muls
+        mulne r11, r10, r9       @ R11=W6*COLr16[8x6]
+        addne r6, r6, r11        @ R6+=W6*COLr16[8x6] (a0)
+        mulne r10, r8, r9        @ R10=W2*COLr16[8x6]
+        @@ a0 += W6*col[8x6];
+        @@ a3 -= W6*col[8x6];
+        @@ a1 -= W2*col[8x6];
+        @@ a2 += W2*col[8x6];
+        subne r4, r4, r11        @ R4-=W6*COLr16[8x6] (a3)
+        subne r2, r2, r10        @ R2-=W2*COLr16[8x6] (a1)
+        addne r3, r3, r10        @ R3+=W2*COLr16[8x6] (a2)
+__end_a_evaluation2:
+        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
+        @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
+        @@     R12=__const_ptr_, R14=&block[n]
+        @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
+        @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
+        @@ col[16] = ((a2 + b2) >> COL_SHIFT);
+        @@ col[24] = ((a3 + b3) >> COL_SHIFT);
+        @@ col[32] = ((a3 - b3) >> COL_SHIFT);
+        @@ col[40] = ((a2 - b2) >> COL_SHIFT);
+        @@ col[48] = ((a1 - b1) >> COL_SHIFT);
+        @@ col[56] = ((a0 - b0) >> COL_SHIFT);
+        @@@@@ no optimization here @@@@@
+        add r8, r6, r0           @ R8=a0+b0
+        add r9, r2, r1           @ R9=a1+b1
+        mov r8, r8, asr #COL_SHIFT
+        mov r9, r9, asr #COL_SHIFT
+        strh r8, [r14, #0]
+        strh r9, [r14, #16]
+        add r8, r3, r5           @ R8=a2+b2
+        add r9, r4, r7           @ R9=a3+b3
+        mov r8, r8, asr #COL_SHIFT
+        mov r9, r9, asr #COL_SHIFT
+        strh r8, [r14, #32]
+        strh r9, [r14, #48]
+        sub r8, r4, r7           @ R8=a3-b3
+        sub r9, r3, r5           @ R9=a2-b2
+        mov r8, r8, asr #COL_SHIFT
+        mov r9, r9, asr #COL_SHIFT
+        strh r8, [r14, #64]
+        strh r9, [r14, #80]
+        sub r8, r2, r1           @ R8=a1-b1
+        sub r9, r6, r0           @ R9=a0-b0
+        mov r8, r8, asr #COL_SHIFT
+        mov r9, r9, asr #COL_SHIFT
+        strh r8, [r14, #96]
+        strh r9, [r14, #112]
+
+__end_col_loop:
+        @@ at this point, R0-R11 (free)
+        @@     R12=__const_ptr_, R14=&block[n]
+        ldr r0, [sp, #0]         @ R0=block
+        teq r0, r14              @ compare current &block[n] to block, when block is reached, the loop is finished.
+        sub r14, r14, #2         @ previous column (sub leaves the flags set by teq untouched)
+        bne __col_loop
+
+
+
+
+__end_simple_idct_ARM:
+        @@ restore registers to previous status!
+        add sp, sp, #8 @@ the local variables!
+        ldmfd sp!, {r4-r11, r15} @@ update PC with LR content.
+
+
+
+@@ kind of sub-function, here not to overload the common case.
+__end_bef_a_evaluation:
+        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
+        mul r11, r8, r4          @ R11=W2*ROWr16[2]
+        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
+        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
+        bal __end_a_evaluation
+
+
+__constant_ptr__:  @@ see #defines at the beginning of the source code for values.
+        .align
+        .word   W1
+        .word   W2
+        .word   W3
+        .word   W4
+        .word   W5
+        .word   W6
+        .word   W7
+        .word   MASK_MSHW
diff -r c30b92cf446b -r 9281a8a9387a arm/simple_idct_armv5te.S
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/arm/simple_idct_armv5te.S Wed Dec 17 00:54:54 2008 +0000
@@ -0,0 +1,703 @@
+/*
+ * Simple IDCT
+ *
+ * Copyright (c) 2001 Michael Niedermayer
+ * Copyright (c) 2006 Mans Rullgard
+ *
+ * This file is part of FFmpeg.
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + +#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define ROW_SHIFT 11 +#define COL_SHIFT 20 + +#define W13 (W1 | (W3 << 16)) +#define W26 (W2 | (W6 << 16)) +#define W57 (W5 | (W7 << 16)) + + .text + .align +w13: .long W13 +w26: .long W26 +w57: .long W57 + +function idct_row_armv5te + str lr, [sp, #-4]! 
+ + ldrd v1, [a1, #8] + ldrd a3, [a1] /* a3 = row[1:0], a4 = row[3:2] */ + orrs v1, v1, v2 + cmpeq v1, a4 + cmpeq v1, a3, lsr #16 + beq row_dc_only + + mov v1, #(1<<(ROW_SHIFT-1)) + mov ip, #16384 + sub ip, ip, #1 /* ip = W4 */ + smlabb v1, ip, a3, v1 /* v1 = W4*row[0]+(1<<(RS-1)) */ + ldr ip, [pc, #(w26-.-8)] /* ip = W2 | (W6 << 16) */ + smultb a2, ip, a4 + smulbb lr, ip, a4 + add v2, v1, a2 + sub v3, v1, a2 + sub v4, v1, lr + add v1, v1, lr + + ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */ + ldr lr, [pc, #(w57-.-8)] /* lr = W5 | (W7 << 16) */ + smulbt v5, ip, a3 + smultt v6, lr, a4 + smlatt v5, ip, a4, v5 + smultt a2, ip, a3 + smulbt v7, lr, a3 + sub v6, v6, a2 + smulbt a2, ip, a4 + smultt fp, lr, a3 + sub v7, v7, a2 + smulbt a2, lr, a4 + ldrd a3, [a1, #8] /* a3=row[5:4] a4=row[7:6] */ + sub fp, fp, a2 + + orrs a2, a3, a4 + beq 1f + + smlabt v5, lr, a3, v5 + smlabt v6, ip, a3, v6 + smlatt v5, lr, a4, v5 + smlabt v6, lr, a4, v6 + smlatt v7, lr, a3, v7 + smlatt fp, ip, a3, fp + smulbt a2, ip, a4 + smlatt v7, ip, a4, v7 + sub fp, fp, a2 + + ldr ip, [pc, #(w26-.-8)] /* ip = W2 | (W6 << 16) */ + mov a2, #16384 + sub a2, a2, #1 /* a2 = W4 */ + smulbb a2, a2, a3 /* a2 = W4*row[4] */ + smultb lr, ip, a4 /* lr = W6*row[6] */ + add v1, v1, a2 /* v1 += W4*row[4] */ + add v1, v1, lr /* v1 += W6*row[6] */ + add v4, v4, a2 /* v4 += W4*row[4] */ + sub v4, v4, lr /* v4 -= W6*row[6] */ + smulbb lr, ip, a4 /* lr = W2*row[6] */ + sub v2, v2, a2 /* v2 -= W4*row[4] */ + sub v2, v2, lr /* v2 -= W2*row[6] */ + sub v3, v3, a2 /* v3 -= W4*row[4] */ + add v3, v3, lr /* v3 += W2*row[6] */ + +1: add a2, v1, v5 + mov a3, a2, lsr #11 + bic a3, a3, #0x1f0000 + sub a2, v2, v6 + mov a2, a2, lsr #11 + add a3, a3, a2, lsl #16 + add a2, v3, v7 + mov a4, a2, lsr #11 + bic a4, a4, #0x1f0000 + add a2, v4, fp + mov a2, a2, lsr #11 + add a4, a4, a2, lsl #16 + strd a3, [a1] + + sub a2, v4, fp + mov a3, a2, lsr #11 + bic a3, a3, #0x1f0000 + sub a2, v3, v7 + mov a2, a2, lsr #11 + add a3, a3, a2, lsl 
#16 + add a2, v2, v6 + mov a4, a2, lsr #11 + bic a4, a4, #0x1f0000 + sub a2, v1, v5 + mov a2, a2, lsr #11 + add a4, a4, a2, lsl #16 + strd a3, [a1, #8] + + ldr pc, [sp], #4 + +row_dc_only: + orr a3, a3, a3, lsl #16 + bic a3, a3, #0xe000 + mov a3, a3, lsl #3 + mov a4, a3 + strd a3, [a1] + strd a3, [a1, #8] + + ldr pc, [sp], #4 + .endfunc + + .macro idct_col + ldr a4, [a1] /* a4 = col[1:0] */ + mov ip, #16384 + sub ip, ip, #1 /* ip = W4 */ +#if 0 + mov v1, #(1<<(COL_SHIFT-1)) + smlabt v2, ip, a4, v1 /* v2 = W4*col[1] + (1<<(COL_SHIFT-1)) */ + smlabb v1, ip, a4, v1 /* v1 = W4*col[0] + (1<<(COL_SHIFT-1)) */ + ldr a4, [a1, #(16*4)] +#else + mov v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */ + add v2, v1, a4, asr #16 + rsb v2, v2, v2, lsl #14 + mov a4, a4, lsl #16 + add v1, v1, a4, asr #16 + ldr a4, [a1, #(16*4)] + rsb v1, v1, v1, lsl #14 +#endif + + smulbb lr, ip, a4 + smulbt a3, ip, a4 + sub v3, v1, lr + sub v5, v1, lr + add v7, v1, lr + add v1, v1, lr + sub v4, v2, a3 + sub v6, v2, a3 + add fp, v2, a3 + ldr ip, [pc, #(w26-.-8)] + ldr a4, [a1, #(16*2)] + add v2, v2, a3 + + smulbb lr, ip, a4 + smultb a3, ip, a4 + add v1, v1, lr + sub v7, v7, lr + add v3, v3, a3 + sub v5, v5, a3 + smulbt lr, ip, a4 + smultt a3, ip, a4 + add v2, v2, lr + sub fp, fp, lr + add v4, v4, a3 + ldr a4, [a1, #(16*6)] + sub v6, v6, a3 + + smultb lr, ip, a4 + smulbb a3, ip, a4 + add v1, v1, lr + sub v7, v7, lr + sub v3, v3, a3 + add v5, v5, a3 + smultt lr, ip, a4 + smulbt a3, ip, a4 + add v2, v2, lr + sub fp, fp, lr + sub v4, v4, a3 + add v6, v6, a3 + + stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp} + + ldr ip, [pc, #(w13-.-8)] + ldr a4, [a1, #(16*1)] + ldr lr, [pc, #(w57-.-8)] + smulbb v1, ip, a4 + smultb v3, ip, a4 + smulbb v5, lr, a4 + smultb v7, lr, a4 + smulbt v2, ip, a4 + smultt v4, ip, a4 + smulbt v6, lr, a4 + smultt fp, lr, a4 + rsb v4, v4, #0 + ldr a4, [a1, #(16*3)] + rsb v3, v3, #0 + + smlatb v1, ip, a4, v1 + smlatb v3, lr, a4, v3 + smulbb a3, ip, a4 + smulbb a2, lr, a4 + sub 
v5, v5, a3 + sub v7, v7, a2 + smlatt v2, ip, a4, v2 + smlatt v4, lr, a4, v4 + smulbt a3, ip, a4 + smulbt a2, lr, a4 + sub v6, v6, a3 + ldr a4, [a1, #(16*5)] + sub fp, fp, a2 + + smlabb v1, lr, a4, v1 + smlabb v3, ip, a4, v3 + smlatb v5, lr, a4, v5 + smlatb v7, ip, a4, v7 + smlabt v2, lr, a4, v2 + smlabt v4, ip, a4, v4 + smlatt v6, lr, a4, v6 + ldr a3, [a1, #(16*7)] + smlatt fp, ip, a4, fp + + smlatb v1, lr, a3, v1 + smlabb v3, lr, a3, v3 + smlatb v5, ip, a3, v5 + smulbb a4, ip, a3 + smlatt v2, lr, a3, v2 + sub v7, v7, a4 + smlabt v4, lr, a3, v4 + smulbt a4, ip, a3 + smlatt v6, ip, a3, v6 + sub fp, fp, a4 + .endm + +function idct_col_armv5te + str lr, [sp, #-4]! + + idct_col + + ldmfd sp!, {a3, a4} + adds a2, a3, v1 + mov a2, a2, lsr #20 + orrmi a2, a2, #0xf000 + add ip, a4, v2 + mov ip, ip, asr #20 + orr a2, a2, ip, lsl #16 + str a2, [a1] + subs a3, a3, v1 + mov a2, a3, lsr #20 + orrmi a2, a2, #0xf000 + sub a4, a4, v2 + mov a4, a4, asr #20 + orr a2, a2, a4, lsl #16 + ldmfd sp!, {a3, a4} + str a2, [a1, #(16*7)] + + subs a2, a3, v3 + mov a2, a2, lsr #20 + orrmi a2, a2, #0xf000 + sub ip, a4, v4 + mov ip, ip, asr #20 + orr a2, a2, ip, lsl #16 + str a2, [a1, #(16*1)] + adds a3, a3, v3 + mov a2, a3, lsr #20 + orrmi a2, a2, #0xf000 + add a4, a4, v4 + mov a4, a4, asr #20 + orr a2, a2, a4, lsl #16 + ldmfd sp!, {a3, a4} + str a2, [a1, #(16*6)] + + adds a2, a3, v5 + mov a2, a2, lsr #20 + orrmi a2, a2, #0xf000 + add ip, a4, v6 + mov ip, ip, asr #20 + orr a2, a2, ip, lsl #16 + str a2, [a1, #(16*2)] + subs a3, a3, v5 + mov a2, a3, lsr #20 + orrmi a2, a2, #0xf000 + sub a4, a4, v6 + mov a4, a4, asr #20 + orr a2, a2, a4, lsl #16 + ldmfd sp!, {a3, a4} + str a2, [a1, #(16*5)] + + adds a2, a3, v7 + mov a2, a2, lsr #20 + orrmi a2, a2, #0xf000 + add ip, a4, fp + mov ip, ip, asr #20 + orr a2, a2, ip, lsl #16 + str a2, [a1, #(16*3)] + subs a3, a3, v7 + mov a2, a3, lsr #20 + orrmi a2, a2, #0xf000 + sub a4, a4, fp + mov a4, a4, asr #20 + orr a2, a2, a4, lsl #16 + str a2, [a1, #(16*4)] + + 
ldr pc, [sp], #4 + .endfunc + +function idct_col_put_armv5te + str lr, [sp, #-4]! + + idct_col + + ldmfd sp!, {a3, a4} + ldr lr, [sp, #32] + add a2, a3, v1 + movs a2, a2, asr #20 + movmi a2, #0 + cmp a2, #255 + movgt a2, #255 + add ip, a4, v2 + movs ip, ip, asr #20 + movmi ip, #0 + cmp ip, #255 + movgt ip, #255 + orr a2, a2, ip, lsl #8 + sub a3, a3, v1 + movs a3, a3, asr #20 + movmi a3, #0 + cmp a3, #255 + movgt a3, #255 + sub a4, a4, v2 + movs a4, a4, asr #20 + movmi a4, #0 + cmp a4, #255 + ldr v1, [sp, #28] + movgt a4, #255 + strh a2, [v1] + add a2, v1, #2 + str a2, [sp, #28] + orr a2, a3, a4, lsl #8 + rsb v2, lr, lr, lsl #3 + ldmfd sp!, {a3, a4} + strh a2, [v2, v1]! + + sub a2, a3, v3 + movs a2, a2, asr #20 + movmi a2, #0 + cmp a2, #255 + movgt a2, #255 + sub ip, a4, v4 + movs ip, ip, asr #20 + movmi ip, #0 + cmp ip, #255 + movgt ip, #255 + orr a2, a2, ip, lsl #8 + strh a2, [v1, lr]! + add a3, a3, v3 + movs a2, a3, asr #20 + movmi a2, #0 + cmp a2, #255 + movgt a2, #255 + add a4, a4, v4 + movs a4, a4, asr #20 + movmi a4, #0 + cmp a4, #255 + movgt a4, #255 + orr a2, a2, a4, lsl #8 + ldmfd sp!, {a3, a4} + strh a2, [v2, -lr]! + + add a2, a3, v5 + movs a2, a2, asr #20 + movmi a2, #0 + cmp a2, #255 + movgt a2, #255 + add ip, a4, v6 + movs ip, ip, asr #20 + movmi ip, #0 + cmp ip, #255 + movgt ip, #255 + orr a2, a2, ip, lsl #8 + strh a2, [v1, lr]! + sub a3, a3, v5 + movs a2, a3, asr #20 + movmi a2, #0 + cmp a2, #255 + movgt a2, #255 + sub a4, a4, v6 + movs a4, a4, asr #20 + movmi a4, #0 + cmp a4, #255 + movgt a4, #255 + orr a2, a2, a4, lsl #8 + ldmfd sp!, {a3, a4} + strh a2, [v2, -lr]! 
+ + add a2, a3, v7 + movs a2, a2, asr #20 + movmi a2, #0 + cmp a2, #255 + movgt a2, #255 + add ip, a4, fp + movs ip, ip, asr #20 + movmi ip, #0 + cmp ip, #255 + movgt ip, #255 + orr a2, a2, ip, lsl #8 + strh a2, [v1, lr] + sub a3, a3, v7 + movs a2, a3, asr #20 + movmi a2, #0 + cmp a2, #255 + movgt a2, #255 + sub a4, a4, fp + movs a4, a4, asr #20 + movmi a4, #0 + cmp a4, #255 + movgt a4, #255 + orr a2, a2, a4, lsl #8 + strh a2, [v2, -lr] + + ldr pc, [sp], #4 + .endfunc + +function idct_col_add_armv5te + str lr, [sp, #-4]! + + idct_col + + ldr lr, [sp, #36] + + ldmfd sp!, {a3, a4} + ldrh ip, [lr] + add a2, a3, v1 + mov a2, a2, asr #20 + sub a3, a3, v1 + and v1, ip, #255 + adds a2, a2, v1 + movmi a2, #0 + cmp a2, #255 + movgt a2, #255 + add v1, a4, v2 + mov v1, v1, asr #20 + adds v1, v1, ip, lsr #8 + movmi v1, #0 + cmp v1, #255 + movgt v1, #255 + orr a2, a2, v1, lsl #8 + ldr v1, [sp, #32] + sub a4, a4, v2 + rsb v2, v1, v1, lsl #3 + ldrh ip, [v2, lr]! + strh a2, [lr] + mov a3, a3, asr #20 + and a2, ip, #255 + adds a3, a3, a2 + movmi a3, #0 + cmp a3, #255 + movgt a3, #255 + mov a4, a4, asr #20 + adds a4, a4, ip, lsr #8 + movmi a4, #0 + cmp a4, #255 + movgt a4, #255 + add a2, lr, #2 + str a2, [sp, #28] + orr a2, a3, a4, lsl #8 + strh a2, [v2] + + ldmfd sp!, {a3, a4} + ldrh ip, [lr, v1]! + sub a2, a3, v3 + mov a2, a2, asr #20 + add a3, a3, v3 + and v3, ip, #255 + adds a2, a2, v3 + movmi a2, #0 + cmp a2, #255 + movgt a2, #255 + sub v3, a4, v4 + mov v3, v3, asr #20 + adds v3, v3, ip, lsr #8 + movmi v3, #0 + cmp v3, #255 + movgt v3, #255 + orr a2, a2, v3, lsl #8 + add a4, a4, v4 + ldrh ip, [v2, -v1]! + strh a2, [lr] + mov a3, a3, asr #20 + and a2, ip, #255 + adds a3, a3, a2 + movmi a3, #0 + cmp a3, #255 + movgt a3, #255 + mov a4, a4, asr #20 + adds a4, a4, ip, lsr #8 + movmi a4, #0 + cmp a4, #255 + movgt a4, #255 + orr a2, a3, a4, lsl #8 + strh a2, [v2] + + ldmfd sp!, {a3, a4} + ldrh ip, [lr, v1]! 
+ add a2, a3, v5 + mov a2, a2, asr #20 + sub a3, a3, v5 + and v3, ip, #255 + adds a2, a2, v3 + movmi a2, #0 + cmp a2, #255 + movgt a2, #255 + add v3, a4, v6 + mov v3, v3, asr #20 + adds v3, v3, ip, lsr #8 + movmi v3, #0 + cmp v3, #255 + movgt v3, #255 + orr a2, a2, v3, lsl #8 + sub a4, a4, v6 + ldrh ip, [v2, -v1]! + strh a2, [lr] + mov a3, a3, asr #20 + and a2, ip, #255 + adds a3, a3, a2 + movmi a3, #0 + cmp a3, #255 + movgt a3, #255 + mov a4, a4, asr #20 + adds a4, a4, ip, lsr #8 + movmi a4, #0 + cmp a4, #255 + movgt a4, #255 + orr a2, a3, a4, lsl #8 + strh a2, [v2] + + ldmfd sp!, {a3, a4} + ldrh ip, [lr, v1]! + add a2, a3, v7 + mov a2, a2, asr #20 + sub a3, a3, v7 + and v3, ip, #255 + adds a2, a2, v3 + movmi a2, #0 + cmp a2, #255 + movgt a2, #255 + add v3, a4, fp + mov v3, v3, asr #20 + adds v3, v3, ip, lsr #8 + movmi v3, #0 + cmp v3, #255 + movgt v3, #255 + orr a2, a2, v3, lsl #8 + sub a4, a4, fp + ldrh ip, [v2, -v1]! + strh a2, [lr] + mov a3, a3, asr #20 + and a2, ip, #255 + adds a3, a3, a2 + movmi a3, #0 + cmp a3, #255 + movgt a3, #255 + mov a4, a4, asr #20 + adds a4, a4, ip, lsr #8 + movmi a4, #0 + cmp a4, #255 + movgt a4, #255 + orr a2, a3, a4, lsl #8 + strh a2, [v2] + + ldr pc, [sp], #4 + .endfunc + +function simple_idct_armv5te, export=1 + stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr} + + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + + sub a1, a1, #(16*7) + + bl idct_col_armv5te + add a1, a1, #4 + bl idct_col_armv5te + add a1, a1, #4 + bl idct_col_armv5te + add a1, a1, #4 + bl idct_col_armv5te + + ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} + .endfunc + +function simple_idct_add_armv5te, export=1 + stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} + + mov a1, a3 + + bl idct_row_armv5te + add 
a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + + sub a1, a1, #(16*7) + + bl idct_col_add_armv5te + add a1, a1, #4 + bl idct_col_add_armv5te + add a1, a1, #4 + bl idct_col_add_armv5te + add a1, a1, #4 + bl idct_col_add_armv5te + + add sp, sp, #8 + ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} + .endfunc + +function simple_idct_put_armv5te, export=1 + stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} + + mov a1, a3 + + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + + sub a1, a1, #(16*7) + + bl idct_col_put_armv5te + add a1, a1, #4 + bl idct_col_put_armv5te + add a1, a1, #4 + bl idct_col_put_armv5te + add a1, a1, #4 + bl idct_col_put_armv5te + + add sp, sp, #8 + ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} + .endfunc diff -r c30b92cf446b -r 9281a8a9387a arm/simple_idct_armv6.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arm/simple_idct_armv6.S Wed Dec 17 00:54:54 2008 +0000 @@ -0,0 +1,433 @@ +/* + * Simple IDCT + * + * Copyright (c) 2001 Michael Niedermayer + * Copyright (c) 2007 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + +#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define ROW_SHIFT 11 +#define COL_SHIFT 20 + +#define W13 (W1 | (W3 << 16)) +#define W26 (W2 | (W6 << 16)) +#define W42 (W4 | (W2 << 16)) +#define W42n (-W4&0xffff | (-W2 << 16)) +#define W46 (W4 | (W6 << 16)) +#define W57 (W5 | (W7 << 16)) + + .text + .align +w13: .long W13 +w26: .long W26 +w42: .long W42 +w42n: .long W42n +w46: .long W46 +w57: .long W57 + +/* + Compute partial IDCT of single row. 
+ shift = left-shift amount + a1 = source address + a3 = row[2,0] <= 2 cycles + a4 = row[3,1] + ip = w42 <= 2 cycles + + Output in registers v1--v8 +*/ + .macro idct_row shift + ldr lr, [pc, #(w46-.-8)] /* lr = W4 | (W6 << 16) */ + mov a2, #(1<<(\shift-1)) + smlad v1, a3, ip, a2 + smlsd v4, a3, ip, a2 + ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */ + ldr v7, [pc, #(w57-.-8)] /* v7 = W5 | (W7 << 16) */ + smlad v2, a3, lr, a2 + smlsd v3, a3, lr, a2 + + smuad v5, a4, ip /* v5 = B0 = W1*row[1] + W3*row[3] */ + smusdx fp, a4, v7 /* fp = B3 = W7*row[1] - W5*row[3] */ + ldr lr, [a1, #12] /* lr = row[7,5] */ + pkhtb a3, ip, v7, asr #16 /* a3 = W7 | (W3 << 16) */ + pkhbt a2, ip, v7, lsl #16 /* a2 = W1 | (W5 << 16) */ + smusdx v6, a3, a4 /* v6 = -B1 = W7*row[3] - W3*row[1] */ + smlad v5, lr, v7, v5 /* B0 += W5*row[5] + W7*row[7] */ + smusdx v7, a4, a2 /* v7 = B2 = W5*row[1] - W1*row[3] */ + + ldr a4, [pc, #(w42n-.-8)] /* a4 = -W4 | (-W2 << 16) */ + smlad v7, lr, a3, v7 /* B2 += W7*row[5] + W3*row[7] */ + ldr a3, [a1, #4] /* a3 = row[6,4] */ + smlsdx fp, lr, ip, fp /* B3 += W3*row[5] - W1*row[7] */ + ldr ip, [pc, #(w46-.-8)] /* ip = W4 | (W6 << 16) */ + smlad v6, lr, a2, v6 /* B1 -= W1*row[5] + W5*row[7] */ + + smlad v2, a3, a4, v2 /* A1 += -W4*row[4] - W2*row[6] */ + smlsd v3, a3, a4, v3 /* A2 += -W4*row[4] + W2*row[6] */ + smlad v1, a3, ip, v1 /* A0 += W4*row[4] + W6*row[6] */ + smlsd v4, a3, ip, v4 /* A3 += W4*row[4] - W6*row[6] */ + .endm + +/* + Compute partial IDCT of half row. 
+ shift = left-shift amount + a3 = row[2,0] + a4 = row[3,1] + ip = w42 + + Output in registers v1--v8 +*/ + .macro idct_row4 shift + ldr lr, [pc, #(w46-.-8)] /* lr = W4 | (W6 << 16) */ + ldr v7, [pc, #(w57-.-8)] /* v7 = W5 | (W7 << 16) */ + mov a2, #(1<<(\shift-1)) + smlad v1, a3, ip, a2 + smlsd v4, a3, ip, a2 + ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */ + smlad v2, a3, lr, a2 + smlsd v3, a3, lr, a2 + smusdx fp, a4, v7 /* fp = B3 = W7*row[1] - W5*row[3] */ + smuad v5, a4, ip /* v5 = B0 = W1*row[1] + W3*row[3] */ + pkhtb a3, ip, v7, asr #16 /* a3 = W7 | (W3 << 16) */ + pkhbt a2, ip, v7, lsl #16 /* a2 = W1 | (W5 << 16) */ + smusdx v6, a3, a4 /* v6 = -B1 = W7*row[3] - W3*row[1] */ + smusdx v7, a4, a2 /* v7 = B2 = W5*row[1] - W1*row[3] */ + .endm + +/* + Compute final part of IDCT single row without shift. + Input in registers v1--v8 + Output in registers ip, v1--v3, lr, v5--v7 +*/ + .macro idct_finish + add ip, v1, v5 /* ip = A0 + B0 */ + sub lr, v1, v5 /* lr = A0 - B0 */ + sub v1, v2, v6 /* v1 = A1 + B1 */ + add v5, v2, v6 /* v5 = A1 - B1 */ + add v2, v3, v7 /* v2 = A2 + B2 */ + sub v6, v3, v7 /* v6 = A2 - B2 */ + add v3, v4, fp /* v3 = A3 + B3 */ + sub v7, v4, fp /* v7 = A3 - B3 */ + .endm + +/* + Compute final part of IDCT single row. + shift = right-shift amount + Input/output in registers v1--v8 +*/ + .macro idct_finish_shift shift + add a4, v1, v5 /* a4 = A0 + B0 */ + sub a3, v1, v5 /* a3 = A0 - B0 */ + mov v1, a4, asr #\shift + mov v5, a3, asr #\shift + + sub a4, v2, v6 /* a4 = A1 + B1 */ + add a3, v2, v6 /* a3 = A1 - B1 */ + mov v2, a4, asr #\shift + mov v6, a3, asr #\shift + + add a4, v3, v7 /* a4 = A2 + B2 */ + sub a3, v3, v7 /* a3 = A2 - B2 */ + mov v3, a4, asr #\shift + mov v7, a3, asr #\shift + + add a4, v4, fp /* a4 = A3 + B3 */ + sub a3, v4, fp /* a3 = A3 - B3 */ + mov v4, a4, asr #\shift + mov fp, a3, asr #\shift + .endm + +/* + Compute final part of IDCT single row, saturating results at 8 bits. 
+ shift = right-shift amount + Input/output in registers v1--v8 +*/ + .macro idct_finish_shift_sat shift + add a4, v1, v5 /* a4 = A0 + B0 */ + sub ip, v1, v5 /* ip = A0 - B0 */ + usat v1, #8, a4, asr #\shift + usat v5, #8, ip, asr #\shift + + sub a4, v2, v6 /* a4 = A1 + B1 */ + add ip, v2, v6 /* ip = A1 - B1 */ + usat v2, #8, a4, asr #\shift + usat v6, #8, ip, asr #\shift + + add a4, v3, v7 /* a4 = A2 + B2 */ + sub ip, v3, v7 /* ip = A2 - B2 */ + usat v3, #8, a4, asr #\shift + usat v7, #8, ip, asr #\shift + + add a4, v4, fp /* a4 = A3 + B3 */ + sub ip, v4, fp /* ip = A3 - B3 */ + usat v4, #8, a4, asr #\shift + usat fp, #8, ip, asr #\shift + .endm + +/* + Compute IDCT of single row, storing as column. + a1 = source + a2 = dest +*/ +function idct_row_armv6 + str lr, [sp, #-4]! + + ldr lr, [a1, #12] /* lr = row[7,5] */ + ldr ip, [a1, #4] /* ip = row[6,4] */ + ldr a4, [a1, #8] /* a4 = row[3,1] */ + ldr a3, [a1] /* a3 = row[2,0] */ + orrs lr, lr, ip + cmpeq lr, a4 + cmpeq lr, a3, lsr #16 + beq 1f + str a2, [sp, #-4]! + ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ + cmp lr, #0 + beq 2f + + idct_row ROW_SHIFT + b 3f + +2: idct_row4 ROW_SHIFT + +3: ldr a2, [sp], #4 + idct_finish_shift ROW_SHIFT + + strh v1, [a2] + strh v2, [a2, #(16*2)] + strh v3, [a2, #(16*4)] + strh v4, [a2, #(16*6)] + strh fp, [a2, #(16*1)] + strh v7, [a2, #(16*3)] + strh v6, [a2, #(16*5)] + strh v5, [a2, #(16*7)] + + ldr pc, [sp], #4 + +1: mov a3, a3, lsl #3 + strh a3, [a2] + strh a3, [a2, #(16*2)] + strh a3, [a2, #(16*4)] + strh a3, [a2, #(16*6)] + strh a3, [a2, #(16*1)] + strh a3, [a2, #(16*3)] + strh a3, [a2, #(16*5)] + strh a3, [a2, #(16*7)] + ldr pc, [sp], #4 + .endfunc + +/* + Compute IDCT of single column, read as row. 
+ a1 = source + a2 = dest +*/ +function idct_col_armv6 + stmfd sp!, {a2, lr} + + ldr a3, [a1] /* a3 = row[2,0] */ + ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ + ldr a4, [a1, #8] /* a4 = row[3,1] */ + idct_row COL_SHIFT + ldr a2, [sp], #4 + idct_finish_shift COL_SHIFT + + strh v1, [a2] + strh v2, [a2, #(16*1)] + strh v3, [a2, #(16*2)] + strh v4, [a2, #(16*3)] + strh fp, [a2, #(16*4)] + strh v7, [a2, #(16*5)] + strh v6, [a2, #(16*6)] + strh v5, [a2, #(16*7)] + + ldr pc, [sp], #4 + .endfunc + +/* + Compute IDCT of single column, read as row, store saturated 8-bit. + a1 = source + a2 = dest + a3 = line size +*/ +function idct_col_put_armv6 + stmfd sp!, {a2, a3, lr} + + ldr a3, [a1] /* a3 = row[2,0] */ + ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ + ldr a4, [a1, #8] /* a4 = row[3,1] */ + idct_row COL_SHIFT + ldmfd sp!, {a2, a3} + idct_finish_shift_sat COL_SHIFT + + strb v1, [a2], a3 + strb v2, [a2], a3 + strb v3, [a2], a3 + strb v4, [a2], a3 + strb fp, [a2], a3 + strb v7, [a2], a3 + strb v6, [a2], a3 + strb v5, [a2], a3 + + sub a2, a2, a3, lsl #3 + + ldr pc, [sp], #4 + .endfunc + +/* + Compute IDCT of single column, read as row, add/store saturated 8-bit. 
+ a1 = source + a2 = dest + a3 = line size +*/ +function idct_col_add_armv6 + stmfd sp!, {a2, a3, lr} + + ldr a3, [a1] /* a3 = row[2,0] */ + ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ + ldr a4, [a1, #8] /* a4 = row[3,1] */ + idct_row COL_SHIFT + ldmfd sp!, {a2, a3} + idct_finish + + ldrb a4, [a2] + ldrb v4, [a2, a3] + ldrb fp, [a2, a3, lsl #2] + add ip, a4, ip, asr #COL_SHIFT + usat ip, #8, ip + add v1, v4, v1, asr #COL_SHIFT + strb ip, [a2], a3 + ldrb ip, [a2, a3] + usat v1, #8, v1 + ldrb fp, [a2, a3, lsl #2] + add v2, ip, v2, asr #COL_SHIFT + usat v2, #8, v2 + strb v1, [a2], a3 + ldrb a4, [a2, a3] + ldrb ip, [a2, a3, lsl #2] + strb v2, [a2], a3 + ldrb v4, [a2, a3] + ldrb v1, [a2, a3, lsl #2] + add v3, a4, v3, asr #COL_SHIFT + usat v3, #8, v3 + add v7, v4, v7, asr #COL_SHIFT + usat v7, #8, v7 + add v6, fp, v6, asr #COL_SHIFT + usat v6, #8, v6 + add v5, ip, v5, asr #COL_SHIFT + usat v5, #8, v5 + add lr, v1, lr, asr #COL_SHIFT + usat lr, #8, lr + strb v3, [a2], a3 + strb v7, [a2], a3 + strb v6, [a2], a3 + strb v5, [a2], a3 + strb lr, [a2], a3 + + sub a2, a2, a3, lsl #3 + + ldr pc, [sp], #4 + .endfunc + +/* + Compute 8 IDCT row transforms. 
+ func = IDCT row->col function + width = width of columns in bytes +*/ + .macro idct_rows func width + bl \func + add a1, a1, #(16*2) + add a2, a2, #\width + bl \func + add a1, a1, #(16*2) + add a2, a2, #\width + bl \func + add a1, a1, #(16*2) + add a2, a2, #\width + bl \func + sub a1, a1, #(16*5) + add a2, a2, #\width + bl \func + add a1, a1, #(16*2) + add a2, a2, #\width + bl \func + add a1, a1, #(16*2) + add a2, a2, #\width + bl \func + add a1, a1, #(16*2) + add a2, a2, #\width + bl \func + + sub a1, a1, #(16*7) + .endm + +/* void ff_simple_idct_armv6(DCTELEM *data); */ +function ff_simple_idct_armv6, export=1 + stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr} + sub sp, sp, #128 + + mov a2, sp + idct_rows idct_row_armv6, 2 + mov a2, a1 + mov a1, sp + idct_rows idct_col_armv6, 2 + + add sp, sp, #128 + ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} + .endfunc + +/* ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data); */ +function ff_simple_idct_add_armv6, export=1 + stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} + sub sp, sp, #128 + + mov a1, a3 + mov a2, sp + idct_rows idct_row_armv6, 2 + mov a1, sp + ldr a2, [sp, #128] + ldr a3, [sp, #(128+4)] + idct_rows idct_col_add_armv6, 1 + + add sp, sp, #(128+8) + ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} + .endfunc + +/* ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data); */ +function ff_simple_idct_put_armv6, export=1 + stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} + sub sp, sp, #128 + + mov a1, a3 + mov a2, sp + idct_rows idct_row_armv6, 2 + mov a1, sp + ldr a2, [sp, #128] + ldr a3, [sp, #(128+4)] + idct_rows idct_col_put_armv6, 1 + + add sp, sp, #(128+8) + ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} + .endfunc diff -r c30b92cf446b -r 9281a8a9387a arm/simple_idct_neon.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arm/simple_idct_neon.S Wed Dec 17 00:54:54 2008 +0000 @@ -0,0 +1,362 @@ +/* + * ARM NEON IDCT + * + * Copyright (c) 2008 Mans Rullgard + 
* + * Based on Simple IDCT + * Copyright (c) 2001 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + +#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W4c ((1<<(COL_SHIFT-1))/W4) +#define ROW_SHIFT 11 +#define COL_SHIFT 20 + +#define w1 d0[0] +#define w2 d0[1] +#define w3 d0[2] +#define w4 d0[3] +#define w5 d1[0] +#define w6 d1[1] +#define w7 d1[2] +#define w4c d1[3] + + .fpu neon + + .macro idct_col4_top + vmull.s16 q7, d6, w2 /* q7 = W2 * col[2] */ + vmull.s16 q8, d6, w6 /* q8 = W6 * col[2] */ + vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */ + vadd.i32 q11, q15, q7 + vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */ + vadd.i32 q12, q15, q8 + vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */ + vsub.i32 q13, q15, q8 + vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */ + vsub.i32 q14, q15, q7 + + vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */ + vmlsl.s16 q10, d8, w7 /* q10 -= W7 * 
col[3] */ + vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */ + vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */ + .endm + + .text + .align 6 + +function idct_row4_neon + vmov.i32 q15, #(1<<(ROW_SHIFT-1)) + vld1.64 {d2-d5}, [r2,:128]! + vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */ + vld1.64 {d6,d7}, [r2,:128]! + vorr d10, d3, d5 + vld1.64 {d8,d9}, [r2,:128]! + add r2, r2, #-64 + + vorr d11, d7, d9 + vorr d10, d10, d11 + vmov r3, r4, d10 + + idct_col4_top + + orrs r3, r3, r4 + beq 1f + + vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ + vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ + vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ + vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ + vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ + vadd.i32 q11, q11, q7 + vsub.i32 q12, q12, q7 + vsub.i32 q13, q13, q7 + vadd.i32 q14, q14, q7 + vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ + vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ + vmlal.s16 q9, d9, w7 + vmlsl.s16 q10, d9, w5 + vmlal.s16 q5, d9, w3 + vmlsl.s16 q6, d9, w1 + vadd.i32 q11, q11, q7 + vsub.i32 q12, q12, q8 + vadd.i32 q13, q13, q8 + vsub.i32 q14, q14, q7 + +1: vadd.i32 q3, q11, q9 + vadd.i32 q4, q12, q10 + vshrn.i32 d2, q3, #ROW_SHIFT + vshrn.i32 d4, q4, #ROW_SHIFT + vadd.i32 q7, q13, q5 + vadd.i32 q8, q14, q6 + vtrn.16 d2, d4 + vshrn.i32 d6, q7, #ROW_SHIFT + vshrn.i32 d8, q8, #ROW_SHIFT + vsub.i32 q14, q14, q6 + vsub.i32 q11, q11, q9 + vtrn.16 d6, d8 + vsub.i32 q13, q13, q5 + vshrn.i32 d3, q14, #ROW_SHIFT + vtrn.32 d2, d6 + vsub.i32 q12, q12, q10 + vtrn.32 d4, d8 + vshrn.i32 d5, q13, #ROW_SHIFT + vshrn.i32 d7, q12, #ROW_SHIFT + vshrn.i32 d9, q11, #ROW_SHIFT + + vtrn.16 d3, d5 + vtrn.16 d7, d9 + vtrn.32 d3, d7 + vtrn.32 d5, d9 + + vst1.64 {d2-d5}, [r2,:128]! + vst1.64 {d6-d9}, [r2,:128]! 
+ + bx lr + .endfunc + +function idct_col4_neon + mov ip, #16 + vld1.64 {d2}, [r2,:64], ip /* d2 = col[0] */ + vdup.16 d30, w4c + vld1.64 {d4}, [r2,:64], ip /* d3 = col[1] */ + vadd.i16 d30, d30, d2 + vld1.64 {d6}, [r2,:64], ip /* d4 = col[2] */ + vmull.s16 q15, d30, w4 /* q15 = W4*(col[0]+(1< - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - - .macro require8, val=1 - .eabi_attribute 24, \val - .endm - - .macro preserve8, val=1 - .eabi_attribute 25, \val - .endm - - .macro function name, export=0 -.if \export - .global \name -.endif - .type \name, %function - .func \name -\name: - .endm diff -r c30b92cf446b -r 9281a8a9387a armv4l/dsputil_arm.c --- a/armv4l/dsputil_arm.c Wed Dec 17 00:39:45 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,217 +0,0 @@ -/* - * ARMv4L optimized DSP utils - * Copyright (c) 2001 Lionel Ulmer. - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavcodec/dsputil.h" -#ifdef HAVE_IPP -#include -#endif - -void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx); -void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx); -void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx); - -void j_rev_dct_ARM(DCTELEM *data); -void simple_idct_ARM(DCTELEM *data); - -void simple_idct_armv5te(DCTELEM *data); -void simple_idct_put_armv5te(uint8_t *dest, int line_size, DCTELEM *data); -void simple_idct_add_armv5te(uint8_t *dest, int line_size, DCTELEM *data); - -void ff_simple_idct_armv6(DCTELEM *data); -void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data); -void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data); - -void ff_simple_idct_neon(DCTELEM *data); -void ff_simple_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data); -void ff_simple_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data); - -/* XXX: local hack */ -static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); -static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); - -void put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); -void put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); -void put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); -void put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); - -void 
put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); -void put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); -void put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); - -void put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); - -void ff_prefetch_arm(void *mem, int stride, int h); - -CALL_2X_PIXELS(put_pixels16_x2_arm , put_pixels8_x2_arm , 8) -CALL_2X_PIXELS(put_pixels16_y2_arm , put_pixels8_y2_arm , 8) -CALL_2X_PIXELS(put_pixels16_xy2_arm, put_pixels8_xy2_arm, 8) -CALL_2X_PIXELS(put_no_rnd_pixels16_x2_arm , put_no_rnd_pixels8_x2_arm , 8) -CALL_2X_PIXELS(put_no_rnd_pixels16_y2_arm , put_no_rnd_pixels8_y2_arm , 8) -CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_arm, put_no_rnd_pixels8_xy2_arm, 8) - -void ff_add_pixels_clamped_ARM(short *block, unsigned char *dest, - int line_size); - -/* XXX: those functions should be suppressed ASAP when all IDCTs are - converted */ -static void j_rev_dct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block) -{ - j_rev_dct_ARM (block); - ff_put_pixels_clamped(block, dest, line_size); -} -static void j_rev_dct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block) -{ - j_rev_dct_ARM (block); - ff_add_pixels_clamped(block, dest, line_size); -} -static void simple_idct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block) -{ - simple_idct_ARM (block); - ff_put_pixels_clamped(block, dest, line_size); -} -static void simple_idct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block) -{ - simple_idct_ARM (block); - ff_add_pixels_clamped(block, dest, line_size); -} - -#ifdef HAVE_IPP -static void simple_idct_ipp(DCTELEM *block) -{ - ippiDCT8x8Inv_Video_16s_C1I(block); -} -static void simple_idct_ipp_put(uint8_t *dest, int line_size, DCTELEM *block) -{ - ippiDCT8x8Inv_Video_16s8u_C1R(block, dest, line_size); -} - -void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size); - -static void 
simple_idct_ipp_add(uint8_t *dest, int line_size, DCTELEM *block) -{ - ippiDCT8x8Inv_Video_16s_C1I(block); -#ifdef HAVE_IWMMXT - add_pixels_clamped_iwmmxt(block, dest, line_size); -#else - ff_add_pixels_clamped_ARM(block, dest, line_size); -#endif -} -#endif - -int mm_support(void) -{ - return ENABLE_IWMMXT * FF_MM_IWMMXT; -} - -void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx) -{ - int idct_algo= avctx->idct_algo; - - ff_put_pixels_clamped = c->put_pixels_clamped; - ff_add_pixels_clamped = c->add_pixels_clamped; - - if (avctx->lowres == 0) { - if(idct_algo == FF_IDCT_AUTO){ -#if defined(HAVE_IPP) - idct_algo = FF_IDCT_IPP; -#elif defined(HAVE_NEON) - idct_algo = FF_IDCT_SIMPLENEON; -#elif defined(HAVE_ARMV6) - idct_algo = FF_IDCT_SIMPLEARMV6; -#elif defined(HAVE_ARMV5TE) - idct_algo = FF_IDCT_SIMPLEARMV5TE; -#else - idct_algo = FF_IDCT_ARM; -#endif - } - - if(idct_algo==FF_IDCT_ARM){ - c->idct_put= j_rev_dct_ARM_put; - c->idct_add= j_rev_dct_ARM_add; - c->idct = j_rev_dct_ARM; - c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; - } else if (idct_algo==FF_IDCT_SIMPLEARM){ - c->idct_put= simple_idct_ARM_put; - c->idct_add= simple_idct_ARM_add; - c->idct = simple_idct_ARM; - c->idct_permutation_type= FF_NO_IDCT_PERM; -#ifdef HAVE_ARMV6 - } else if (idct_algo==FF_IDCT_SIMPLEARMV6){ - c->idct_put= ff_simple_idct_put_armv6; - c->idct_add= ff_simple_idct_add_armv6; - c->idct = ff_simple_idct_armv6; - c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; -#endif -#ifdef HAVE_ARMV5TE - } else if (idct_algo==FF_IDCT_SIMPLEARMV5TE){ - c->idct_put= simple_idct_put_armv5te; - c->idct_add= simple_idct_add_armv5te; - c->idct = simple_idct_armv5te; - c->idct_permutation_type = FF_NO_IDCT_PERM; -#endif -#ifdef HAVE_IPP - } else if (idct_algo==FF_IDCT_IPP){ - c->idct_put= simple_idct_ipp_put; - c->idct_add= simple_idct_ipp_add; - c->idct = simple_idct_ipp; - c->idct_permutation_type= FF_NO_IDCT_PERM; -#endif -#ifdef HAVE_NEON - } else if (idct_algo==FF_IDCT_SIMPLENEON){ - 
c->idct_put= ff_simple_idct_put_neon; - c->idct_add= ff_simple_idct_add_neon; - c->idct = ff_simple_idct_neon; - c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM; -#endif - } - } - - c->put_pixels_tab[0][0] = put_pixels16_arm; - c->put_pixels_tab[0][1] = put_pixels16_x2_arm; - c->put_pixels_tab[0][2] = put_pixels16_y2_arm; - c->put_pixels_tab[0][3] = put_pixels16_xy2_arm; - c->put_no_rnd_pixels_tab[0][0] = put_pixels16_arm; - c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_arm; - c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_arm; - c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_arm; - c->put_pixels_tab[1][0] = put_pixels8_arm; - c->put_pixels_tab[1][1] = put_pixels8_x2_arm; - c->put_pixels_tab[1][2] = put_pixels8_y2_arm; - c->put_pixels_tab[1][3] = put_pixels8_xy2_arm; - c->put_no_rnd_pixels_tab[1][0] = put_pixels8_arm; - c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_arm; - c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_arm; - c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_arm; - -#ifdef HAVE_ARMV5TE - c->prefetch = ff_prefetch_arm; -#endif - -#ifdef HAVE_IWMMXT - dsputil_init_iwmmxt(c, avctx); -#endif -#ifdef HAVE_ARMVFP - ff_float_init_arm_vfp(c, avctx); -#endif -#ifdef HAVE_NEON - ff_dsputil_init_neon(c, avctx); -#endif -} diff -r c30b92cf446b -r 9281a8a9387a armv4l/dsputil_arm_s.S --- a/armv4l/dsputil_arm_s.S Wed Dec 17 00:39:45 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,799 +0,0 @@ -@ -@ ARMv4L optimized DSP utils -@ Copyright (c) 2004 AGAWA Koji -@ -@ This file is part of FFmpeg. -@ -@ FFmpeg is free software; you can redistribute it and/or -@ modify it under the terms of the GNU Lesser General Public -@ License as published by the Free Software Foundation; either -@ version 2.1 of the License, or (at your option) any later version. 
-@ -@ FFmpeg is distributed in the hope that it will be useful, -@ but WITHOUT ANY WARRANTY; without even the implied warranty of -@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -@ Lesser General Public License for more details. -@ -@ You should have received a copy of the GNU Lesser General Public -@ License along with FFmpeg; if not, write to the Free Software -@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -@ - -#include "config.h" -#include "asm.S" - - preserve8 - -#ifndef HAVE_PLD -.macro pld reg -.endm -#endif - -#ifdef HAVE_ARMV5TE -function ff_prefetch_arm, export=1 - subs r2, r2, #1 - pld [r0] - add r0, r0, r1 - bne ff_prefetch_arm - bx lr - .endfunc -#endif - -.macro ADJ_ALIGN_QUADWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4 - mov \Rd0, \Rn0, lsr #(\shift * 8) - mov \Rd1, \Rn1, lsr #(\shift * 8) - mov \Rd2, \Rn2, lsr #(\shift * 8) - mov \Rd3, \Rn3, lsr #(\shift * 8) - orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8) - orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8) - orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8) - orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8) -.endm -.macro ADJ_ALIGN_DOUBLEWORD shift, R0, R1, R2 - mov \R0, \R0, lsr #(\shift * 8) - orr \R0, \R0, \R1, lsl #(32 - \shift * 8) - mov \R1, \R1, lsr #(\shift * 8) - orr \R1, \R1, \R2, lsl #(32 - \shift * 8) -.endm -.macro ADJ_ALIGN_DOUBLEWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2 - mov \Rdst0, \Rsrc0, lsr #(\shift * 8) - mov \Rdst1, \Rsrc1, lsr #(\shift * 8) - orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8)) - orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8)) -.endm - -.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask - @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1) - @ Rmask = 0xFEFEFEFE - @ Rn = destroy - eor \Rd0, \Rn0, \Rm0 - eor \Rd1, \Rn1, \Rm1 - orr \Rn0, \Rn0, \Rm0 - orr \Rn1, \Rn1, \Rm1 - and \Rd0, \Rd0, \Rmask - and \Rd1, \Rd1, \Rmask - sub \Rd0, \Rn0, \Rd0, lsr #1 - sub \Rd1, \Rn1, \Rd1, lsr #1 -.endm - -.macro 
NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask - @ Rd = (Rn & Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1) - @ Rmask = 0xFEFEFEFE - @ Rn = destroy - eor \Rd0, \Rn0, \Rm0 - eor \Rd1, \Rn1, \Rm1 - and \Rn0, \Rn0, \Rm0 - and \Rn1, \Rn1, \Rm1 - and \Rd0, \Rd0, \Rmask - and \Rd1, \Rd1, \Rmask - add \Rd0, \Rn0, \Rd0, lsr #1 - add \Rd1, \Rn1, \Rd1, lsr #1 -.endm - -@ ---------------------------------------------------------------- - .align 8 -function put_pixels16_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - stmfd sp!, {r4-r11, lr} @ R14 is also called LR - adr r5, 5f - ands r4, r1, #3 - bic r1, r1, #3 - add r5, r5, r4, lsl #2 - ldrne pc, [r5] -1: - ldmia r1, {r4-r7} - add r1, r1, r2 - stmia r0, {r4-r7} - pld [r1] - subs r3, r3, #1 - add r0, r0, r2 - bne 1b - ldmfd sp!, {r4-r11, pc} - .align 8 -2: - ldmia r1, {r4-r8} - add r1, r1, r2 - ADJ_ALIGN_QUADWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8 - pld [r1] - subs r3, r3, #1 - stmia r0, {r9-r12} - add r0, r0, r2 - bne 2b - ldmfd sp!, {r4-r11, pc} - .align 8 -3: - ldmia r1, {r4-r8} - add r1, r1, r2 - ADJ_ALIGN_QUADWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8 - pld [r1] - subs r3, r3, #1 - stmia r0, {r9-r12} - add r0, r0, r2 - bne 3b - ldmfd sp!, {r4-r11, pc} - .align 8 -4: - ldmia r1, {r4-r8} - add r1, r1, r2 - ADJ_ALIGN_QUADWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8 - pld [r1] - subs r3, r3, #1 - stmia r0, {r9-r12} - add r0, r0, r2 - bne 4b - ldmfd sp!, {r4-r11,pc} - .align 8 -5: - .word 1b - .word 2b - .word 3b - .word 4b - .endfunc - -@ ---------------------------------------------------------------- - .align 8 -function put_pixels8_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - stmfd sp!, {r4-r5,lr} @ R14 is also called LR - adr r5, 5f - ands r4, r1, #3 - bic r1, r1, #3 - add r5, r5, r4, lsl #2 - ldrne pc, [r5] -1: - 
ldmia r1, {r4-r5} - add r1, r1, r2 - subs r3, r3, #1 - pld [r1] - stmia r0, {r4-r5} - add r0, r0, r2 - bne 1b - ldmfd sp!, {r4-r5,pc} - .align 8 -2: - ldmia r1, {r4-r5, r12} - add r1, r1, r2 - ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r12 - pld [r1] - subs r3, r3, #1 - stmia r0, {r4-r5} - add r0, r0, r2 - bne 2b - ldmfd sp!, {r4-r5,pc} - .align 8 -3: - ldmia r1, {r4-r5, r12} - add r1, r1, r2 - ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r12 - pld [r1] - subs r3, r3, #1 - stmia r0, {r4-r5} - add r0, r0, r2 - bne 3b - ldmfd sp!, {r4-r5,pc} - .align 8 -4: - ldmia r1, {r4-r5, r12} - add r1, r1, r2 - ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r12 - pld [r1] - subs r3, r3, #1 - stmia r0, {r4-r5} - add r0, r0, r2 - bne 4b - ldmfd sp!, {r4-r5,pc} - .align 8 -5: - .word 1b - .word 2b - .word 3b - .word 4b - .endfunc - -@ ---------------------------------------------------------------- - .align 8 -function put_pixels8_x2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - stmfd sp!, {r4-r10,lr} @ R14 is also called LR - adr r5, 5f - ands r4, r1, #3 - ldr r12, [r5] - add r5, r5, r4, lsl #2 - bic r1, r1, #3 - ldrne pc, [r5] -1: - ldmia r1, {r4-r5, r10} - add r1, r1, r2 - ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 - pld [r1] - RND_AVG32 r8, r9, r4, r5, r6, r7, r12 - subs r3, r3, #1 - stmia r0, {r8-r9} - add r0, r0, r2 - bne 1b - ldmfd sp!, {r4-r10,pc} - .align 8 -2: - ldmia r1, {r4-r5, r10} - add r1, r1, r2 - ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 - ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10 - pld [r1] - RND_AVG32 r4, r5, r6, r7, r8, r9, r12 - subs r3, r3, #1 - stmia r0, {r4-r5} - add r0, r0, r2 - bne 2b - ldmfd sp!, {r4-r10,pc} - .align 8 -3: - ldmia r1, {r4-r5, r10} - add r1, r1, r2 - ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10 - ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10 - pld [r1] - RND_AVG32 r4, r5, r6, r7, r8, r9, r12 - subs r3, r3, #1 - stmia r0, {r4-r5} - add r0, r0, r2 - bne 3b - ldmfd sp!, 
{r4-r10,pc} - .align 8 -4: - ldmia r1, {r4-r5, r10} - add r1, r1, r2 - ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10 - pld [r1] - RND_AVG32 r8, r9, r6, r7, r5, r10, r12 - subs r3, r3, #1 - stmia r0, {r8-r9} - add r0, r0, r2 - bne 4b - ldmfd sp!, {r4-r10,pc} @@ update PC with LR content. - .align 8 -5: - .word 0xFEFEFEFE - .word 2b - .word 3b - .word 4b - .endfunc - - .align 8 -function put_no_rnd_pixels8_x2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - stmfd sp!, {r4-r10,lr} @ R14 is also called LR - adr r5, 5f - ands r4, r1, #3 - ldr r12, [r5] - add r5, r5, r4, lsl #2 - bic r1, r1, #3 - ldrne pc, [r5] -1: - ldmia r1, {r4-r5, r10} - add r1, r1, r2 - ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 - pld [r1] - NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 - subs r3, r3, #1 - stmia r0, {r8-r9} - add r0, r0, r2 - bne 1b - ldmfd sp!, {r4-r10,pc} - .align 8 -2: - ldmia r1, {r4-r5, r10} - add r1, r1, r2 - ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 - ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10 - pld [r1] - NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 - subs r3, r3, #1 - stmia r0, {r4-r5} - add r0, r0, r2 - bne 2b - ldmfd sp!, {r4-r10,pc} - .align 8 -3: - ldmia r1, {r4-r5, r10} - add r1, r1, r2 - ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10 - ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10 - pld [r1] - NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 - subs r3, r3, #1 - stmia r0, {r4-r5} - add r0, r0, r2 - bne 3b - ldmfd sp!, {r4-r10,pc} - .align 8 -4: - ldmia r1, {r4-r5, r10} - add r1, r1, r2 - ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10 - pld [r1] - NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12 - subs r3, r3, #1 - stmia r0, {r8-r9} - add r0, r0, r2 - bne 4b - ldmfd sp!, {r4-r10,pc} @@ update PC with LR content. 
- .align 8 -5: - .word 0xFEFEFEFE - .word 2b - .word 3b - .word 4b - .endfunc - - -@ ---------------------------------------------------------------- - .align 8 -function put_pixels8_y2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - stmfd sp!, {r4-r11,lr} @ R14 is also called LR - adr r5, 5f - ands r4, r1, #3 - mov r3, r3, lsr #1 - ldr r12, [r5] - add r5, r5, r4, lsl #2 - bic r1, r1, #3 - ldrne pc, [r5] -1: - ldmia r1, {r4-r5} - add r1, r1, r2 -6: ldmia r1, {r6-r7} - add r1, r1, r2 - pld [r1] - RND_AVG32 r8, r9, r4, r5, r6, r7, r12 - ldmia r1, {r4-r5} - add r1, r1, r2 - stmia r0, {r8-r9} - add r0, r0, r2 - pld [r1] - RND_AVG32 r8, r9, r6, r7, r4, r5, r12 - subs r3, r3, #1 - stmia r0, {r8-r9} - add r0, r0, r2 - bne 6b - ldmfd sp!, {r4-r11,pc} - .align 8 -2: - ldmia r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6 -6: ldmia r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9 - RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stmia r0, {r10-r11} - add r0, r0, r2 - ldmia r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6 - subs r3, r3, #1 - RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stmia r0, {r10-r11} - add r0, r0, r2 - bne 6b - ldmfd sp!, {r4-r11,pc} - .align 8 -3: - ldmia r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6 -6: ldmia r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9 - RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stmia r0, {r10-r11} - add r0, r0, r2 - ldmia r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6 - subs r3, r3, #1 - RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stmia r0, {r10-r11} - add r0, r0, r2 - bne 6b - ldmfd sp!, {r4-r11,pc} - .align 8 -4: - ldmia r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6 -6: ldmia r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 3, 
r7, r8, r9 - RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stmia r0, {r10-r11} - add r0, r0, r2 - ldmia r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6 - subs r3, r3, #1 - RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stmia r0, {r10-r11} - add r0, r0, r2 - bne 6b - ldmfd sp!, {r4-r11,pc} - - .align 8 -5: - .word 0xFEFEFEFE - .word 2b - .word 3b - .word 4b - .endfunc - - .align 8 -function put_no_rnd_pixels8_y2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - stmfd sp!, {r4-r11,lr} @ R14 is also called LR - adr r5, 5f - ands r4, r1, #3 - mov r3, r3, lsr #1 - ldr r12, [r5] - add r5, r5, r4, lsl #2 - bic r1, r1, #3 - ldrne pc, [r5] -1: - ldmia r1, {r4-r5} - add r1, r1, r2 -6: ldmia r1, {r6-r7} - add r1, r1, r2 - pld [r1] - NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 - ldmia r1, {r4-r5} - add r1, r1, r2 - stmia r0, {r8-r9} - add r0, r0, r2 - pld [r1] - NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12 - subs r3, r3, #1 - stmia r0, {r8-r9} - add r0, r0, r2 - bne 6b - ldmfd sp!, {r4-r11,pc} - .align 8 -2: - ldmia r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6 -6: ldmia r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9 - NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stmia r0, {r10-r11} - add r0, r0, r2 - ldmia r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6 - subs r3, r3, #1 - NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stmia r0, {r10-r11} - add r0, r0, r2 - bne 6b - ldmfd sp!, {r4-r11,pc} - .align 8 -3: - ldmia r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6 -6: ldmia r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9 - NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stmia r0, {r10-r11} - add r0, r0, r2 - ldmia r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6 - subs r3, r3, #1 - NO_RND_AVG32 r10, r11, 
r7, r8, r4, r5, r12 - stmia r0, {r10-r11} - add r0, r0, r2 - bne 6b - ldmfd sp!, {r4-r11,pc} - .align 8 -4: - ldmia r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6 -6: ldmia r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9 - NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stmia r0, {r10-r11} - add r0, r0, r2 - ldmia r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6 - subs r3, r3, #1 - NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stmia r0, {r10-r11} - add r0, r0, r2 - bne 6b - ldmfd sp!, {r4-r11,pc} - .align 8 -5: - .word 0xFEFEFEFE - .word 2b - .word 3b - .word 4b - .endfunc - -@ ---------------------------------------------------------------- -.macro RND_XY2_IT align - @ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202) - @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2) -.if \align == 0 - ldmia r1, {r6-r8} -.elseif \align == 3 - ldmia r1, {r5-r7} -.else - ldmia r1, {r8-r10} -.endif - add r1, r1, r2 - pld [r1] -.if \align == 0 - ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r6, r7, r8 -.elseif \align == 1 - ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r8, r9, r10 - ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r8, r9, r10 -.elseif \align == 2 - ADJ_ALIGN_DOUBLEWORD_D 2, r4, r5, r8, r9, r10 - ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r8, r9, r10 -.elseif \align == 3 - ADJ_ALIGN_DOUBLEWORD_D 3, r4, r5, r5, r6, r7 -.endif - ldr r14, [r12, #0] @ 0x03030303 - tst r3, #1 - and r8, r4, r14 - and r9, r5, r14 - and r10, r6, r14 - and r11, r7, r14 - ldreq r14, [r12, #16] @ 0x02020202/0x01010101 - add r8, r8, r10 - add r9, r9, r11 - addeq r8, r8, r14 - addeq r9, r9, r14 - ldr r14, [r12, #20] @ 0xFCFCFCFC >> 2 - and r4, r14, r4, lsr #2 - and r5, r14, r5, lsr #2 - and r6, r14, r6, lsr #2 - and r7, r14, r7, lsr #2 - add r10, r4, r6 - add r11, r5, r7 - subs r3, r3, #1 -.endm - -.macro RND_XY2_EXPAND align - RND_XY2_IT \align -6: stmfd sp!, {r8-r11} - RND_XY2_IT \align - ldmfd sp!, {r4-r7} - add r4, r4, r8 - add r5, r5, r9 - 
add r6, r6, r10 - add r7, r7, r11 - ldr r14, [r12, #24] @ 0x0F0F0F0F - and r4, r14, r4, lsr #2 - and r5, r14, r5, lsr #2 - add r4, r4, r6 - add r5, r5, r7 - stmia r0, {r4-r5} - add r0, r0, r2 - bge 6b - ldmfd sp!, {r4-r11,pc} -.endm - - .align 8 -function put_pixels8_xy2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - stmfd sp!, {r4-r11,lr} @ R14 is also called LR - adrl r12, 5f - ands r4, r1, #3 - add r5, r12, r4, lsl #2 - bic r1, r1, #3 - ldrne pc, [r5] -1: - RND_XY2_EXPAND 0 - - .align 8 -2: - RND_XY2_EXPAND 1 - - .align 8 -3: - RND_XY2_EXPAND 2 - - .align 8 -4: - RND_XY2_EXPAND 3 - -5: - .word 0x03030303 - .word 2b - .word 3b - .word 4b - .word 0x02020202 - .word 0xFCFCFCFC >> 2 - .word 0x0F0F0F0F - .endfunc - - .align 8 -function put_no_rnd_pixels8_xy2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - stmfd sp!, {r4-r11,lr} @ R14 is also called LR - adrl r12, 5f - ands r4, r1, #3 - add r5, r12, r4, lsl #2 - bic r1, r1, #3 - ldrne pc, [r5] -1: - RND_XY2_EXPAND 0 - - .align 8 -2: - RND_XY2_EXPAND 1 - - .align 8 -3: - RND_XY2_EXPAND 2 - - .align 8 -4: - RND_XY2_EXPAND 3 - -5: - .word 0x03030303 - .word 2b - .word 3b - .word 4b - .word 0x01010101 - .word 0xFCFCFCFC >> 2 - .word 0x0F0F0F0F - .endfunc - -@ void ff_add_pixels_clamped_ARM(int16_t *block, uint8_t *dest, int stride) -function ff_add_pixels_clamped_ARM, export=1 - push {r4-r10} - mov r10, #8 -1: - ldr r4, [r1] /* load dest */ - /* block[0] and block[1]*/ - ldrsh r5, [r0] - ldrsh r7, [r0, #2] - and r6, r4, #0xFF - and r8, r4, #0xFF00 - add r6, r5, r6 - add r8, r7, r8, lsr #8 - mvn r5, r5 - mvn r7, r7 - tst r6, #0x100 - movne r6, r5, lsr #24 - tst r8, #0x100 - movne r8, r7, lsr #24 - mov r9, r6 - ldrsh r5, [r0, #4] /* moved form [A] */ - orr r9, r9, r8, lsl #8 - /* block[2] and block[3] */ - /* [A] */ - ldrsh r7, 
[r0, #6] - and r6, r4, #0xFF0000 - and r8, r4, #0xFF000000 - add r6, r5, r6, lsr #16 - add r8, r7, r8, lsr #24 - mvn r5, r5 - mvn r7, r7 - tst r6, #0x100 - movne r6, r5, lsr #24 - tst r8, #0x100 - movne r8, r7, lsr #24 - orr r9, r9, r6, lsl #16 - ldr r4, [r1, #4] /* moved form [B] */ - orr r9, r9, r8, lsl #24 - /* store dest */ - ldrsh r5, [r0, #8] /* moved form [C] */ - str r9, [r1] - - /* load dest */ - /* [B] */ - /* block[4] and block[5] */ - /* [C] */ - ldrsh r7, [r0, #10] - and r6, r4, #0xFF - and r8, r4, #0xFF00 - add r6, r5, r6 - add r8, r7, r8, lsr #8 - mvn r5, r5 - mvn r7, r7 - tst r6, #0x100 - movne r6, r5, lsr #24 - tst r8, #0x100 - movne r8, r7, lsr #24 - mov r9, r6 - ldrsh r5, [r0, #12] /* moved from [D] */ - orr r9, r9, r8, lsl #8 - /* block[6] and block[7] */ - /* [D] */ - ldrsh r7, [r0, #14] - and r6, r4, #0xFF0000 - and r8, r4, #0xFF000000 - add r6, r5, r6, lsr #16 - add r8, r7, r8, lsr #24 - mvn r5, r5 - mvn r7, r7 - tst r6, #0x100 - movne r6, r5, lsr #24 - tst r8, #0x100 - movne r8, r7, lsr #24 - orr r9, r9, r6, lsl #16 - add r0, r0, #16 /* moved from [E] */ - orr r9, r9, r8, lsl #24 - subs r10, r10, #1 /* moved from [F] */ - /* store dest */ - str r9, [r1, #4] - - /* [E] */ - /* [F] */ - add r1, r1, r2 - bne 1b - - pop {r4-r10} - bx lr - .endfunc diff -r c30b92cf446b -r 9281a8a9387a armv4l/dsputil_iwmmxt.c --- a/armv4l/dsputil_iwmmxt.c Wed Dec 17 00:39:45 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,205 +0,0 @@ -/* - * iWMMXt optimized DSP utils - * Copyright (c) 2004 AGAWA Koji - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavcodec/dsputil.h" - -#define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt -#define SET_RND(regd) __asm__ volatile ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12"); -#define WAVG2B "wavg2b" -#include "dsputil_iwmmxt_rnd_template.c" -#undef DEF -#undef SET_RND -#undef WAVG2B - -#define DEF(x, y) x ## _ ## y ##_iwmmxt -#define SET_RND(regd) __asm__ volatile ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12"); -#define WAVG2B "wavg2br" -#include "dsputil_iwmmxt_rnd_template.c" -#undef DEF -#undef SET_RND -#undef WAVG2BR - -// need scheduling -#define OP(AVG) \ - __asm__ volatile ( \ - /* alignment */ \ - "and r12, %[pixels], #7 \n\t" \ - "bic %[pixels], %[pixels], #7 \n\t" \ - "tmcr wcgr1, r12 \n\t" \ - \ - "wldrd wr0, [%[pixels]] \n\t" \ - "wldrd wr1, [%[pixels], #8] \n\t" \ - "add %[pixels], %[pixels], %[line_size] \n\t" \ - "walignr1 wr4, wr0, wr1 \n\t" \ - \ - "1: \n\t" \ - \ - "wldrd wr2, [%[pixels]] \n\t" \ - "wldrd wr3, [%[pixels], #8] \n\t" \ - "add %[pixels], %[pixels], %[line_size] \n\t" \ - "pld [%[pixels]] \n\t" \ - "walignr1 wr5, wr2, wr3 \n\t" \ - AVG " wr6, wr4, wr5 \n\t" \ - "wstrd wr6, [%[block]] \n\t" \ - "add %[block], %[block], %[line_size] \n\t" \ - \ - "wldrd wr0, [%[pixels]] \n\t" \ - "wldrd wr1, [%[pixels], #8] \n\t" \ - "add %[pixels], %[pixels], %[line_size] \n\t" \ - "walignr1 wr4, wr0, wr1 \n\t" \ - "pld [%[pixels]] \n\t" \ - AVG " wr6, wr4, wr5 \n\t" \ - "wstrd wr6, [%[block]] \n\t" \ - "add %[block], %[block], %[line_size] \n\t" \ - \ - "subs %[h], %[h], #2 
\n\t" \ - "bne 1b \n\t" \ - : [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h) \ - : [line_size]"r"(line_size) \ - : "memory", "r12"); -void put_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - OP("wavg2br"); -} -void put_no_rnd_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - OP("wavg2b"); -} -#undef OP - -void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size) -{ - uint8_t *pixels2 = pixels + line_size; - - __asm__ volatile ( - "mov r12, #4 \n\t" - "1: \n\t" - "pld [%[pixels], %[line_size2]] \n\t" - "pld [%[pixels2], %[line_size2]] \n\t" - "wldrd wr4, [%[pixels]] \n\t" - "wldrd wr5, [%[pixels2]] \n\t" - "pld [%[block], #32] \n\t" - "wunpckelub wr6, wr4 \n\t" - "wldrd wr0, [%[block]] \n\t" - "wunpckehub wr7, wr4 \n\t" - "wldrd wr1, [%[block], #8] \n\t" - "wunpckelub wr8, wr5 \n\t" - "wldrd wr2, [%[block], #16] \n\t" - "wunpckehub wr9, wr5 \n\t" - "wldrd wr3, [%[block], #24] \n\t" - "add %[block], %[block], #32 \n\t" - "waddhss wr10, wr0, wr6 \n\t" - "waddhss wr11, wr1, wr7 \n\t" - "waddhss wr12, wr2, wr8 \n\t" - "waddhss wr13, wr3, wr9 \n\t" - "wpackhus wr14, wr10, wr11 \n\t" - "wpackhus wr15, wr12, wr13 \n\t" - "wstrd wr14, [%[pixels]] \n\t" - "add %[pixels], %[pixels], %[line_size2] \n\t" - "subs r12, r12, #1 \n\t" - "wstrd wr15, [%[pixels2]] \n\t" - "add %[pixels2], %[pixels2], %[line_size2] \n\t" - "bne 1b \n\t" - : [block]"+r"(block), [pixels]"+r"(pixels), [pixels2]"+r"(pixels2) - : [line_size2]"r"(line_size << 1) - : "cc", "memory", "r12"); -} - -static void clear_blocks_iwmmxt(DCTELEM *blocks) -{ - __asm__ volatile( - "wzero wr0 \n\t" - "mov r1, #(128 * 6 / 32) \n\t" - "1: \n\t" - "wstrd wr0, [%0] \n\t" - "wstrd wr0, [%0, #8] \n\t" - "wstrd wr0, [%0, #16] \n\t" - "wstrd wr0, [%0, #24] \n\t" - "subs r1, r1, #1 \n\t" - "add %0, %0, #32 \n\t" - "bne 1b \n\t" - : "+r"(blocks) - : - : "r1" - ); -} - -static void nop(uint8_t *block, const uint8_t 
*pixels, int line_size, int h) -{ - return; -} - -/* A run time test is not simple. If this file is compiled in - * then we should install the functions - */ -int mm_flags = FF_MM_IWMMXT; /* multimedia extension flags */ - -void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx) -{ - if (avctx->dsp_mask) { - if (avctx->dsp_mask & FF_MM_FORCE) - mm_flags |= (avctx->dsp_mask & 0xffff); - else - mm_flags &= ~(avctx->dsp_mask & 0xffff); - } - - if (!(mm_flags & FF_MM_IWMMXT)) return; - - c->add_pixels_clamped = add_pixels_clamped_iwmmxt; - - c->clear_blocks = clear_blocks_iwmmxt; - - c->put_pixels_tab[0][0] = put_pixels16_iwmmxt; - c->put_pixels_tab[0][1] = put_pixels16_x2_iwmmxt; - c->put_pixels_tab[0][2] = put_pixels16_y2_iwmmxt; - c->put_pixels_tab[0][3] = put_pixels16_xy2_iwmmxt; - c->put_no_rnd_pixels_tab[0][0] = put_pixels16_iwmmxt; - c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_iwmmxt; - c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_iwmmxt; - c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_iwmmxt; - - c->put_pixels_tab[1][0] = put_pixels8_iwmmxt; - c->put_pixels_tab[1][1] = put_pixels8_x2_iwmmxt; - c->put_pixels_tab[1][2] = put_pixels8_y2_iwmmxt; - c->put_pixels_tab[1][3] = put_pixels8_xy2_iwmmxt; - c->put_no_rnd_pixels_tab[1][0] = put_pixels8_iwmmxt; - c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_iwmmxt; - c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_iwmmxt; - c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_iwmmxt; - - c->avg_pixels_tab[0][0] = avg_pixels16_iwmmxt; - c->avg_pixels_tab[0][1] = avg_pixels16_x2_iwmmxt; - c->avg_pixels_tab[0][2] = avg_pixels16_y2_iwmmxt; - c->avg_pixels_tab[0][3] = avg_pixels16_xy2_iwmmxt; - c->avg_no_rnd_pixels_tab[0][0] = avg_pixels16_iwmmxt; - c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_iwmmxt; - c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_iwmmxt; - c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_iwmmxt; - - c->avg_pixels_tab[1][0] 
= avg_pixels8_iwmmxt; - c->avg_pixels_tab[1][1] = avg_pixels8_x2_iwmmxt; - c->avg_pixels_tab[1][2] = avg_pixels8_y2_iwmmxt; - c->avg_pixels_tab[1][3] = avg_pixels8_xy2_iwmmxt; - c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_iwmmxt; - c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_iwmmxt; - c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_iwmmxt; - c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_iwmmxt; -} diff -r c30b92cf446b -r 9281a8a9387a armv4l/dsputil_iwmmxt_rnd_template.c --- a/armv4l/dsputil_iwmmxt_rnd_template.c Wed Dec 17 00:39:45 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1114 +0,0 @@ -/* - * iWMMXt optimized DSP utils - * copyright (c) 2004 AGAWA Koji - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - __asm__ volatile ( - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r4, %[pixels], %[line_size] \n\t" - "add r5, %[block], %[line_size] \n\t" - "mov %[line_size], %[line_size], lsl #1 \n\t" - "1: \n\t" - "wldrd wr0, [%[pixels]] \n\t" - "subs %[h], %[h], #2 \n\t" - "wldrd wr1, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "wldrd wr3, [r4] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "wldrd wr4, [r4, #8] \n\t" - "add r4, r4, %[line_size] \n\t" - "walignr1 wr8, wr0, wr1 \n\t" - "pld [r4] \n\t" - "pld [r4, #32] \n\t" - "walignr1 wr10, wr3, wr4 \n\t" - "wstrd wr8, [%[block]] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "wstrd wr10, [r5] \n\t" - "add r5, r5, %[line_size] \n\t" - "bne 1b \n\t" - : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) - : - : "memory", "r4", "r5", "r12"); -} - -void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - __asm__ volatile ( - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r4, %[pixels], %[line_size] \n\t" - "add r5, %[block], %[line_size] \n\t" - "mov %[line_size], %[line_size], lsl #1 \n\t" - "1: \n\t" - "wldrd wr0, [%[pixels]] \n\t" - "subs %[h], %[h], #2 \n\t" - "wldrd wr1, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "wldrd wr3, [r4] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "wldrd wr4, [r4, #8] \n\t" - "add r4, r4, %[line_size] \n\t" - "walignr1 wr8, wr0, wr1 \n\t" - "wldrd wr0, [%[block]] 
\n\t" - "wldrd wr2, [r5] \n\t" - "pld [r4] \n\t" - "pld [r4, #32] \n\t" - "walignr1 wr10, wr3, wr4 \n\t" - WAVG2B" wr8, wr8, wr0 \n\t" - WAVG2B" wr10, wr10, wr2 \n\t" - "wstrd wr8, [%[block]] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "wstrd wr10, [r5] \n\t" - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "add r5, r5, %[line_size] \n\t" - "pld [r5] \n\t" - "pld [r5, #32] \n\t" - "bne 1b \n\t" - : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) - : - : "memory", "r4", "r5", "r12"); -} - -void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - __asm__ volatile ( - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r4, %[pixels], %[line_size] \n\t" - "add r5, %[block], %[line_size] \n\t" - "mov %[line_size], %[line_size], lsl #1 \n\t" - "1: \n\t" - "wldrd wr0, [%[pixels]] \n\t" - "wldrd wr1, [%[pixels], #8] \n\t" - "subs %[h], %[h], #2 \n\t" - "wldrd wr2, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "wldrd wr3, [r4] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr8, wr0, wr1 \n\t" - "wldrd wr4, [r4, #8] \n\t" - "walignr1 wr9, wr1, wr2 \n\t" - "wldrd wr5, [r4, #16] \n\t" - "add r4, r4, %[line_size] \n\t" - "pld [r4] \n\t" - "pld [r4, #32] \n\t" - "walignr1 wr10, wr3, wr4 \n\t" - "wstrd wr8, [%[block]] \n\t" - "walignr1 wr11, wr4, wr5 \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "wstrd wr10, [r5] \n\t" - "wstrd wr11, [r5, #8] \n\t" - "add r5, r5, %[line_size] \n\t" - "bne 1b \n\t" - : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) - : - : "memory", "r4", "r5", "r12"); -} - -void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - __asm__ volatile ( - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "pld 
[%[block]] \n\t" - "pld [%[block], #32] \n\t" - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r4, %[pixels], %[line_size]\n\t" - "add r5, %[block], %[line_size] \n\t" - "mov %[line_size], %[line_size], lsl #1 \n\t" - "1: \n\t" - "wldrd wr0, [%[pixels]] \n\t" - "wldrd wr1, [%[pixels], #8] \n\t" - "subs %[h], %[h], #2 \n\t" - "wldrd wr2, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "wldrd wr3, [r4] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr8, wr0, wr1 \n\t" - "wldrd wr4, [r4, #8] \n\t" - "walignr1 wr9, wr1, wr2 \n\t" - "wldrd wr5, [r4, #16] \n\t" - "add r4, r4, %[line_size] \n\t" - "wldrd wr0, [%[block]] \n\t" - "pld [r4] \n\t" - "wldrd wr1, [%[block], #8] \n\t" - "pld [r4, #32] \n\t" - "wldrd wr2, [r5] \n\t" - "walignr1 wr10, wr3, wr4 \n\t" - "wldrd wr3, [r5, #8] \n\t" - WAVG2B" wr8, wr8, wr0 \n\t" - WAVG2B" wr9, wr9, wr1 \n\t" - WAVG2B" wr10, wr10, wr2 \n\t" - "wstrd wr8, [%[block]] \n\t" - "walignr1 wr11, wr4, wr5 \n\t" - WAVG2B" wr11, wr11, wr3 \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "wstrd wr10, [r5] \n\t" - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "wstrd wr11, [r5, #8] \n\t" - "add r5, r5, %[line_size] \n\t" - "pld [r5] \n\t" - "pld [r5, #32] \n\t" - "bne 1b \n\t" - : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) - : - : "memory", "r4", "r5", "r12"); -} - -void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - SET_RND(wr15); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r12, r12, #1 \n\t" - "add r4, %[pixels], %[line_size]\n\t" - "tmcr wcgr2, 
r12 \n\t" - "add r5, %[block], %[line_size] \n\t" - "mov %[line_size], %[line_size], lsl #1 \n\t" - - "1: \n\t" - "wldrd wr10, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "wldrd wr13, [r4] \n\t" - "pld [%[pixels]] \n\t" - "wldrd wr14, [r4, #8] \n\t" - "pld [%[pixels], #32] \n\t" - "add r4, r4, %[line_size] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" - "pld [r4] \n\t" - "pld [r4, #32] \n\t" - "walignr1 wr2, wr13, wr14 \n\t" - "wmoveq wr4, wr11 \n\t" - "wmoveq wr6, wr14 \n\t" - "walignr2ne wr4, wr10, wr11 \n\t" - "walignr2ne wr6, wr13, wr14 \n\t" - WAVG2B" wr0, wr0, wr4 \n\t" - WAVG2B" wr2, wr2, wr6 \n\t" - "wstrd wr0, [%[block]] \n\t" - "subs %[h], %[h], #2 \n\t" - "wstrd wr2, [r5] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "add r5, r5, %[line_size] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) - : - : "r4", "r5", "r12", "memory"); -} - -void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - SET_RND(wr15); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r12, r12, #1 \n\t" - "add r4, %[pixels], %[line_size]\n\t" - "tmcr wcgr2, r12 \n\t" - "add r5, %[block], %[line_size] \n\t" - "mov %[line_size], %[line_size], lsl #1 \n\t" - - "1: \n\t" - "wldrd wr10, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "wldrd wr12, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "wldrd wr13, [r4] \n\t" - "pld [%[pixels]] \n\t" - "wldrd wr14, [r4, #8] \n\t" - "pld [%[pixels], #32] \n\t" - "wldrd wr15, [r4, #16] \n\t" - "add r4, r4, %[line_size] \n\t" - "walignr1 wr0, wr10, 
wr11 \n\t" - "pld [r4] \n\t" - "pld [r4, #32] \n\t" - "walignr1 wr1, wr11, wr12 \n\t" - "walignr1 wr2, wr13, wr14 \n\t" - "walignr1 wr3, wr14, wr15 \n\t" - "wmoveq wr4, wr11 \n\t" - "wmoveq wr5, wr12 \n\t" - "wmoveq wr6, wr14 \n\t" - "wmoveq wr7, wr15 \n\t" - "walignr2ne wr4, wr10, wr11 \n\t" - "walignr2ne wr5, wr11, wr12 \n\t" - "walignr2ne wr6, wr13, wr14 \n\t" - "walignr2ne wr7, wr14, wr15 \n\t" - WAVG2B" wr0, wr0, wr4 \n\t" - WAVG2B" wr1, wr1, wr5 \n\t" - "wstrd wr0, [%[block]] \n\t" - WAVG2B" wr2, wr2, wr6 \n\t" - "wstrd wr1, [%[block], #8] \n\t" - WAVG2B" wr3, wr3, wr7 \n\t" - "add %[block], %[block], %[line_size] \n\t" - "wstrd wr2, [r5] \n\t" - "subs %[h], %[h], #2 \n\t" - "wstrd wr3, [r5, #8] \n\t" - "add r5, r5, %[line_size] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) - : - : "r4", "r5", "r12", "memory"); -} - -void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - SET_RND(wr15); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r12, r12, #1 \n\t" - "add r4, %[pixels], %[line_size]\n\t" - "tmcr wcgr2, r12 \n\t" - "add r5, %[block], %[line_size] \n\t" - "mov %[line_size], %[line_size], lsl #1 \n\t" - "pld [r5] \n\t" - "pld [r5, #32] \n\t" - - "1: \n\t" - "wldrd wr10, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "wldrd wr13, [r4] \n\t" - "pld [%[pixels]] \n\t" - "wldrd wr14, [r4, #8] \n\t" - "pld [%[pixels], #32] \n\t" - "add r4, r4, %[line_size] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" - "pld [r4] \n\t" - "pld [r4, #32] \n\t" - "walignr1 wr2, wr13, wr14 
\n\t" - "wmoveq wr4, wr11 \n\t" - "wmoveq wr6, wr14 \n\t" - "walignr2ne wr4, wr10, wr11 \n\t" - "wldrd wr10, [%[block]] \n\t" - "walignr2ne wr6, wr13, wr14 \n\t" - "wldrd wr12, [r5] \n\t" - WAVG2B" wr0, wr0, wr4 \n\t" - WAVG2B" wr2, wr2, wr6 \n\t" - WAVG2B" wr0, wr0, wr10 \n\t" - WAVG2B" wr2, wr2, wr12 \n\t" - "wstrd wr0, [%[block]] \n\t" - "subs %[h], %[h], #2 \n\t" - "wstrd wr2, [r5] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "add r5, r5, %[line_size] \n\t" - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "pld [r5] \n\t" - "pld [r5, #32] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) - : - : "r4", "r5", "r12", "memory"); -} - -void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - SET_RND(wr15); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r12, r12, #1 \n\t" - "add r4, %[pixels], %[line_size]\n\t" - "tmcr wcgr2, r12 \n\t" - "add r5, %[block], %[line_size] \n\t" - "mov %[line_size], %[line_size], lsl #1 \n\t" - "pld [r5] \n\t" - "pld [r5, #32] \n\t" - - "1: \n\t" - "wldrd wr10, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "wldrd wr12, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "wldrd wr13, [r4] \n\t" - "pld [%[pixels]] \n\t" - "wldrd wr14, [r4, #8] \n\t" - "pld [%[pixels], #32] \n\t" - "wldrd wr15, [r4, #16] \n\t" - "add r4, r4, %[line_size] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" - "pld [r4] \n\t" - "pld [r4, #32] \n\t" - "walignr1 wr1, wr11, wr12 \n\t" - "walignr1 wr2, wr13, wr14 \n\t" - "walignr1 wr3, wr14, wr15 \n\t" - "wmoveq wr4, wr11 \n\t" - "wmoveq wr5, 
wr12 \n\t" - "wmoveq wr6, wr14 \n\t" - "wmoveq wr7, wr15 \n\t" - "walignr2ne wr4, wr10, wr11 \n\t" - "walignr2ne wr5, wr11, wr12 \n\t" - "walignr2ne wr6, wr13, wr14 \n\t" - "walignr2ne wr7, wr14, wr15 \n\t" - "wldrd wr10, [%[block]] \n\t" - WAVG2B" wr0, wr0, wr4 \n\t" - "wldrd wr11, [%[block], #8] \n\t" - WAVG2B" wr1, wr1, wr5 \n\t" - "wldrd wr12, [r5] \n\t" - WAVG2B" wr2, wr2, wr6 \n\t" - "wldrd wr13, [r5, #8] \n\t" - WAVG2B" wr3, wr3, wr7 \n\t" - WAVG2B" wr0, wr0, wr10 \n\t" - WAVG2B" wr1, wr1, wr11 \n\t" - WAVG2B" wr2, wr2, wr12 \n\t" - WAVG2B" wr3, wr3, wr13 \n\t" - "wstrd wr0, [%[block]] \n\t" - "subs %[h], %[h], #2 \n\t" - "wstrd wr1, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "wstrd wr2, [r5] \n\t" - "pld [%[block]] \n\t" - "wstrd wr3, [r5, #8] \n\t" - "add r5, r5, %[line_size] \n\t" - "pld [%[block], #32] \n\t" - "pld [r5] \n\t" - "pld [r5, #32] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) - : - :"r4", "r5", "r12", "memory"); -} - -void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - __asm__ volatile( - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "and r12, %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "pld [%[block]] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - - "1: \n\t" - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr4, wr10, wr11 \n\t" - "wldrd wr10, [%[block]] \n\t" - WAVG2B" wr8, wr0, wr4 \n\t" - WAVG2B" wr8, wr8, wr10 \n\t" - "wstrd wr8, [%[block]] \n\t" - "add 
%[block], %[block], %[line_size] \n\t" - - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "pld [%[block]] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" - "wldrd wr10, [%[block]] \n\t" - WAVG2B" wr8, wr0, wr4 \n\t" - WAVG2B" wr8, wr8, wr10 \n\t" - "wstrd wr8, [%[block]] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - "subs %[h], %[h], #2 \n\t" - "pld [%[block]] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) - : - : "cc", "memory", "r12"); -} - -void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - __asm__ volatile( - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "and r12, %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "wldrd wr12, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" - "walignr1 wr1, wr11, wr12 \n\t" - - "1: \n\t" - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "wldrd wr12, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr4, wr10, wr11 \n\t" - "walignr1 wr5, wr11, wr12 \n\t" - WAVG2B" wr8, wr0, wr4 \n\t" - WAVG2B" wr9, wr1, wr5 \n\t" - "wstrd wr8, [%[block]] \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "wldrd wr12, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" 
- "walignr1 wr1, wr11, wr12 \n\t" - WAVG2B" wr8, wr0, wr4 \n\t" - WAVG2B" wr9, wr1, wr5 \n\t" - "wstrd wr8, [%[block]] \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - "subs %[h], %[h], #2 \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) - : - : "r4", "r5", "r12", "memory"); -} - -void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - __asm__ volatile( - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "and r12, %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "pld [%[block]] \n\t" - "wldrd wr12, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" - "walignr1 wr1, wr11, wr12 \n\t" - - "1: \n\t" - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "wldrd wr12, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr4, wr10, wr11 \n\t" - "walignr1 wr5, wr11, wr12 \n\t" - "wldrd wr10, [%[block]] \n\t" - "wldrd wr11, [%[block], #8] \n\t" - WAVG2B" wr8, wr0, wr4 \n\t" - WAVG2B" wr9, wr1, wr5 \n\t" - WAVG2B" wr8, wr8, wr10 \n\t" - WAVG2B" wr9, wr9, wr11 \n\t" - "wstrd wr8, [%[block]] \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "pld [%[block]] \n\t" - "wldrd wr12, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" - "walignr1 wr1, wr11, wr12 \n\t" - "wldrd wr10, [%[block]] \n\t" - "wldrd 
wr11, [%[block], #8] \n\t" - WAVG2B" wr8, wr0, wr4 \n\t" - WAVG2B" wr9, wr1, wr5 \n\t" - WAVG2B" wr8, wr8, wr10 \n\t" - WAVG2B" wr9, wr9, wr11 \n\t" - "wstrd wr8, [%[block]] \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - "subs %[h], %[h], #2 \n\t" - "pld [%[block]] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) - : - : "r4", "r5", "r12", "memory"); -} - -void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - SET_RND(wr15); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "pld [%[pixels]] \n\t" - "mov r12, #2 \n\t" - "pld [%[pixels], #32] \n\t" - "tmcr wcgr0, r12 \n\t" /* for shift value */ - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - - // [wr0 wr1 wr2 wr3] <= * - // [wr4 wr5 wr6 wr7] - "wldrd wr12, [%[pixels]] \n\t" - "add r12, r12, #1 \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "tmcr wcgr2, r12 \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "cmp r12, #8 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr2, wr12, wr13 \n\t" - "wmoveq wr10, wr13 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "wunpckelub wr0, wr2 \n\t" - "wunpckehub wr1, wr2 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "waddhus wr0, wr0, wr8 \n\t" - "waddhus wr1, wr1, wr9 \n\t" - - "1: \n\t" - // [wr0 wr1 wr2 wr3] - // [wr4 wr5 wr6 wr7] <= * - "wldrd wr12, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr6, wr12, wr13 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "wmoveq wr10, wr13 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "wunpckelub wr4, wr6 \n\t" - "wunpckehub wr5, wr6 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "waddhus wr4, 
wr4, wr8 \n\t" - "waddhus wr5, wr5, wr9 \n\t" - "waddhus wr8, wr0, wr4 \n\t" - "waddhus wr9, wr1, wr5 \n\t" - "waddhus wr8, wr8, wr15 \n\t" - "waddhus wr9, wr9, wr15 \n\t" - "wsrlhg wr8, wr8, wcgr0 \n\t" - "wsrlhg wr9, wr9, wcgr0 \n\t" - "wpackhus wr8, wr8, wr9 \n\t" - "wstrd wr8, [%[block]] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - // [wr0 wr1 wr2 wr3] <= * - // [wr4 wr5 wr6 wr7] - "wldrd wr12, [%[pixels]] \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr2, wr12, wr13 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "wmoveq wr10, wr13 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "wunpckelub wr0, wr2 \n\t" - "wunpckehub wr1, wr2 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "waddhus wr0, wr0, wr8 \n\t" - "waddhus wr1, wr1, wr9 \n\t" - "waddhus wr8, wr0, wr4 \n\t" - "waddhus wr9, wr1, wr5 \n\t" - "waddhus wr8, wr8, wr15 \n\t" - "waddhus wr9, wr9, wr15 \n\t" - "wsrlhg wr8, wr8, wcgr0 \n\t" - "wsrlhg wr9, wr9, wcgr0 \n\t" - "wpackhus wr8, wr8, wr9 \n\t" - "subs %[h], %[h], #2 \n\t" - "wstrd wr8, [%[block]] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) - : [line_size]"r"(line_size) - : "r12", "memory"); -} - -void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - SET_RND(wr15); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "pld [%[pixels]] \n\t" - "mov r12, #2 \n\t" - "pld [%[pixels], #32] \n\t" - "tmcr wcgr0, r12 \n\t" /* for shift value */ - /* alignment */ - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r12, r12, #1 \n\t" - "tmcr wcgr2, r12 \n\t" - - // [wr0 wr1 wr2 wr3] <= * - // [wr4 wr5 wr6 wr7] - "wldrd wr12, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - 
"wldrd wr14, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "walignr1 wr2, wr12, wr13 \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr3, wr13, wr14 \n\t" - "wmoveq wr10, wr13 \n\t" - "wmoveq wr11, wr14 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "walignr2ne wr11, wr13, wr14 \n\t" - "wunpckelub wr0, wr2 \n\t" - "wunpckehub wr1, wr2 \n\t" - "wunpckelub wr2, wr3 \n\t" - "wunpckehub wr3, wr3 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "wunpckelub wr10, wr11 \n\t" - "wunpckehub wr11, wr11 \n\t" - "waddhus wr0, wr0, wr8 \n\t" - "waddhus wr1, wr1, wr9 \n\t" - "waddhus wr2, wr2, wr10 \n\t" - "waddhus wr3, wr3, wr11 \n\t" - - "1: \n\t" - // [wr0 wr1 wr2 wr3] - // [wr4 wr5 wr6 wr7] <= * - "wldrd wr12, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "wldrd wr14, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr6, wr12, wr13 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr7, wr13, wr14 \n\t" - "wmoveq wr10, wr13 \n\t" - "wmoveq wr11, wr14 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "walignr2ne wr11, wr13, wr14 \n\t" - "wunpckelub wr4, wr6 \n\t" - "wunpckehub wr5, wr6 \n\t" - "wunpckelub wr6, wr7 \n\t" - "wunpckehub wr7, wr7 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "wunpckelub wr10, wr11 \n\t" - "wunpckehub wr11, wr11 \n\t" - "waddhus wr4, wr4, wr8 \n\t" - "waddhus wr5, wr5, wr9 \n\t" - "waddhus wr6, wr6, wr10 \n\t" - "waddhus wr7, wr7, wr11 \n\t" - "waddhus wr8, wr0, wr4 \n\t" - "waddhus wr9, wr1, wr5 \n\t" - "waddhus wr10, wr2, wr6 \n\t" - "waddhus wr11, wr3, wr7 \n\t" - "waddhus wr8, wr8, wr15 \n\t" - "waddhus wr9, wr9, wr15 \n\t" - "waddhus wr10, wr10, wr15 \n\t" - "waddhus wr11, wr11, wr15 \n\t" - "wsrlhg wr8, wr8, wcgr0 \n\t" - "wsrlhg wr9, wr9, wcgr0 \n\t" - "wsrlhg wr10, wr10, wcgr0 \n\t" - "wsrlhg wr11, wr11, wcgr0 \n\t" - "wpackhus wr8, wr8, wr9 \n\t" - "wpackhus wr9, 
wr10, wr11 \n\t" - "wstrd wr8, [%[block]] \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - // [wr0 wr1 wr2 wr3] <= * - // [wr4 wr5 wr6 wr7] - "wldrd wr12, [%[pixels]] \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "wldrd wr14, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr2, wr12, wr13 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr3, wr13, wr14 \n\t" - "wmoveq wr10, wr13 \n\t" - "wmoveq wr11, wr14 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "walignr2ne wr11, wr13, wr14 \n\t" - "wunpckelub wr0, wr2 \n\t" - "wunpckehub wr1, wr2 \n\t" - "wunpckelub wr2, wr3 \n\t" - "wunpckehub wr3, wr3 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "wunpckelub wr10, wr11 \n\t" - "wunpckehub wr11, wr11 \n\t" - "waddhus wr0, wr0, wr8 \n\t" - "waddhus wr1, wr1, wr9 \n\t" - "waddhus wr2, wr2, wr10 \n\t" - "waddhus wr3, wr3, wr11 \n\t" - "waddhus wr8, wr0, wr4 \n\t" - "waddhus wr9, wr1, wr5 \n\t" - "waddhus wr10, wr2, wr6 \n\t" - "waddhus wr11, wr3, wr7 \n\t" - "waddhus wr8, wr8, wr15 \n\t" - "waddhus wr9, wr9, wr15 \n\t" - "waddhus wr10, wr10, wr15 \n\t" - "waddhus wr11, wr11, wr15 \n\t" - "wsrlhg wr8, wr8, wcgr0 \n\t" - "wsrlhg wr9, wr9, wcgr0 \n\t" - "wsrlhg wr10, wr10, wcgr0 \n\t" - "wsrlhg wr11, wr11, wcgr0 \n\t" - "wpackhus wr8, wr8, wr9 \n\t" - "wpackhus wr9, wr10, wr11 \n\t" - "wstrd wr8, [%[block]] \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - "subs %[h], %[h], #2 \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) - : [line_size]"r"(line_size) - : "r12", "memory"); -} - -void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - SET_RND(wr15); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "pld 
[%[pixels]] \n\t" - "mov r12, #2 \n\t" - "pld [%[pixels], #32] \n\t" - "tmcr wcgr0, r12 \n\t" /* for shift value */ - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - - // [wr0 wr1 wr2 wr3] <= * - // [wr4 wr5 wr6 wr7] - "wldrd wr12, [%[pixels]] \n\t" - "add r12, r12, #1 \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "tmcr wcgr2, r12 \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "cmp r12, #8 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr2, wr12, wr13 \n\t" - "wmoveq wr10, wr13 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "wunpckelub wr0, wr2 \n\t" - "wunpckehub wr1, wr2 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "waddhus wr0, wr0, wr8 \n\t" - "waddhus wr1, wr1, wr9 \n\t" - - "1: \n\t" - // [wr0 wr1 wr2 wr3] - // [wr4 wr5 wr6 wr7] <= * - "wldrd wr12, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr6, wr12, wr13 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "wmoveq wr10, wr13 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "wunpckelub wr4, wr6 \n\t" - "wunpckehub wr5, wr6 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "waddhus wr4, wr4, wr8 \n\t" - "waddhus wr5, wr5, wr9 \n\t" - "waddhus wr8, wr0, wr4 \n\t" - "waddhus wr9, wr1, wr5 \n\t" - "waddhus wr8, wr8, wr15 \n\t" - "waddhus wr9, wr9, wr15 \n\t" - "wldrd wr12, [%[block]] \n\t" - "wsrlhg wr8, wr8, wcgr0 \n\t" - "wsrlhg wr9, wr9, wcgr0 \n\t" - "wpackhus wr8, wr8, wr9 \n\t" - WAVG2B" wr8, wr8, wr12 \n\t" - "wstrd wr8, [%[block]] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "wldrd wr12, [%[pixels]] \n\t" - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - - // [wr0 wr1 wr2 wr3] <= * - // [wr4 wr5 wr6 wr7] - "wldrd wr13, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr2, wr12, wr13 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" 
- "wmoveq wr10, wr13 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "wunpckelub wr0, wr2 \n\t" - "wunpckehub wr1, wr2 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "waddhus wr0, wr0, wr8 \n\t" - "waddhus wr1, wr1, wr9 \n\t" - "waddhus wr8, wr0, wr4 \n\t" - "waddhus wr9, wr1, wr5 \n\t" - "waddhus wr8, wr8, wr15 \n\t" - "waddhus wr9, wr9, wr15 \n\t" - "wldrd wr12, [%[block]] \n\t" - "wsrlhg wr8, wr8, wcgr0 \n\t" - "wsrlhg wr9, wr9, wcgr0 \n\t" - "wpackhus wr8, wr8, wr9 \n\t" - "subs %[h], %[h], #2 \n\t" - WAVG2B" wr8, wr8, wr12 \n\t" - "wstrd wr8, [%[block]] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) - : [line_size]"r"(line_size) - : "r12", "memory"); -} - -void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - SET_RND(wr15); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "pld [%[pixels]] \n\t" - "mov r12, #2 \n\t" - "pld [%[pixels], #32] \n\t" - "tmcr wcgr0, r12 \n\t" /* for shift value */ - /* alignment */ - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r12, r12, #1 \n\t" - "tmcr wcgr2, r12 \n\t" - - // [wr0 wr1 wr2 wr3] <= * - // [wr4 wr5 wr6 wr7] - "wldrd wr12, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "wldrd wr14, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "walignr1 wr2, wr12, wr13 \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr3, wr13, wr14 \n\t" - "wmoveq wr10, wr13 \n\t" - "wmoveq wr11, wr14 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "walignr2ne wr11, wr13, wr14 \n\t" - "wunpckelub wr0, wr2 \n\t" - "wunpckehub wr1, wr2 \n\t" - "wunpckelub wr2, wr3 \n\t" - "wunpckehub 
wr3, wr3 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "wunpckelub wr10, wr11 \n\t" - "wunpckehub wr11, wr11 \n\t" - "waddhus wr0, wr0, wr8 \n\t" - "waddhus wr1, wr1, wr9 \n\t" - "waddhus wr2, wr2, wr10 \n\t" - "waddhus wr3, wr3, wr11 \n\t" - - "1: \n\t" - // [wr0 wr1 wr2 wr3] - // [wr4 wr5 wr6 wr7] <= * - "wldrd wr12, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "wldrd wr14, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr6, wr12, wr13 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr7, wr13, wr14 \n\t" - "wmoveq wr10, wr13 \n\t" - "wmoveq wr11, wr14 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "walignr2ne wr11, wr13, wr14 \n\t" - "wunpckelub wr4, wr6 \n\t" - "wunpckehub wr5, wr6 \n\t" - "wunpckelub wr6, wr7 \n\t" - "wunpckehub wr7, wr7 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "wunpckelub wr10, wr11 \n\t" - "wunpckehub wr11, wr11 \n\t" - "waddhus wr4, wr4, wr8 \n\t" - "waddhus wr5, wr5, wr9 \n\t" - "waddhus wr6, wr6, wr10 \n\t" - "waddhus wr7, wr7, wr11 \n\t" - "waddhus wr8, wr0, wr4 \n\t" - "waddhus wr9, wr1, wr5 \n\t" - "waddhus wr10, wr2, wr6 \n\t" - "waddhus wr11, wr3, wr7 \n\t" - "waddhus wr8, wr8, wr15 \n\t" - "waddhus wr9, wr9, wr15 \n\t" - "waddhus wr10, wr10, wr15 \n\t" - "waddhus wr11, wr11, wr15 \n\t" - "wsrlhg wr8, wr8, wcgr0 \n\t" - "wsrlhg wr9, wr9, wcgr0 \n\t" - "wldrd wr12, [%[block]] \n\t" - "wldrd wr13, [%[block], #8] \n\t" - "wsrlhg wr10, wr10, wcgr0 \n\t" - "wsrlhg wr11, wr11, wcgr0 \n\t" - "wpackhus wr8, wr8, wr9 \n\t" - "wpackhus wr9, wr10, wr11 \n\t" - WAVG2B" wr8, wr8, wr12 \n\t" - WAVG2B" wr9, wr9, wr13 \n\t" - "wstrd wr8, [%[block]] \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - // [wr0 wr1 wr2 wr3] <= * - // [wr4 wr5 wr6 wr7] - "wldrd wr12, [%[pixels]] \n\t" - "pld [%[block]] \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "pld [%[block], #32] \n\t" - 
"wldrd wr14, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr2, wr12, wr13 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr3, wr13, wr14 \n\t" - "wmoveq wr10, wr13 \n\t" - "wmoveq wr11, wr14 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "walignr2ne wr11, wr13, wr14 \n\t" - "wunpckelub wr0, wr2 \n\t" - "wunpckehub wr1, wr2 \n\t" - "wunpckelub wr2, wr3 \n\t" - "wunpckehub wr3, wr3 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "wunpckelub wr10, wr11 \n\t" - "wunpckehub wr11, wr11 \n\t" - "waddhus wr0, wr0, wr8 \n\t" - "waddhus wr1, wr1, wr9 \n\t" - "waddhus wr2, wr2, wr10 \n\t" - "waddhus wr3, wr3, wr11 \n\t" - "waddhus wr8, wr0, wr4 \n\t" - "waddhus wr9, wr1, wr5 \n\t" - "waddhus wr10, wr2, wr6 \n\t" - "waddhus wr11, wr3, wr7 \n\t" - "waddhus wr8, wr8, wr15 \n\t" - "waddhus wr9, wr9, wr15 \n\t" - "waddhus wr10, wr10, wr15 \n\t" - "waddhus wr11, wr11, wr15 \n\t" - "wsrlhg wr8, wr8, wcgr0 \n\t" - "wsrlhg wr9, wr9, wcgr0 \n\t" - "wldrd wr12, [%[block]] \n\t" - "wldrd wr13, [%[block], #8] \n\t" - "wsrlhg wr10, wr10, wcgr0 \n\t" - "wsrlhg wr11, wr11, wcgr0 \n\t" - "wpackhus wr8, wr8, wr9 \n\t" - "wpackhus wr9, wr10, wr11 \n\t" - WAVG2B" wr8, wr8, wr12 \n\t" - WAVG2B" wr9, wr9, wr13 \n\t" - "wstrd wr8, [%[block]] \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "subs %[h], %[h], #2 \n\t" - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) - : [line_size]"r"(line_size) - : "r12", "memory"); -} diff -r c30b92cf446b -r 9281a8a9387a armv4l/dsputil_neon.c --- a/armv4l/dsputil_neon.c Wed Dec 17 00:39:45 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,169 +0,0 @@ -/* - * ARM NEON optimised DSP functions - * Copyright (c) 2008 Mans Rullgard - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include - -#include "libavcodec/avcodec.h" -#include "libavcodec/dsputil.h" - -void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels8_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); - -void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, int, int); - -void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc10_neon(uint8_t 
*, uint8_t *, int); -void ff_put_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, int); - -void ff_put_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, int); - -void ff_avg_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int); - -void 
ff_put_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); -void ff_put_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); - -void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); -void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); - -void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, - int beta, int8_t *tc0); -void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, - int beta, int8_t *tc0); -void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, - int beta, int8_t *tc0); -void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, - int beta, int8_t *tc0); - -void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride); -void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride); - -void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) -{ - c->put_pixels_tab[0][0] = ff_put_pixels16_neon; - c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon; - c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon; - c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon; - c->put_pixels_tab[1][0] = ff_put_pixels8_neon; - c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon; - c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon; - c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon; - - c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon; - c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon; - c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon; - c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon; - c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon; - c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon; - c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon; - c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon; - - c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon; - - c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon; - 
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon; - - c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon; - c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon; - - c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon; - c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon; - c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon; - c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon; - c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon; - c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon; - c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon; - c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon; - c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon; - c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon; - c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon; - c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon; - c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon; - c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon; - c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon; - c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon; - - c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon; - c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon; - c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon; - c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon; - c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon; - c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon; - c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon; - c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon; - c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon; - c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon; - 
c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon; - c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon; - c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon; - c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon; - c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_neon; - c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon; - - c->avg_h264_qpel_pixels_tab[0][ 0] = ff_avg_h264_qpel16_mc00_neon; - - c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon; - c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon; - c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; - c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; - - c->h264_idct_add = ff_h264_idct_add_neon; - c->h264_idct_dc_add = ff_h264_idct_dc_add_neon; -} diff -r c30b92cf446b -r 9281a8a9387a armv4l/dsputil_neon_s.S --- a/armv4l/dsputil_neon_s.S Wed Dec 17 00:39:45 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,274 +0,0 @@ -/* - * ARM NEON optimised DSP functions - * Copyright (c) 2008 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "asm.S" - - preserve8 - .fpu neon - .text - - .macro pixels16 avg=0 -.if \avg - mov ip, r0 -.endif -1: vld1.64 {d0, d1}, [r1], r2 - vld1.64 {d2, d3}, [r1], r2 - vld1.64 {d4, d5}, [r1], r2 - pld [r1, r2, lsl #2] - vld1.64 {d6, d7}, [r1], r2 - pld [r1] - pld [r1, r2] - pld [r1, r2, lsl #1] -.if \avg - vld1.64 {d16,d17}, [ip], r2 - vrhadd.u8 q0, q0, q8 - vld1.64 {d18,d19}, [ip], r2 - vrhadd.u8 q1, q1, q9 - vld1.64 {d20,d21}, [ip], r2 - vrhadd.u8 q2, q2, q10 - vld1.64 {d22,d23}, [ip], r2 - vrhadd.u8 q3, q3, q11 -.endif - subs r3, r3, #4 - vst1.64 {d0, d1}, [r0,:128], r2 - vst1.64 {d2, d3}, [r0,:128], r2 - vst1.64 {d4, d5}, [r0,:128], r2 - vst1.64 {d6, d7}, [r0,:128], r2 - bne 1b - bx lr - .endm - - .macro pixels16_x2 vhadd=vrhadd.u8 -1: vld1.64 {d0-d2}, [r1], r2 - vld1.64 {d4-d6}, [r1], r2 - pld [r1] - pld [r1, r2] - subs r3, r3, #2 - vext.8 q1, q0, q1, #1 - \vhadd q0, q0, q1 - vext.8 q3, q2, q3, #1 - \vhadd q2, q2, q3 - vst1.64 {d0, d1}, [r0,:128], r2 - vst1.64 {d4, d5}, [r0,:128], r2 - bne 1b - bx lr - .endm - - .macro pixels16_y2 vhadd=vrhadd.u8 - push {lr} - add ip, r1, r2 - lsl lr, r2, #1 - vld1.64 {d0, d1}, [r1], lr - vld1.64 {d2, d3}, [ip], lr -1: subs r3, r3, #2 - \vhadd q2, q0, q1 - vld1.64 {d0, d1}, [r1], lr - \vhadd q3, q0, q1 - vld1.64 {d2, d3}, [ip], lr - pld [r1] - pld [ip] - vst1.64 {d4, d5}, [r0,:128], r2 - vst1.64 {d6, d7}, [r0,:128], r2 - bne 1b - pop {pc} - .endm - - .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0 - push {lr} - lsl lr, r2, #1 - add ip, r1, r2 - vld1.64 {d0-d2}, [r1], lr - vld1.64 {d4-d6}, [ip], lr -.if \no_rnd - vmov.i16 q13, #1 -.endif - pld [r1] - pld [ip] - vext.8 q1, q0, q1, #1 - vext.8 q3, q2, q3, #1 - vaddl.u8 q8, d0, d2 - vaddl.u8 q10, d1, d3 - vaddl.u8 q9, d4, d6 - vaddl.u8 q11, d5, d7 -1: subs 
r3, r3, #2 - vld1.64 {d0-d2}, [r1], lr - vadd.u16 q12, q8, q9 - pld [r1] -.if \no_rnd - vadd.u16 q12, q12, q13 -.endif - vext.8 q15, q0, q1, #1 - vadd.u16 q1 , q10, q11 - \vshrn d28, q12, #2 -.if \no_rnd - vadd.u16 q1, q1, q13 -.endif - \vshrn d29, q1, #2 - vaddl.u8 q8, d0, d30 - vld1.64 {d2-d4}, [ip], lr - vaddl.u8 q10, d1, d31 - vst1.64 {d28,d29}, [r0,:128], r2 - vadd.u16 q12, q8, q9 - pld [ip] -.if \no_rnd - vadd.u16 q12, q12, q13 -.endif - vext.8 q2, q1, q2, #1 - vadd.u16 q0, q10, q11 - \vshrn d30, q12, #2 -.if \no_rnd - vadd.u16 q0, q0, q13 -.endif - \vshrn d31, q0, #2 - vaddl.u8 q9, d2, d4 - vaddl.u8 q11, d3, d5 - vst1.64 {d30,d31}, [r0,:128], r2 - bgt 1b - pop {pc} - .endm - - .macro pixels8 -1: vld1.64 {d0}, [r1], r2 - vld1.64 {d1}, [r1], r2 - vld1.64 {d2}, [r1], r2 - pld [r1, r2, lsl #2] - vld1.64 {d3}, [r1], r2 - pld [r1] - pld [r1, r2] - pld [r1, r2, lsl #1] - subs r3, r3, #4 - vst1.64 {d0}, [r0,:64], r2 - vst1.64 {d1}, [r0,:64], r2 - vst1.64 {d2}, [r0,:64], r2 - vst1.64 {d3}, [r0,:64], r2 - bne 1b - bx lr - .endm - - .macro pixels8_x2 vhadd=vrhadd.u8 -1: vld1.64 {d0, d1}, [r1], r2 - vext.8 d1, d0, d1, #1 - vld1.64 {d2, d3}, [r1], r2 - vext.8 d3, d2, d3, #1 - pld [r1] - pld [r1, r2] - subs r3, r3, #2 - vswp d1, d2 - \vhadd q0, q0, q1 - vst1.64 {d0}, [r0,:64], r2 - vst1.64 {d1}, [r0,:64], r2 - bne 1b - bx lr - .endm - - .macro pixels8_y2 vhadd=vrhadd.u8 - push {lr} - add ip, r1, r2 - lsl lr, r2, #1 - vld1.64 {d0}, [r1], lr - vld1.64 {d1}, [ip], lr -1: subs r3, r3, #2 - \vhadd d4, d0, d1 - vld1.64 {d0}, [r1], lr - \vhadd d5, d0, d1 - vld1.64 {d1}, [ip], lr - pld [r1] - pld [ip] - vst1.64 {d4}, [r0,:64], r2 - vst1.64 {d5}, [r0,:64], r2 - bne 1b - pop {pc} - .endm - - .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0 - push {lr} - lsl lr, r2, #1 - add ip, r1, r2 - vld1.64 {d0, d1}, [r1], lr - vld1.64 {d2, d3}, [ip], lr -.if \no_rnd - vmov.i16 q11, #1 -.endif - pld [r1] - pld [ip] - vext.8 d4, d0, d1, #1 - vext.8 d6, d2, d3, #1 - vaddl.u8 q8, d0, d4 - vaddl.u8 q9, 
d2, d6 -1: subs r3, r3, #2 - vld1.64 {d0, d1}, [r1], lr - pld [r1] - vadd.u16 q10, q8, q9 - vext.8 d4, d0, d1, #1 -.if \no_rnd - vadd.u16 q10, q10, q11 -.endif - vaddl.u8 q8, d0, d4 - \vshrn d5, q10, #2 - vld1.64 {d2, d3}, [ip], lr - vadd.u16 q10, q8, q9 - pld [ip] -.if \no_rnd - vadd.u16 q10, q10, q11 -.endif - vst1.64 {d5}, [r0,:64], r2 - \vshrn d7, q10, #2 - vext.8 d6, d2, d3, #1 - vaddl.u8 q9, d2, d6 - vst1.64 {d7}, [r0,:64], r2 - bgt 1b - pop {pc} - .endm - - .macro pixfunc pfx name suf rnd_op args:vararg -function ff_\pfx\name\suf\()_neon, export=1 - \name \rnd_op \args - .endfunc - .endm - - .macro pixfunc2 pfx name args:vararg - pixfunc \pfx \name - pixfunc \pfx \name \args - .endm - -function ff_put_h264_qpel16_mc00_neon, export=1 - mov r3, #16 - .endfunc - - pixfunc put_ pixels16 - pixfunc2 put_ pixels16_x2, _no_rnd, vhadd.u8 - pixfunc2 put_ pixels16_y2, _no_rnd, vhadd.u8 - pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1 - -function ff_avg_h264_qpel16_mc00_neon, export=1 - mov r3, #16 - .endfunc - - pixfunc avg_ pixels16,, 1 - -function ff_put_h264_qpel8_mc00_neon, export=1 - mov r3, #8 - .endfunc - - pixfunc put_ pixels8 - pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8 - pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8 - pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1 diff -r c30b92cf446b -r 9281a8a9387a armv4l/dsputil_vfp.S --- a/armv4l/dsputil_vfp.S Wed Dec 17 00:39:45 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,189 +0,0 @@ -/* - * Copyright (c) 2008 Siarhei Siamashka - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "asm.S" - - .fpu neon @ required for gas to accept UAL syntax -/* - * VFP is a floating point coprocessor used in some ARM cores. VFP11 has 1 cycle - * throughput for almost all the instructions (except for double precision - * arithmetics), but rather high latency. Latency is 4 cycles for loads and 8 cycles - * for arithmetic operations. Scheduling code to avoid pipeline stalls is very - * important for performance. One more interesting feature is that VFP has - * independent load/store and arithmetics pipelines, so it is possible to make - * them work simultaneously and get more than 1 operation per cycle. Load/store - * pipeline can process 2 single precision floating point values per cycle and - * supports bulk loads and stores for large sets of registers. Arithmetic operations - * can be done on vectors, which allows to keep the arithmetics pipeline busy, - * while the processor may issue and execute other instructions. Detailed - * optimization manuals can be found at http://www.arm.com - */ - -/** - * ARM VFP optimized implementation of 'vector_fmul_c' function. 
- * Assume that len is a positive number and is multiple of 8 - */ -@ void ff_vector_fmul_vfp(float *dst, const float *src, int len) -function ff_vector_fmul_vfp, export=1 - vpush {d8-d15} - mov r3, r0 - fmrx r12, fpscr - orr r12, r12, #(3 << 16) /* set vector size to 4 */ - fmxr fpscr, r12 - - vldmia r3!, {s0-s3} - vldmia r1!, {s8-s11} - vldmia r3!, {s4-s7} - vldmia r1!, {s12-s15} - vmul.f32 s8, s0, s8 -1: - subs r2, r2, #16 - vmul.f32 s12, s4, s12 - vldmiage r3!, {s16-s19} - vldmiage r1!, {s24-s27} - vldmiage r3!, {s20-s23} - vldmiage r1!, {s28-s31} - vmulge.f32 s24, s16, s24 - vstmia r0!, {s8-s11} - vstmia r0!, {s12-s15} - vmulge.f32 s28, s20, s28 - vldmiagt r3!, {s0-s3} - vldmiagt r1!, {s8-s11} - vldmiagt r3!, {s4-s7} - vldmiagt r1!, {s12-s15} - vmulge.f32 s8, s0, s8 - vstmiage r0!, {s24-s27} - vstmiage r0!, {s28-s31} - bgt 1b - - bic r12, r12, #(7 << 16) /* set vector size back to 1 */ - fmxr fpscr, r12 - vpop {d8-d15} - bx lr - .endfunc - -/** - * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function. 
- * Assume that len is a positive number and is multiple of 8 - */ -@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, -@ const float *src1, int len) -function ff_vector_fmul_reverse_vfp, export=1 - vpush {d8-d15} - add r2, r2, r3, lsl #2 - vldmdb r2!, {s0-s3} - vldmia r1!, {s8-s11} - vldmdb r2!, {s4-s7} - vldmia r1!, {s12-s15} - vmul.f32 s8, s3, s8 - vmul.f32 s9, s2, s9 - vmul.f32 s10, s1, s10 - vmul.f32 s11, s0, s11 -1: - subs r3, r3, #16 - vldmdbge r2!, {s16-s19} - vmul.f32 s12, s7, s12 - vldmiage r1!, {s24-s27} - vmul.f32 s13, s6, s13 - vldmdbge r2!, {s20-s23} - vmul.f32 s14, s5, s14 - vldmiage r1!, {s28-s31} - vmul.f32 s15, s4, s15 - vmulge.f32 s24, s19, s24 - vldmdbgt r2!, {s0-s3} - vmulge.f32 s25, s18, s25 - vstmia r0!, {s8-s13} - vmulge.f32 s26, s17, s26 - vldmiagt r1!, {s8-s11} - vmulge.f32 s27, s16, s27 - vmulge.f32 s28, s23, s28 - vldmdbgt r2!, {s4-s7} - vmulge.f32 s29, s22, s29 - vstmia r0!, {s14-s15} - vmulge.f32 s30, s21, s30 - vmulge.f32 s31, s20, s31 - vmulge.f32 s8, s3, s8 - vldmiagt r1!, {s12-s15} - vmulge.f32 s9, s2, s9 - vmulge.f32 s10, s1, s10 - vstmiage r0!, {s24-s27} - vmulge.f32 s11, s0, s11 - vstmiage r0!, {s28-s31} - bgt 1b - - vpop {d8-d15} - bx lr - .endfunc - -#ifdef HAVE_ARMV6 -/** - * ARM VFP optimized float to int16 conversion. 
- * Assume that len is a positive number and is multiple of 8, destination - * buffer is at least 4 bytes aligned (8 bytes alignment is better for - * performance), little endian byte sex - */ -@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len) -function ff_float_to_int16_vfp, export=1 - push {r4-r8,lr} - vpush {d8-d11} - vldmia r1!, {s16-s23} - vcvt.s32.f32 s0, s16 - vcvt.s32.f32 s1, s17 - vcvt.s32.f32 s2, s18 - vcvt.s32.f32 s3, s19 - vcvt.s32.f32 s4, s20 - vcvt.s32.f32 s5, s21 - vcvt.s32.f32 s6, s22 - vcvt.s32.f32 s7, s23 -1: - subs r2, r2, #8 - vmov r3, r4, s0, s1 - vmov r5, r6, s2, s3 - vmov r7, r8, s4, s5 - vmov ip, lr, s6, s7 - vldmiagt r1!, {s16-s23} - ssat r4, #16, r4 - ssat r3, #16, r3 - ssat r6, #16, r6 - ssat r5, #16, r5 - pkhbt r3, r3, r4, lsl #16 - pkhbt r4, r5, r6, lsl #16 - vcvtgt.s32.f32 s0, s16 - vcvtgt.s32.f32 s1, s17 - vcvtgt.s32.f32 s2, s18 - vcvtgt.s32.f32 s3, s19 - vcvtgt.s32.f32 s4, s20 - vcvtgt.s32.f32 s5, s21 - vcvtgt.s32.f32 s6, s22 - vcvtgt.s32.f32 s7, s23 - ssat r8, #16, r8 - ssat r7, #16, r7 - ssat lr, #16, lr - ssat ip, #16, ip - pkhbt r5, r7, r8, lsl #16 - pkhbt r6, ip, lr, lsl #16 - stmia r0!, {r3-r6} - bgt 1b - - vpop {d8-d11} - pop {r4-r8,pc} - .endfunc -#endif diff -r c30b92cf446b -r 9281a8a9387a armv4l/float_arm_vfp.c --- a/armv4l/float_arm_vfp.c Wed Dec 17 00:39:45 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2008 Siarhei Siamashka - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavcodec/dsputil.h" - -void ff_vector_fmul_vfp(float *dst, const float *src, int len); -void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, - const float *src1, int len); -void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len); - -void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx) -{ - c->vector_fmul = ff_vector_fmul_vfp; - c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp; -#ifdef HAVE_ARMV6 - c->float_to_int16 = ff_float_to_int16_vfp; -#endif -} diff -r c30b92cf446b -r 9281a8a9387a armv4l/h264dsp_neon.S --- a/armv4l/h264dsp_neon.S Wed Dec 17 00:39:45 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1377 +0,0 @@ -/* - * Copyright (c) 2008 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "asm.S" - - .fpu neon - - .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7 - vtrn.32 \r0, \r4 - vtrn.32 \r1, \r5 - vtrn.32 \r2, \r6 - vtrn.32 \r3, \r7 - vtrn.16 \r0, \r2 - vtrn.16 \r1, \r3 - vtrn.16 \r4, \r6 - vtrn.16 \r5, \r7 - vtrn.8 \r0, \r1 - vtrn.8 \r2, \r3 - vtrn.8 \r4, \r5 - vtrn.8 \r6, \r7 - .endm - - .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7 - vswp \r0, \r4 - vswp \r1, \r5 - vswp \r2, \r6 - vswp \r3, \r7 - .endm - - .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7 - vtrn.32 \r0, \r2 - vtrn.32 \r1, \r3 - vtrn.32 \r4, \r6 - vtrn.32 \r5, \r7 - vtrn.16 \r0, \r1 - vtrn.16 \r2, \r3 - vtrn.16 \r4, \r5 - vtrn.16 \r6, \r7 - .endm - -/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ - .macro h264_chroma_mc8 avg=0 - push {r4-r7, lr} - ldrd r4, [sp, #20] -.if \avg - mov lr, r0 -.endif - pld [r1] - pld [r1, r2] - - muls r7, r4, r5 - rsb r6, r7, r5, lsl #3 - rsb ip, r7, r4, lsl #3 - sub r4, r7, r4, lsl #3 - sub r4, r4, r5, lsl #3 - add r4, r4, #64 - - beq 2f - - add r5, r1, r2 - - vdup.8 d0, r4 - lsl r4, r2, #1 - vdup.8 d1, ip - vld1.64 {d4, d5}, [r1], r4 - vdup.8 d2, r6 - vld1.64 {d6, d7}, [r5], r4 - vdup.8 d3, r7 - - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - -1: pld [r5] - vmull.u8 q8, d4, d0 - vmlal.u8 q8, d5, d1 - vld1.64 {d4, d5}, [r1], r4 - vmlal.u8 q8, d6, d2 - vext.8 d5, d4, d5, #1 - vmlal.u8 q8, d7, d3 - vmull.u8 q9, d6, d0 - subs r3, r3, #2 - vmlal.u8 q9, d7, d1 - vmlal.u8 q9, d4, d2 - vmlal.u8 q9, d5, d3 - vrshrn.u16 d16, q8, #6 - vld1.64 {d6, d7}, [r5], r4 - pld [r1] - vrshrn.u16 d17, q9, #6 -.if \avg - vld1.64 {d20}, [lr,:64], r2 - vld1.64 {d21}, [lr,:64], r2 - vrhadd.u8 q8, q8, q10 -.endif - vext.8 d7, d6, d7, #1 - vst1.64 {d16}, [r0,:64], r2 - vst1.64 {d17}, [r0,:64], r2 - bgt 1b - - pop {r4-r7, pc} 
- -2: tst r6, r6 - add ip, ip, r6 - vdup.8 d0, r4 - vdup.8 d1, ip - - beq 4f - - add r5, r1, r2 - lsl r4, r2, #1 - vld1.64 {d4}, [r1], r4 - vld1.64 {d6}, [r5], r4 - -3: pld [r5] - vmull.u8 q8, d4, d0 - vmlal.u8 q8, d6, d1 - vld1.64 {d4}, [r1], r4 - vmull.u8 q9, d6, d0 - vmlal.u8 q9, d4, d1 - vld1.64 {d6}, [r5], r4 - vrshrn.u16 d16, q8, #6 - vrshrn.u16 d17, q9, #6 -.if \avg - vld1.64 {d20}, [lr,:64], r2 - vld1.64 {d21}, [lr,:64], r2 - vrhadd.u8 q8, q8, q10 -.endif - subs r3, r3, #2 - pld [r1] - vst1.64 {d16}, [r0,:64], r2 - vst1.64 {d17}, [r0,:64], r2 - bgt 3b - - pop {r4-r7, pc} - -4: vld1.64 {d4, d5}, [r1], r2 - vld1.64 {d6, d7}, [r1], r2 - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - -5: pld [r1] - subs r3, r3, #2 - vmull.u8 q8, d4, d0 - vmlal.u8 q8, d5, d1 - vld1.64 {d4, d5}, [r1], r2 - vmull.u8 q9, d6, d0 - vmlal.u8 q9, d7, d1 - pld [r1] - vext.8 d5, d4, d5, #1 - vrshrn.u16 d16, q8, #6 - vrshrn.u16 d17, q9, #6 -.if \avg - vld1.64 {d20}, [lr,:64], r2 - vld1.64 {d21}, [lr,:64], r2 - vrhadd.u8 q8, q8, q10 -.endif - vld1.64 {d6, d7}, [r1], r2 - vext.8 d7, d6, d7, #1 - vst1.64 {d16}, [r0,:64], r2 - vst1.64 {d17}, [r0,:64], r2 - bgt 5b - - pop {r4-r7, pc} - .endm - -/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ - .macro h264_chroma_mc4 avg=0 - push {r4-r7, lr} - ldrd r4, [sp, #20] -.if \avg - mov lr, r0 -.endif - pld [r1] - pld [r1, r2] - - muls r7, r4, r5 - rsb r6, r7, r5, lsl #3 - rsb ip, r7, r4, lsl #3 - sub r4, r7, r4, lsl #3 - sub r4, r4, r5, lsl #3 - add r4, r4, #64 - - beq 2f - - add r5, r1, r2 - - vdup.8 d0, r4 - lsl r4, r2, #1 - vdup.8 d1, ip - vld1.64 {d4}, [r1], r4 - vdup.8 d2, r6 - vld1.64 {d6}, [r5], r4 - vdup.8 d3, r7 - - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - vtrn.32 d4, d5 - vtrn.32 d6, d7 - - vtrn.32 d0, d1 - vtrn.32 d2, d3 - -1: pld [r5] - vmull.u8 q8, d4, d0 - vmlal.u8 q8, d6, d2 - vld1.64 {d4}, [r1], r4 - vext.8 d5, d4, d5, #1 - vtrn.32 d4, d5 - vmull.u8 q9, d6, d0 - vmlal.u8 q9, d4, d2 - vld1.64 {d6}, 
[r5], r4 - vadd.i16 d16, d16, d17 - vadd.i16 d17, d18, d19 - vrshrn.u16 d16, q8, #6 - subs r3, r3, #2 - pld [r1] -.if \avg - vld1.32 {d20[0]}, [lr,:32], r2 - vld1.32 {d20[1]}, [lr,:32], r2 - vrhadd.u8 d16, d16, d20 -.endif - vext.8 d7, d6, d7, #1 - vtrn.32 d6, d7 - vst1.32 {d16[0]}, [r0,:32], r2 - vst1.32 {d16[1]}, [r0,:32], r2 - bgt 1b - - pop {r4-r7, pc} - -2: tst r6, r6 - add ip, ip, r6 - vdup.8 d0, r4 - vdup.8 d1, ip - vtrn.32 d0, d1 - - beq 4f - - vext.32 d1, d0, d1, #1 - add r5, r1, r2 - lsl r4, r2, #1 - vld1.32 {d4[0]}, [r1], r4 - vld1.32 {d4[1]}, [r5], r4 - -3: pld [r5] - vmull.u8 q8, d4, d0 - vld1.32 {d4[0]}, [r1], r4 - vmull.u8 q9, d4, d1 - vld1.32 {d4[1]}, [r5], r4 - vadd.i16 d16, d16, d17 - vadd.i16 d17, d18, d19 - vrshrn.u16 d16, q8, #6 -.if \avg - vld1.32 {d20[0]}, [lr,:32], r2 - vld1.32 {d20[1]}, [lr,:32], r2 - vrhadd.u8 d16, d16, d20 -.endif - subs r3, r3, #2 - pld [r1] - vst1.32 {d16[0]}, [r0,:32], r2 - vst1.32 {d16[1]}, [r0,:32], r2 - bgt 3b - - pop {r4-r7, pc} - -4: vld1.64 {d4}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - vtrn.32 d4, d5 - vtrn.32 d6, d7 - -5: vmull.u8 q8, d4, d0 - vmull.u8 q9, d6, d0 - subs r3, r3, #2 - vld1.64 {d4}, [r1], r2 - vext.8 d5, d4, d5, #1 - vtrn.32 d4, d5 - vadd.i16 d16, d16, d17 - vadd.i16 d17, d18, d19 - pld [r1] - vrshrn.u16 d16, q8, #6 -.if \avg - vld1.32 {d20[0]}, [lr,:32], r2 - vld1.32 {d20[1]}, [lr,:32], r2 - vrhadd.u8 d16, d16, d20 -.endif - vld1.64 {d6}, [r1], r2 - vext.8 d7, d6, d7, #1 - vtrn.32 d6, d7 - pld [r1] - vst1.32 {d16[0]}, [r0,:32], r2 - vst1.32 {d16[1]}, [r0,:32], r2 - bgt 5b - - pop {r4-r7, pc} - .endm - - .text - .align - -function ff_put_h264_chroma_mc8_neon, export=1 - h264_chroma_mc8 - .endfunc - -function ff_avg_h264_chroma_mc8_neon, export=1 - h264_chroma_mc8 avg=1 - .endfunc - -function ff_put_h264_chroma_mc4_neon, export=1 - h264_chroma_mc4 - .endfunc - -function ff_avg_h264_chroma_mc4_neon, export=1 - h264_chroma_mc4 avg=1 - .endfunc - - /* H.264 
loop filter */ - - .macro h264_loop_filter_start - ldr ip, [sp] - tst r2, r2 - ldr ip, [ip] - tstne r3, r3 - vmov.32 d24[0], ip - and ip, ip, ip, lsl #16 - bxeq lr - ands ip, ip, ip, lsl #8 - bxlt lr - .endm - - .macro align_push_regs - and ip, sp, #15 - add ip, ip, #32 - sub sp, sp, ip - vst1.64 {d12-d15}, [sp,:128] - sub sp, sp, #32 - vst1.64 {d8-d11}, [sp,:128] - .endm - - .macro align_pop_regs - vld1.64 {d8-d11}, [sp,:128]! - vld1.64 {d12-d15}, [sp,:128], ip - .endm - - .macro h264_loop_filter_luma - vdup.8 q11, r2 @ alpha - vmovl.u8 q12, d24 - vabd.u8 q6, q8, q0 @ abs(p0 - q0) - vmovl.u16 q12, d24 - vabd.u8 q14, q9, q8 @ abs(p1 - p0) - vsli.16 q12, q12, #8 - vabd.u8 q15, q1, q0 @ abs(q1 - q0) - vsli.32 q12, q12, #16 - vclt.u8 q6, q6, q11 @ < alpha - vdup.8 q11, r3 @ beta - vclt.s8 q7, q12, #0 - vclt.u8 q14, q14, q11 @ < beta - vclt.u8 q15, q15, q11 @ < beta - vbic q6, q6, q7 - vabd.u8 q4, q10, q8 @ abs(p2 - p0) - vand q6, q6, q14 - vabd.u8 q5, q2, q0 @ abs(q2 - q0) - vclt.u8 q4, q4, q11 @ < beta - vand q6, q6, q15 - vclt.u8 q5, q5, q11 @ < beta - vand q4, q4, q6 - vand q5, q5, q6 - vand q12, q12, q6 - vrhadd.u8 q14, q8, q0 - vsub.i8 q6, q12, q4 - vqadd.u8 q7, q9, q12 - vhadd.u8 q10, q10, q14 - vsub.i8 q6, q6, q5 - vhadd.u8 q14, q2, q14 - vmin.u8 q7, q7, q10 - vqsub.u8 q11, q9, q12 - vqadd.u8 q2, q1, q12 - vmax.u8 q7, q7, q11 - vqsub.u8 q11, q1, q12 - vmin.u8 q14, q2, q14 - vmovl.u8 q2, d0 - vmax.u8 q14, q14, q11 - vmovl.u8 q10, d1 - vsubw.u8 q2, q2, d16 - vsubw.u8 q10, q10, d17 - vshl.i16 q2, q2, #2 - vshl.i16 q10, q10, #2 - vaddw.u8 q2, q2, d18 - vaddw.u8 q10, q10, d19 - vsubw.u8 q2, q2, d2 - vsubw.u8 q10, q10, d3 - vrshrn.i16 d4, q2, #3 - vrshrn.i16 d5, q10, #3 - vbsl q4, q7, q9 - vbsl q5, q14, q1 - vneg.s8 q7, q6 - vmovl.u8 q14, d16 - vmin.s8 q2, q2, q6 - vmovl.u8 q6, d17 - vmax.s8 q2, q2, q7 - vmovl.u8 q11, d0 - vmovl.u8 q12, d1 - vaddw.s8 q14, q14, d4 - vaddw.s8 q6, q6, d5 - vsubw.s8 q11, q11, d4 - vsubw.s8 q12, q12, d5 - vqmovun.s16 d16, q14 - 
vqmovun.s16 d17, q6 - vqmovun.s16 d0, q11 - vqmovun.s16 d1, q12 - .endm - -function ff_h264_v_loop_filter_luma_neon, export=1 - h264_loop_filter_start - - vld1.64 {d0, d1}, [r0,:128], r1 - vld1.64 {d2, d3}, [r0,:128], r1 - vld1.64 {d4, d5}, [r0,:128], r1 - sub r0, r0, r1, lsl #2 - sub r0, r0, r1, lsl #1 - vld1.64 {d20,d21}, [r0,:128], r1 - vld1.64 {d18,d19}, [r0,:128], r1 - vld1.64 {d16,d17}, [r0,:128], r1 - - align_push_regs - - h264_loop_filter_luma - - sub r0, r0, r1, lsl #1 - vst1.64 {d8, d9}, [r0,:128], r1 - vst1.64 {d16,d17}, [r0,:128], r1 - vst1.64 {d0, d1}, [r0,:128], r1 - vst1.64 {d10,d11}, [r0,:128] - - align_pop_regs - bx lr - .endfunc - -function ff_h264_h_loop_filter_luma_neon, export=1 - h264_loop_filter_start - - sub r0, r0, #4 - vld1.64 {d6}, [r0], r1 - vld1.64 {d20}, [r0], r1 - vld1.64 {d18}, [r0], r1 - vld1.64 {d16}, [r0], r1 - vld1.64 {d0}, [r0], r1 - vld1.64 {d2}, [r0], r1 - vld1.64 {d4}, [r0], r1 - vld1.64 {d26}, [r0], r1 - vld1.64 {d7}, [r0], r1 - vld1.64 {d21}, [r0], r1 - vld1.64 {d19}, [r0], r1 - vld1.64 {d17}, [r0], r1 - vld1.64 {d1}, [r0], r1 - vld1.64 {d3}, [r0], r1 - vld1.64 {d5}, [r0], r1 - vld1.64 {d27}, [r0], r1 - - transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13 - - align_push_regs - sub sp, sp, #16 - vst1.64 {d4, d5}, [sp,:128] - sub sp, sp, #16 - vst1.64 {d20,d21}, [sp,:128] - - h264_loop_filter_luma - - vld1.64 {d20,d21}, [sp,:128]! - vld1.64 {d4, d5}, [sp,:128]! 
- - transpose_8x8 q3, q10, q4, q8, q0, q5, q2, q13 - - sub r0, r0, r1, lsl #4 - vst1.64 {d6}, [r0], r1 - vst1.64 {d20}, [r0], r1 - vst1.64 {d8}, [r0], r1 - vst1.64 {d16}, [r0], r1 - vst1.64 {d0}, [r0], r1 - vst1.64 {d10}, [r0], r1 - vst1.64 {d4}, [r0], r1 - vst1.64 {d26}, [r0], r1 - vst1.64 {d7}, [r0], r1 - vst1.64 {d21}, [r0], r1 - vst1.64 {d9}, [r0], r1 - vst1.64 {d17}, [r0], r1 - vst1.64 {d1}, [r0], r1 - vst1.64 {d11}, [r0], r1 - vst1.64 {d5}, [r0], r1 - vst1.64 {d27}, [r0], r1 - - align_pop_regs - bx lr - .endfunc - - .macro h264_loop_filter_chroma - vdup.8 d22, r2 @ alpha - vmovl.u8 q12, d24 - vabd.u8 d26, d16, d0 @ abs(p0 - q0) - vmovl.u8 q2, d0 - vabd.u8 d28, d18, d16 @ abs(p1 - p0) - vsubw.u8 q2, q2, d16 - vsli.16 d24, d24, #8 - vshl.i16 q2, q2, #2 - vabd.u8 d30, d2, d0 @ abs(q1 - q0) - vaddw.u8 q2, q2, d18 - vclt.u8 d26, d26, d22 @ < alpha - vsubw.u8 q2, q2, d2 - vdup.8 d22, r3 @ beta - vclt.s8 d25, d24, #0 - vrshrn.i16 d4, q2, #3 - vclt.u8 d28, d28, d22 @ < beta - vbic d26, d26, d25 - vclt.u8 d30, d30, d22 @ < beta - vand d26, d26, d28 - vneg.s8 d25, d24 - vand d26, d26, d30 - vmin.s8 d4, d4, d24 - vmovl.u8 q14, d16 - vand d4, d4, d26 - vmax.s8 d4, d4, d25 - vmovl.u8 q11, d0 - vaddw.s8 q14, q14, d4 - vsubw.s8 q11, q11, d4 - vqmovun.s16 d16, q14 - vqmovun.s16 d0, q11 - .endm - -function ff_h264_v_loop_filter_chroma_neon, export=1 - h264_loop_filter_start - - sub r0, r0, r1, lsl #1 - vld1.64 {d18}, [r0,:64], r1 - vld1.64 {d16}, [r0,:64], r1 - vld1.64 {d0}, [r0,:64], r1 - vld1.64 {d2}, [r0,:64] - - h264_loop_filter_chroma - - sub r0, r0, r1, lsl #1 - vst1.64 {d16}, [r0,:64], r1 - vst1.64 {d0}, [r0,:64], r1 - - bx lr - .endfunc - -function ff_h264_h_loop_filter_chroma_neon, export=1 - h264_loop_filter_start - - sub r0, r0, #2 - vld1.32 {d18[0]}, [r0], r1 - vld1.32 {d16[0]}, [r0], r1 - vld1.32 {d0[0]}, [r0], r1 - vld1.32 {d2[0]}, [r0], r1 - vld1.32 {d18[1]}, [r0], r1 - vld1.32 {d16[1]}, [r0], r1 - vld1.32 {d0[1]}, [r0], r1 - vld1.32 {d2[1]}, [r0], r1 - - 
vtrn.16 d18, d0 - vtrn.16 d16, d2 - vtrn.8 d18, d16 - vtrn.8 d0, d2 - - h264_loop_filter_chroma - - vtrn.16 d18, d0 - vtrn.16 d16, d2 - vtrn.8 d18, d16 - vtrn.8 d0, d2 - - sub r0, r0, r1, lsl #3 - vst1.32 {d18[0]}, [r0], r1 - vst1.32 {d16[0]}, [r0], r1 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d2[0]}, [r0], r1 - vst1.32 {d18[1]}, [r0], r1 - vst1.32 {d16[1]}, [r0], r1 - vst1.32 {d0[1]}, [r0], r1 - vst1.32 {d2[1]}, [r0], r1 - - bx lr - .endfunc - - /* H.264 qpel MC */ - - .macro lowpass_const r - movw \r, #5 - movt \r, #20 - vmov.32 d6[0], \r - .endm - - .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1 -.if \narrow - t0 .req q0 - t1 .req q8 -.else - t0 .req \d0 - t1 .req \d1 -.endif - vext.8 d2, \r0, \r1, #2 - vext.8 d3, \r0, \r1, #3 - vaddl.u8 q1, d2, d3 - vext.8 d4, \r0, \r1, #1 - vext.8 d5, \r0, \r1, #4 - vaddl.u8 q2, d4, d5 - vext.8 d30, \r0, \r1, #5 - vaddl.u8 t0, \r0, d30 - vext.8 d18, \r2, \r3, #2 - vmla.i16 t0, q1, d6[1] - vext.8 d19, \r2, \r3, #3 - vaddl.u8 q9, d18, d19 - vext.8 d20, \r2, \r3, #1 - vmls.i16 t0, q2, d6[0] - vext.8 d21, \r2, \r3, #4 - vaddl.u8 q10, d20, d21 - vext.8 d31, \r2, \r3, #5 - vaddl.u8 t1, \r2, d31 - vmla.i16 t1, q9, d6[1] - vmls.i16 t1, q10, d6[0] -.if \narrow - vqrshrun.s16 \d0, t0, #5 - vqrshrun.s16 \d1, t1, #5 -.endif - .unreq t0 - .unreq t1 - .endm - - .macro lowpass_8_1 r0, r1, d0, narrow=1 -.if \narrow - t0 .req q0 -.else - t0 .req \d0 -.endif - vext.8 d2, \r0, \r1, #2 - vext.8 d3, \r0, \r1, #3 - vaddl.u8 q1, d2, d3 - vext.8 d4, \r0, \r1, #1 - vext.8 d5, \r0, \r1, #4 - vaddl.u8 q2, d4, d5 - vext.8 d30, \r0, \r1, #5 - vaddl.u8 t0, \r0, d30 - vmla.i16 t0, q1, d6[1] - vmls.i16 t0, q2, d6[0] -.if \narrow - vqrshrun.s16 \d0, t0, #5 -.endif - .unreq t0 - .endm - - .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d - vext.16 q1, \r0, \r1, #2 - vext.16 q0, \r0, \r1, #3 - vaddl.s16 q9, d2, d0 - vext.16 q2, \r0, \r1, #1 - vaddl.s16 q1, d3, d1 - vext.16 q3, \r0, \r1, #4 - vaddl.s16 q10, d4, d6 - vext.16 \r1, \r0, \r1, #5 - vaddl.s16 q2, d5, d7 - 
vaddl.s16 q0, \h0, \h1 - vaddl.s16 q8, \l0, \l1 - - vshl.i32 q3, q9, #4 - vshl.i32 q9, q9, #2 - vshl.i32 q15, q10, #2 - vadd.i32 q9, q9, q3 - vadd.i32 q10, q10, q15 - - vshl.i32 q3, q1, #4 - vshl.i32 q1, q1, #2 - vshl.i32 q15, q2, #2 - vadd.i32 q1, q1, q3 - vadd.i32 q2, q2, q15 - - vadd.i32 q9, q9, q8 - vsub.i32 q9, q9, q10 - - vadd.i32 q1, q1, q0 - vsub.i32 q1, q1, q2 - - vrshrn.s32 d18, q9, #10 - vrshrn.s32 d19, q1, #10 - - vqmovun.s16 \d, q9 - .endm - -function put_h264_qpel16_h_lowpass_neon_packed - mov r4, lr - mov ip, #16 - mov r3, #8 - bl put_h264_qpel8_h_lowpass_neon - sub r1, r1, r2, lsl #4 - add r1, r1, #8 - mov ip, #16 - mov lr, r4 - b put_h264_qpel8_h_lowpass_neon - .endfunc - -function put_h264_qpel16_h_lowpass_neon - push {lr} - mov ip, #16 - bl put_h264_qpel8_h_lowpass_neon - sub r0, r0, r3, lsl #4 - sub r1, r1, r2, lsl #4 - add r0, r0, #8 - add r1, r1, #8 - mov ip, #16 - pop {lr} - .endfunc - -function put_h264_qpel8_h_lowpass_neon -1: vld1.64 {d0, d1}, [r1], r2 - vld1.64 {d16,d17}, [r1], r2 - subs ip, ip, #2 - lowpass_8 d0, d1, d16, d17, d0, d16 - vst1.64 {d0}, [r0,:64], r3 - vst1.64 {d16}, [r0,:64], r3 - bne 1b - bx lr - .endfunc - -function put_h264_qpel16_h_lowpass_l2_neon - push {lr} - mov ip, #16 - bl put_h264_qpel8_h_lowpass_l2_neon - sub r0, r0, r2, lsl #4 - sub r1, r1, r2, lsl #4 - sub r3, r3, r2, lsl #4 - add r0, r0, #8 - add r1, r1, #8 - add r3, r3, #8 - mov ip, #16 - pop {lr} - .endfunc - -function put_h264_qpel8_h_lowpass_l2_neon -1: vld1.64 {d0, d1}, [r1], r2 - vld1.64 {d16,d17}, [r1], r2 - vld1.64 {d28}, [r3], r2 - vld1.64 {d29}, [r3], r2 - subs ip, ip, #2 - lowpass_8 d0, d1, d16, d17, d0, d1 - vrhadd.u8 q0, q0, q14 - vst1.64 {d0}, [r0,:64], r2 - vst1.64 {d1}, [r0,:64], r2 - bne 1b - bx lr - .endfunc - -function put_h264_qpel16_v_lowpass_neon_packed - mov r4, lr - mov r2, #8 - bl put_h264_qpel8_v_lowpass_neon - sub r1, r1, r3, lsl #2 - bl put_h264_qpel8_v_lowpass_neon - sub r1, r1, r3, lsl #4 - sub r1, r1, r3, lsl #2 - add r1, r1, #8 
- bl put_h264_qpel8_v_lowpass_neon - sub r1, r1, r3, lsl #2 - mov lr, r4 - b put_h264_qpel8_v_lowpass_neon - .endfunc - -function put_h264_qpel16_v_lowpass_neon - mov r4, lr - bl put_h264_qpel8_v_lowpass_neon - sub r1, r1, r3, lsl #2 - bl put_h264_qpel8_v_lowpass_neon - sub r0, r0, r2, lsl #4 - add r0, r0, #8 - sub r1, r1, r3, lsl #4 - sub r1, r1, r3, lsl #2 - add r1, r1, #8 - bl put_h264_qpel8_v_lowpass_neon - sub r1, r1, r3, lsl #2 - mov lr, r4 - .endfunc - -function put_h264_qpel8_v_lowpass_neon - vld1.64 {d8}, [r1], r3 - vld1.64 {d10}, [r1], r3 - vld1.64 {d12}, [r1], r3 - vld1.64 {d14}, [r1], r3 - vld1.64 {d22}, [r1], r3 - vld1.64 {d24}, [r1], r3 - vld1.64 {d26}, [r1], r3 - vld1.64 {d28}, [r1], r3 - vld1.64 {d9}, [r1], r3 - vld1.64 {d11}, [r1], r3 - vld1.64 {d13}, [r1], r3 - vld1.64 {d15}, [r1], r3 - vld1.64 {d23}, [r1] - - transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 - lowpass_8 d8, d9, d10, d11, d8, d10 - lowpass_8 d12, d13, d14, d15, d12, d14 - lowpass_8 d22, d23, d24, d25, d22, d24 - lowpass_8 d26, d27, d28, d29, d26, d28 - transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28 - - vst1.64 {d8}, [r0,:64], r2 - vst1.64 {d10}, [r0,:64], r2 - vst1.64 {d12}, [r0,:64], r2 - vst1.64 {d14}, [r0,:64], r2 - vst1.64 {d22}, [r0,:64], r2 - vst1.64 {d24}, [r0,:64], r2 - vst1.64 {d26}, [r0,:64], r2 - vst1.64 {d28}, [r0,:64], r2 - - bx lr - .endfunc - -function put_h264_qpel16_v_lowpass_l2_neon - mov r4, lr - bl put_h264_qpel8_v_lowpass_l2_neon - sub r1, r1, r3, lsl #2 - bl put_h264_qpel8_v_lowpass_l2_neon - sub r0, r0, r3, lsl #4 - sub ip, ip, r2, lsl #4 - add r0, r0, #8 - add ip, ip, #8 - sub r1, r1, r3, lsl #4 - sub r1, r1, r3, lsl #2 - add r1, r1, #8 - bl put_h264_qpel8_v_lowpass_l2_neon - sub r1, r1, r3, lsl #2 - mov lr, r4 - .endfunc - -function put_h264_qpel8_v_lowpass_l2_neon - vld1.64 {d8}, [r1], r3 - vld1.64 {d10}, [r1], r3 - vld1.64 {d12}, [r1], r3 - vld1.64 {d14}, [r1], r3 - vld1.64 {d22}, [r1], r3 - vld1.64 {d24}, [r1], r3 - vld1.64 {d26}, [r1], r3 - vld1.64 
{d28}, [r1], r3 - vld1.64 {d9}, [r1], r3 - vld1.64 {d11}, [r1], r3 - vld1.64 {d13}, [r1], r3 - vld1.64 {d15}, [r1], r3 - vld1.64 {d23}, [r1] - - transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 - lowpass_8 d8, d9, d10, d11, d8, d9 - lowpass_8 d12, d13, d14, d15, d12, d13 - lowpass_8 d22, d23, d24, d25, d22, d23 - lowpass_8 d26, d27, d28, d29, d26, d27 - transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27 - - vld1.64 {d0}, [ip], r2 - vld1.64 {d1}, [ip], r2 - vld1.64 {d2}, [ip], r2 - vld1.64 {d3}, [ip], r2 - vld1.64 {d4}, [ip], r2 - vrhadd.u8 q0, q0, q4 - vld1.64 {d5}, [ip], r2 - vrhadd.u8 q1, q1, q6 - vld1.64 {d10}, [ip], r2 - vrhadd.u8 q2, q2, q11 - vld1.64 {d11}, [ip], r2 - - vst1.64 {d0}, [r0,:64], r3 - vst1.64 {d1}, [r0,:64], r3 - vrhadd.u8 q5, q5, q13 - vst1.64 {d2}, [r0,:64], r3 - vst1.64 {d3}, [r0,:64], r3 - vst1.64 {d4}, [r0,:64], r3 - vst1.64 {d5}, [r0,:64], r3 - vst1.64 {d10}, [r0,:64], r3 - vst1.64 {d11}, [r0,:64], r3 - - bx lr - .endfunc - -function put_h264_qpel8_hv_lowpass_neon_top - lowpass_const ip - mov ip, #12 -1: vld1.64 {d0, d1}, [r1], r3 - vld1.64 {d16,d17}, [r1], r3 - subs ip, ip, #2 - lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0 - vst1.64 {d22-d25}, [r4,:128]! - bne 1b - - vld1.64 {d0, d1}, [r1] - lowpass_8_1 d0, d1, q12, narrow=0 - - mov ip, #-16 - add r4, r4, ip - vld1.64 {d30,d31}, [r4,:128], ip - vld1.64 {d20,d21}, [r4,:128], ip - vld1.64 {d18,d19}, [r4,:128], ip - vld1.64 {d16,d17}, [r4,:128], ip - vld1.64 {d14,d15}, [r4,:128], ip - vld1.64 {d12,d13}, [r4,:128], ip - vld1.64 {d10,d11}, [r4,:128], ip - vld1.64 {d8, d9}, [r4,:128], ip - vld1.64 {d6, d7}, [r4,:128], ip - vld1.64 {d4, d5}, [r4,:128], ip - vld1.64 {d2, d3}, [r4,:128], ip - vld1.64 {d0, d1}, [r4,:128] - - swap4 d1, d3, d5, d7, d8, d10, d12, d14 - transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7 - - swap4 d17, d19, d21, d31, d24, d26, d28, d22 - transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11 - - vst1.64 {d30,d31}, [r4,:128]! - vst1.64 {d6, d7}, [r4,:128]! 
- vst1.64 {d20,d21}, [r4,:128]! - vst1.64 {d4, d5}, [r4,:128]! - vst1.64 {d18,d19}, [r4,:128]! - vst1.64 {d2, d3}, [r4,:128]! - vst1.64 {d16,d17}, [r4,:128]! - vst1.64 {d0, d1}, [r4,:128] - - lowpass_8.16 q4, q12, d8, d9, d24, d25, d8 - lowpass_8.16 q5, q13, d10, d11, d26, d27, d9 - lowpass_8.16 q6, q14, d12, d13, d28, d29, d10 - lowpass_8.16 q7, q11, d14, d15, d22, d23, d11 - - vld1.64 {d16,d17}, [r4,:128], ip - vld1.64 {d30,d31}, [r4,:128], ip - lowpass_8.16 q8, q15, d16, d17, d30, d31, d12 - vld1.64 {d16,d17}, [r4,:128], ip - vld1.64 {d30,d31}, [r4,:128], ip - lowpass_8.16 q8, q15, d16, d17, d30, d31, d13 - vld1.64 {d16,d17}, [r4,:128], ip - vld1.64 {d30,d31}, [r4,:128], ip - lowpass_8.16 q8, q15, d16, d17, d30, d31, d14 - vld1.64 {d16,d17}, [r4,:128], ip - vld1.64 {d30,d31}, [r4,:128] - lowpass_8.16 q8, q15, d16, d17, d30, d31, d15 - - transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11 - - bx lr - .endfunc - -function put_h264_qpel8_hv_lowpass_neon - mov r10, lr - bl put_h264_qpel8_hv_lowpass_neon_top - vst1.64 {d12}, [r0,:64], r2 - vst1.64 {d13}, [r0,:64], r2 - vst1.64 {d14}, [r0,:64], r2 - vst1.64 {d15}, [r0,:64], r2 - vst1.64 {d8}, [r0,:64], r2 - vst1.64 {d9}, [r0,:64], r2 - vst1.64 {d10}, [r0,:64], r2 - vst1.64 {d11}, [r0,:64], r2 - - mov lr, r10 - bx lr - .endfunc - -function put_h264_qpel8_hv_lowpass_l2_neon - mov r10, lr - bl put_h264_qpel8_hv_lowpass_neon_top - - vld1.64 {d0, d1}, [r2,:128]! - vld1.64 {d2, d3}, [r2,:128]! - vrhadd.u8 q0, q0, q6 - vld1.64 {d4, d5}, [r2,:128]! - vrhadd.u8 q1, q1, q7 - vld1.64 {d6, d7}, [r2,:128]! 
- vrhadd.u8 q2, q2, q4 - - vst1.64 {d0}, [r0,:64], r3 - vrhadd.u8 q3, q3, q5 - vst1.64 {d1}, [r0,:64], r3 - vst1.64 {d2}, [r0,:64], r3 - vst1.64 {d3}, [r0,:64], r3 - vst1.64 {d4}, [r0,:64], r3 - vst1.64 {d5}, [r0,:64], r3 - vst1.64 {d6}, [r0,:64], r3 - vst1.64 {d7}, [r0,:64], r3 - - mov lr, r10 - bx lr - .endfunc - -function put_h264_qpel16_hv_lowpass_neon - mov r9, lr - bl put_h264_qpel8_hv_lowpass_neon - sub r1, r1, r3, lsl #2 - bl put_h264_qpel8_hv_lowpass_neon - sub r1, r1, r3, lsl #4 - sub r1, r1, r3, lsl #2 - add r1, r1, #8 - sub r0, r0, r2, lsl #4 - add r0, r0, #8 - bl put_h264_qpel8_hv_lowpass_neon - sub r1, r1, r3, lsl #2 - mov lr, r9 - b put_h264_qpel8_hv_lowpass_neon - .endfunc - -function put_h264_qpel16_hv_lowpass_l2_neon - mov r9, lr - sub r2, r4, #256 - bl put_h264_qpel8_hv_lowpass_l2_neon - sub r1, r1, r3, lsl #2 - bl put_h264_qpel8_hv_lowpass_l2_neon - sub r1, r1, r3, lsl #4 - sub r1, r1, r3, lsl #2 - add r1, r1, #8 - sub r0, r0, r3, lsl #4 - add r0, r0, #8 - bl put_h264_qpel8_hv_lowpass_l2_neon - sub r1, r1, r3, lsl #2 - mov lr, r9 - b put_h264_qpel8_hv_lowpass_l2_neon - .endfunc - -function ff_put_h264_qpel8_mc10_neon, export=1 - lowpass_const r3 - mov r3, r1 - sub r1, r1, #2 - mov ip, #8 - b put_h264_qpel8_h_lowpass_l2_neon - .endfunc - -function ff_put_h264_qpel8_mc20_neon, export=1 - lowpass_const r3 - sub r1, r1, #2 - mov r3, r2 - mov ip, #8 - b put_h264_qpel8_h_lowpass_neon - .endfunc - -function ff_put_h264_qpel8_mc30_neon, export=1 - lowpass_const r3 - add r3, r1, #1 - sub r1, r1, #2 - mov ip, #8 - b put_h264_qpel8_h_lowpass_l2_neon - .endfunc - -function ff_put_h264_qpel8_mc01_neon, export=1 - push {lr} - mov ip, r1 -put_h264_qpel8_mc01: - lowpass_const r3 - mov r3, r2 - sub r1, r1, r2, lsl #1 - vpush {d8-d15} - bl put_h264_qpel8_v_lowpass_l2_neon - vpop {d8-d15} - pop {pc} - .endfunc - -function ff_put_h264_qpel8_mc11_neon, export=1 - push {r0, r1, r2, lr} -put_h264_qpel8_mc11: - lowpass_const r3 - sub sp, sp, #64 - mov r0, sp - sub r1, 
r1, #2 - mov r3, #8 - mov ip, #8 - vpush {d8-d15} - bl put_h264_qpel8_h_lowpass_neon - ldrd r0, [sp, #128] - mov r3, r2 - add ip, sp, #64 - sub r1, r1, r2, lsl #1 - mov r2, #8 - bl put_h264_qpel8_v_lowpass_l2_neon - vpop {d8-d15} - add sp, sp, #76 - pop {pc} - .endfunc - -function ff_put_h264_qpel8_mc21_neon, export=1 - push {r0, r1, r4, r10, r11, lr} -put_h264_qpel8_mc21: - lowpass_const r3 - mov r11, sp - bic sp, sp, #15 - sub sp, sp, #(8*8+16*12) - sub r1, r1, #2 - mov r3, #8 - mov r0, sp - mov ip, #8 - vpush {d8-d15} - bl put_h264_qpel8_h_lowpass_neon - mov r4, r0 - ldrd r0, [r11] - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - mov r3, r2 - sub r2, r4, #64 - bl put_h264_qpel8_hv_lowpass_l2_neon - vpop {d8-d15} - add sp, r11, #8 - pop {r4, r10, r11, pc} - .endfunc - -function ff_put_h264_qpel8_mc31_neon, export=1 - add r1, r1, #1 - push {r0, r1, r2, lr} - sub r1, r1, #1 - b put_h264_qpel8_mc11 - .endfunc - -function ff_put_h264_qpel8_mc02_neon, export=1 - push {lr} - lowpass_const r3 - sub r1, r1, r2, lsl #1 - mov r3, r2 - vpush {d8-d15} - bl put_h264_qpel8_v_lowpass_neon - vpop {d8-d15} - pop {pc} - .endfunc - -function ff_put_h264_qpel8_mc12_neon, export=1 - push {r0, r1, r4, r10, r11, lr} -put_h264_qpel8_mc12: - lowpass_const r3 - mov r11, sp - bic sp, sp, #15 - sub sp, sp, #(8*8+16*12) - sub r1, r1, r2, lsl #1 - mov r3, r2 - mov r2, #8 - mov r0, sp - vpush {d8-d15} - bl put_h264_qpel8_v_lowpass_neon - mov r4, r0 - ldrd r0, [r11] - sub r1, r1, r3, lsl #1 - sub r1, r1, #2 - sub r2, r4, #64 - bl put_h264_qpel8_hv_lowpass_l2_neon - vpop {d8-d15} - add sp, r11, #8 - pop {r4, r10, r11, pc} - .endfunc - -function ff_put_h264_qpel8_mc22_neon, export=1 - push {r4, r10, r11, lr} - mov r11, sp - bic sp, sp, #15 - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - mov r3, r2 - sub sp, sp, #(16*12) - mov r4, sp - vpush {d8-d15} - bl put_h264_qpel8_hv_lowpass_neon - vpop {d8-d15} - mov sp, r11 - pop {r4, r10, r11, pc} - .endfunc - -function ff_put_h264_qpel8_mc32_neon, export=1 - push 
{r0, r1, r4, r10, r11, lr} - add r1, r1, #1 - b put_h264_qpel8_mc12 - .endfunc - -function ff_put_h264_qpel8_mc03_neon, export=1 - push {lr} - add ip, r1, r2 - b put_h264_qpel8_mc01 - .endfunc - -function ff_put_h264_qpel8_mc13_neon, export=1 - push {r0, r1, r2, lr} - add r1, r1, r2 - b put_h264_qpel8_mc11 - .endfunc - -function ff_put_h264_qpel8_mc23_neon, export=1 - push {r0, r1, r4, r10, r11, lr} - add r1, r1, r2 - b put_h264_qpel8_mc21 - .endfunc - -function ff_put_h264_qpel8_mc33_neon, export=1 - add r1, r1, #1 - push {r0, r1, r2, lr} - add r1, r1, r2 - sub r1, r1, #1 - b put_h264_qpel8_mc11 - .endfunc - -function ff_put_h264_qpel16_mc10_neon, export=1 - lowpass_const r3 - mov r3, r1 - sub r1, r1, #2 - b put_h264_qpel16_h_lowpass_l2_neon - .endfunc - -function ff_put_h264_qpel16_mc20_neon, export=1 - lowpass_const r3 - sub r1, r1, #2 - mov r3, r2 - b put_h264_qpel16_h_lowpass_neon - .endfunc - -function ff_put_h264_qpel16_mc30_neon, export=1 - lowpass_const r3 - add r3, r1, #1 - sub r1, r1, #2 - b put_h264_qpel16_h_lowpass_l2_neon - .endfunc - -function ff_put_h264_qpel16_mc01_neon, export=1 - push {r4, lr} - mov ip, r1 -put_h264_qpel16_mc01: - lowpass_const r3 - mov r3, r2 - sub r1, r1, r2, lsl #1 - vpush {d8-d15} - bl put_h264_qpel16_v_lowpass_l2_neon - vpop {d8-d15} - pop {r4, pc} - .endfunc - -function ff_put_h264_qpel16_mc11_neon, export=1 - push {r0, r1, r4, lr} -put_h264_qpel16_mc11: - lowpass_const r3 - sub sp, sp, #256 - mov r0, sp - sub r1, r1, #2 - mov r3, #16 - vpush {d8-d15} - bl put_h264_qpel16_h_lowpass_neon - add r0, sp, #256 - ldrd r0, [r0, #64] - mov r3, r2 - add ip, sp, #64 - sub r1, r1, r2, lsl #1 - mov r2, #16 - bl put_h264_qpel16_v_lowpass_l2_neon - vpop {d8-d15} - add sp, sp, #(256+8) - pop {r4, pc} - .endfunc - -function ff_put_h264_qpel16_mc21_neon, export=1 - push {r0, r1, r4-r5, r9-r11, lr} -put_h264_qpel16_mc21: - lowpass_const r3 - mov r11, sp - bic sp, sp, #15 - sub sp, sp, #(16*16+16*12) - sub r1, r1, #2 - mov r0, sp - vpush 
{d8-d15} - bl put_h264_qpel16_h_lowpass_neon_packed - mov r4, r0 - ldrd r0, [r11] - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - mov r3, r2 - bl put_h264_qpel16_hv_lowpass_l2_neon - vpop {d8-d15} - add sp, r11, #8 - pop {r4-r5, r9-r11, pc} - .endfunc - -function ff_put_h264_qpel16_mc31_neon, export=1 - add r1, r1, #1 - push {r0, r1, r4, lr} - sub r1, r1, #1 - b put_h264_qpel16_mc11 - .endfunc - -function ff_put_h264_qpel16_mc02_neon, export=1 - push {r4, lr} - lowpass_const r3 - sub r1, r1, r2, lsl #1 - mov r3, r2 - vpush {d8-d15} - bl put_h264_qpel16_v_lowpass_neon - vpop {d8-d15} - pop {r4, pc} - .endfunc - -function ff_put_h264_qpel16_mc12_neon, export=1 - push {r0, r1, r4-r5, r9-r11, lr} -put_h264_qpel16_mc12: - lowpass_const r3 - mov r11, sp - bic sp, sp, #15 - sub sp, sp, #(16*16+16*12) - sub r1, r1, r2, lsl #1 - mov r0, sp - mov r3, r2 - vpush {d8-d15} - bl put_h264_qpel16_v_lowpass_neon_packed - mov r4, r0 - ldrd r0, [r11] - sub r1, r1, r3, lsl #1 - sub r1, r1, #2 - mov r2, r3 - bl put_h264_qpel16_hv_lowpass_l2_neon - vpop {d8-d15} - add sp, r11, #8 - pop {r4-r5, r9-r11, pc} - .endfunc - -function ff_put_h264_qpel16_mc22_neon, export=1 - push {r4, r9-r11, lr} - lowpass_const r3 - mov r11, sp - bic sp, sp, #15 - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - mov r3, r2 - sub sp, sp, #(16*12) - mov r4, sp - vpush {d8-d15} - bl put_h264_qpel16_hv_lowpass_neon - vpop {d8-d15} - mov sp, r11 - pop {r4, r9-r11, pc} - .endfunc - -function ff_put_h264_qpel16_mc32_neon, export=1 - push {r0, r1, r4-r5, r9-r11, lr} - add r1, r1, #1 - b put_h264_qpel16_mc12 - .endfunc - -function ff_put_h264_qpel16_mc03_neon, export=1 - push {r4, lr} - add ip, r1, r2 - b put_h264_qpel16_mc01 - .endfunc - -function ff_put_h264_qpel16_mc13_neon, export=1 - push {r0, r1, r4, lr} - add r1, r1, r2 - b put_h264_qpel16_mc11 - .endfunc - -function ff_put_h264_qpel16_mc23_neon, export=1 - push {r0, r1, r4-r5, r9-r11, lr} - add r1, r1, r2 - b put_h264_qpel16_mc21 - .endfunc - -function 
ff_put_h264_qpel16_mc33_neon, export=1 - add r1, r1, #1 - push {r0, r1, r4, lr} - add r1, r1, r2 - sub r1, r1, #1 - b put_h264_qpel16_mc11 - .endfunc diff -r c30b92cf446b -r 9281a8a9387a armv4l/h264idct_neon.S --- a/armv4l/h264idct_neon.S Wed Dec 17 00:39:45 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2008 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "asm.S" - - .fpu neon - - .text - -function ff_h264_idct_add_neon, export=1 - mov r3, #(1<<5) - vmov.i16 d16, #0 - vmov.16 d16[0], r3 - vld1.64 {d0-d3}, [r1,:128] - vadd.i16 d0, d0, d16 - - vswp d1, d2 - vadd.i16 d4, d0, d1 - vshr.s16 q8, q1, #1 - vsub.i16 d5, d0, d1 - vadd.i16 d6, d2, d17 - vsub.i16 d7, d16, d3 - vadd.i16 q0, q2, q3 - vsub.i16 q1, q2, q3 - - vtrn.16 d0, d1 - vtrn.16 d3, d2 - vtrn.32 d0, d3 - vtrn.32 d1, d2 - - vadd.i16 d4, d0, d3 - vld1.32 {d18[0]}, [r0,:32], r2 - vswp d1, d3 - vshr.s16 q8, q1, #1 - vld1.32 {d19[1]}, [r0,:32], r2 - vsub.i16 d5, d0, d1 - vld1.32 {d18[1]}, [r0,:32], r2 - vadd.i16 d6, d16, d3 - vld1.32 {d19[0]}, [r0,:32], r2 - vsub.i16 d7, d2, d17 - sub r0, r0, r2, lsl #2 - vadd.i16 q0, q2, q3 - vsub.i16 q1, q2, q3 - - vshr.s16 q0, q0, #6 - vshr.s16 q1, q1, #6 - - vaddw.u8 q0, q0, d18 - 
vaddw.u8 q1, q1, d19 - - vqmovun.s16 d0, q0 - vqmovun.s16 d1, q1 - - vst1.32 {d0[0]}, [r0,:32], r2 - vst1.32 {d1[1]}, [r0,:32], r2 - vst1.32 {d0[1]}, [r0,:32], r2 - vst1.32 {d1[0]}, [r0,:32], r2 - - bx lr - .endfunc - -function ff_h264_idct_dc_add_neon, export=1 - vld1.16 {d2[],d3[]}, [r1,:16] - vrshr.s16 q1, q1, #6 - vld1.32 {d0[0]}, [r0,:32], r2 - vld1.32 {d0[1]}, [r0,:32], r2 - vaddw.u8 q2, q1, d0 - vld1.32 {d1[0]}, [r0,:32], r2 - vld1.32 {d1[1]}, [r0,:32], r2 - vaddw.u8 q1, q1, d1 - vqmovun.s16 d0, q2 - vqmovun.s16 d1, q1 - sub r0, r0, r2, lsl #2 - vst1.32 {d0[0]}, [r0,:32], r2 - vst1.32 {d0[1]}, [r0,:32], r2 - vst1.32 {d1[0]}, [r0,:32], r2 - vst1.32 {d1[1]}, [r0,:32], r2 - bx lr - .endfunc diff -r c30b92cf446b -r 9281a8a9387a armv4l/jrevdct_arm.S --- a/armv4l/jrevdct_arm.S Wed Dec 17 00:39:45 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,388 +0,0 @@ -/* - C-like prototype : - void j_rev_dct_ARM(DCTBLOCK data) - - With DCTBLOCK being a pointer to an array of 64 'signed shorts' - - Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org) - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER - IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -*/ - -#include "asm.S" - -#define FIX_0_298631336 2446 -#define FIX_0_541196100 4433 -#define FIX_0_765366865 6270 -#define FIX_1_175875602 9633 -#define FIX_1_501321110 12299 -#define FIX_2_053119869 16819 -#define FIX_3_072711026 25172 -#define FIX_M_0_390180644 -3196 -#define FIX_M_0_899976223 -7373 -#define FIX_M_1_847759065 -15137 -#define FIX_M_1_961570560 -16069 -#define FIX_M_2_562915447 -20995 -#define FIX_0xFFFF 0xFFFF - -#define FIX_0_298631336_ID 0 -#define FIX_0_541196100_ID 4 -#define FIX_0_765366865_ID 8 -#define FIX_1_175875602_ID 12 -#define FIX_1_501321110_ID 16 -#define FIX_2_053119869_ID 20 -#define FIX_3_072711026_ID 24 -#define FIX_M_0_390180644_ID 28 -#define FIX_M_0_899976223_ID 32 -#define FIX_M_1_847759065_ID 36 -#define FIX_M_1_961570560_ID 40 -#define FIX_M_2_562915447_ID 44 -#define FIX_0xFFFF_ID 48 - .text - .align - -function j_rev_dct_ARM, export=1 - stmdb sp!, { r4 - r12, lr } @ all callee saved regs - - sub sp, sp, #4 @ reserve some space on the stack - str r0, [ sp ] @ save the DCT pointer to the stack - - mov lr, r0 @ lr = pointer to the current row - mov r12, #8 @ r12 = row-counter - add r11, pc, #(const_array-.-8) @ r11 = base pointer to the constants array -row_loop: - ldrsh r0, [lr, # 0] @ r0 = 'd0' - ldrsh r2, [lr, # 2] @ r2 = 'd2' - - @ Optimization for row that have all items except the first set to 0 - @ (this works as the DCTELEMS are always 4-byte aligned) - ldr r5, [lr, # 0] - ldr r6, [lr, # 4] - ldr r3, [lr, # 8] - ldr r4, [lr, #12] - orr r3, r3, r4 - orr r3, r3, r6 - orrs r5, r3, r5 - beq end_of_row_loop @ nothing to be done as ALL of them are '0' - orrs r3, r3, r2 - beq empty_row - - ldrsh r1, [lr, # 8] @ r1 = 'd1' - ldrsh r4, [lr, # 4] @ r4 = 'd4' - ldrsh r6, 
[lr, # 6] @ r6 = 'd6' - - ldr r3, [r11, #FIX_0_541196100_ID] - add r7, r2, r6 - ldr r5, [r11, #FIX_M_1_847759065_ID] - mul r7, r3, r7 @ r7 = z1 - ldr r3, [r11, #FIX_0_765366865_ID] - mla r6, r5, r6, r7 @ r6 = tmp2 - add r5, r0, r4 @ r5 = tmp0 - mla r2, r3, r2, r7 @ r2 = tmp3 - sub r3, r0, r4 @ r3 = tmp1 - - add r0, r2, r5, lsl #13 @ r0 = tmp10 - rsb r2, r2, r5, lsl #13 @ r2 = tmp13 - add r4, r6, r3, lsl #13 @ r4 = tmp11 - rsb r3, r6, r3, lsl #13 @ r3 = tmp12 - - stmdb sp!, { r0, r2, r3, r4 } @ save on the stack tmp10, tmp13, tmp12, tmp11 - - ldrsh r3, [lr, #10] @ r3 = 'd3' - ldrsh r5, [lr, #12] @ r5 = 'd5' - ldrsh r7, [lr, #14] @ r7 = 'd7' - - add r0, r3, r5 @ r0 = 'z2' - add r2, r1, r7 @ r2 = 'z1' - add r4, r3, r7 @ r4 = 'z3' - add r6, r1, r5 @ r6 = 'z4' - ldr r9, [r11, #FIX_1_175875602_ID] - add r8, r4, r6 @ r8 = z3 + z4 - ldr r10, [r11, #FIX_M_0_899976223_ID] - mul r8, r9, r8 @ r8 = 'z5' - ldr r9, [r11, #FIX_M_2_562915447_ID] - mul r2, r10, r2 @ r2 = 'z1' - ldr r10, [r11, #FIX_M_1_961570560_ID] - mul r0, r9, r0 @ r0 = 'z2' - ldr r9, [r11, #FIX_M_0_390180644_ID] - mla r4, r10, r4, r8 @ r4 = 'z3' - ldr r10, [r11, #FIX_0_298631336_ID] - mla r6, r9, r6, r8 @ r6 = 'z4' - ldr r9, [r11, #FIX_2_053119869_ID] - mla r7, r10, r7, r2 @ r7 = tmp0 + z1 - ldr r10, [r11, #FIX_3_072711026_ID] - mla r5, r9, r5, r0 @ r5 = tmp1 + z2 - ldr r9, [r11, #FIX_1_501321110_ID] - mla r3, r10, r3, r0 @ r3 = tmp2 + z2 - add r7, r7, r4 @ r7 = tmp0 - mla r1, r9, r1, r2 @ r1 = tmp3 + z1 - add r5, r5, r6 @ r5 = tmp1 - add r3, r3, r4 @ r3 = tmp2 - add r1, r1, r6 @ r1 = tmp3 - - ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11 - @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0 - - @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) - add r8, r0, r1 - add r8, r8, #(1<<10) - mov r8, r8, asr #11 - strh r8, [lr, # 0] - - @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) - sub r8, r0, r1 - add r8, r8, #(1<<10) - mov r8, r8, asr #11 - strh r8, [lr, #14] - - @ 
Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) - add r8, r6, r3 - add r8, r8, #(1<<10) - mov r8, r8, asr #11 - strh r8, [lr, # 2] - - @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) - sub r8, r6, r3 - add r8, r8, #(1<<10) - mov r8, r8, asr #11 - strh r8, [lr, #12] - - @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) - add r8, r4, r5 - add r8, r8, #(1<<10) - mov r8, r8, asr #11 - strh r8, [lr, # 4] - - @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) - sub r8, r4, r5 - add r8, r8, #(1<<10) - mov r8, r8, asr #11 - strh r8, [lr, #10] - - @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) - add r8, r2, r7 - add r8, r8, #(1<<10) - mov r8, r8, asr #11 - strh r8, [lr, # 6] - - @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) - sub r8, r2, r7 - add r8, r8, #(1<<10) - mov r8, r8, asr #11 - strh r8, [lr, # 8] - - @ End of row loop - add lr, lr, #16 - subs r12, r12, #1 - bne row_loop - beq start_column_loop - -empty_row: - ldr r1, [r11, #FIX_0xFFFF_ID] - mov r0, r0, lsl #2 - and r0, r0, r1 - add r0, r0, r0, lsl #16 - str r0, [lr, # 0] - str r0, [lr, # 4] - str r0, [lr, # 8] - str r0, [lr, #12] - -end_of_row_loop: - @ End of loop - add lr, lr, #16 - subs r12, r12, #1 - bne row_loop - -start_column_loop: - @ Start of column loop - ldr lr, [ sp ] - mov r12, #8 -column_loop: - ldrsh r0, [lr, #( 0*8)] @ r0 = 'd0' - ldrsh r2, [lr, #( 4*8)] @ r2 = 'd2' - ldrsh r4, [lr, #( 8*8)] @ r4 = 'd4' - ldrsh r6, [lr, #(12*8)] @ r6 = 'd6' - - ldr r3, [r11, #FIX_0_541196100_ID] - add r1, r2, r6 - ldr r5, [r11, #FIX_M_1_847759065_ID] - mul r1, r3, r1 @ r1 = z1 - ldr r3, [r11, #FIX_0_765366865_ID] - mla r6, r5, r6, r1 @ r6 = tmp2 - add r5, r0, r4 @ r5 = tmp0 - mla r2, r3, r2, r1 @ r2 = tmp3 - sub r3, r0, r4 @ r3 = tmp1 - - add r0, r2, r5, lsl #13 @ r0 = tmp10 - rsb r2, r2, r5, lsl #13 @ r2 = tmp13 - add r4, r6, r3, lsl #13 @ r4 = tmp11 - rsb r6, r6, r3, lsl #13 @ r6 = tmp12 - - ldrsh r1, [lr, #( 2*8)] @ r1 = 'd1' - ldrsh r3, [lr, #( 6*8)] @ r3 = 'd3' - ldrsh r5, 
[lr, #(10*8)] @ r5 = 'd5' - ldrsh r7, [lr, #(14*8)] @ r7 = 'd7' - - @ Check for empty odd column (happens about 20 to 25 % of the time according to my stats) - orr r9, r1, r3 - orr r10, r5, r7 - orrs r10, r9, r10 - beq empty_odd_column - - stmdb sp!, { r0, r2, r4, r6 } @ save on the stack tmp10, tmp13, tmp12, tmp11 - - add r0, r3, r5 @ r0 = 'z2' - add r2, r1, r7 @ r2 = 'z1' - add r4, r3, r7 @ r4 = 'z3' - add r6, r1, r5 @ r6 = 'z4' - ldr r9, [r11, #FIX_1_175875602_ID] - add r8, r4, r6 - ldr r10, [r11, #FIX_M_0_899976223_ID] - mul r8, r9, r8 @ r8 = 'z5' - ldr r9, [r11, #FIX_M_2_562915447_ID] - mul r2, r10, r2 @ r2 = 'z1' - ldr r10, [r11, #FIX_M_1_961570560_ID] - mul r0, r9, r0 @ r0 = 'z2' - ldr r9, [r11, #FIX_M_0_390180644_ID] - mla r4, r10, r4, r8 @ r4 = 'z3' - ldr r10, [r11, #FIX_0_298631336_ID] - mla r6, r9, r6, r8 @ r6 = 'z4' - ldr r9, [r11, #FIX_2_053119869_ID] - mla r7, r10, r7, r2 @ r7 = tmp0 + z1 - ldr r10, [r11, #FIX_3_072711026_ID] - mla r5, r9, r5, r0 @ r5 = tmp1 + z2 - ldr r9, [r11, #FIX_1_501321110_ID] - mla r3, r10, r3, r0 @ r3 = tmp2 + z2 - add r7, r7, r4 @ r7 = tmp0 - mla r1, r9, r1, r2 @ r1 = tmp3 + z1 - add r5, r5, r6 @ r5 = tmp1 - add r3, r3, r4 @ r3 = tmp2 - add r1, r1, r6 @ r1 = tmp3 - - ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12 - @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0 - - @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) - add r8, r0, r1 - add r8, r8, #(1<<17) - mov r8, r8, asr #18 - strh r8, [lr, #( 0*8)] - - @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) - sub r8, r0, r1 - add r8, r8, #(1<<17) - mov r8, r8, asr #18 - strh r8, [lr, #(14*8)] - - @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) - add r8, r4, r3 - add r8, r8, #(1<<17) - mov r8, r8, asr #18 - strh r8, [lr, #( 2*8)] - - @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) - sub r8, r4, r3 - add r8, r8, #(1<<17) - mov r8, r8, asr #18 - strh r8, [lr, #(12*8)] - - @ Compute DESCALE(tmp12 + tmp1, 
CONST_BITS+PASS1_BITS+3) - add r8, r6, r5 - add r8, r8, #(1<<17) - mov r8, r8, asr #18 - strh r8, [lr, #( 4*8)] - - @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) - sub r8, r6, r5 - add r8, r8, #(1<<17) - mov r8, r8, asr #18 - strh r8, [lr, #(10*8)] - - @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) - add r8, r2, r7 - add r8, r8, #(1<<17) - mov r8, r8, asr #18 - strh r8, [lr, #( 6*8)] - - @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) - sub r8, r2, r7 - add r8, r8, #(1<<17) - mov r8, r8, asr #18 - strh r8, [lr, #( 8*8)] - - @ End of row loop - add lr, lr, #2 - subs r12, r12, #1 - bne column_loop - beq the_end - -empty_odd_column: - @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) - @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) - add r0, r0, #(1<<17) - mov r0, r0, asr #18 - strh r0, [lr, #( 0*8)] - strh r0, [lr, #(14*8)] - - @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) - @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) - add r4, r4, #(1<<17) - mov r4, r4, asr #18 - strh r4, [lr, #( 2*8)] - strh r4, [lr, #(12*8)] - - @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) - @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) - add r6, r6, #(1<<17) - mov r6, r6, asr #18 - strh r6, [lr, #( 4*8)] - strh r6, [lr, #(10*8)] - - @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) - @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) - add r2, r2, #(1<<17) - mov r2, r2, asr #18 - strh r2, [lr, #( 6*8)] - strh r2, [lr, #( 8*8)] - - @ End of row loop - add lr, lr, #2 - subs r12, r12, #1 - bne column_loop - -the_end: - @ The end.... 
- add sp, sp, #4 - ldmia sp!, { r4 - r12, pc } @ restore callee saved regs and return - -const_array: - .align - .word FIX_0_298631336 - .word FIX_0_541196100 - .word FIX_0_765366865 - .word FIX_1_175875602 - .word FIX_1_501321110 - .word FIX_2_053119869 - .word FIX_3_072711026 - .word FIX_M_0_390180644 - .word FIX_M_0_899976223 - .word FIX_M_1_847759065 - .word FIX_M_1_961570560 - .word FIX_M_2_562915447 - .word FIX_0xFFFF diff -r c30b92cf446b -r 9281a8a9387a armv4l/mathops.h --- a/armv4l/mathops.h Wed Dec 17 00:39:45 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,93 +0,0 @@ -/* - * simple math operations - * Copyright (c) 2006 Michael Niedermayer et al - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_ARMV4L_MATHOPS_H -#define AVCODEC_ARMV4L_MATHOPS_H - -#include -#include "libavutil/common.h" - -# define MULL MULL -static inline av_const int MULL(int a, int b, unsigned shift) -{ - int lo, hi; - __asm__("smull %0, %1, %2, %3 \n\t" - "mov %0, %0, lsr %4 \n\t" - "add %1, %0, %1, lsl %5 \n\t" - : "=&r"(lo), "=&r"(hi) - : "r"(b), "r"(a), "i"(shift), "i"(32-shift)); - return hi; -} - -#define MULH MULH -#ifdef HAVE_ARMV6 -static inline av_const int MULH(int a, int b) -{ - int r; - __asm__ ("smmul %0, %1, %2" : "=r"(r) : "r"(a), "r"(b)); - return r; -} -#else -static inline av_const int MULH(int a, int b) -{ - int lo, hi; - __asm__ ("smull %0, %1, %2, %3" : "=&r"(lo), "=&r"(hi) : "r"(b), "r"(a)); - return hi; -} -#endif - -static inline av_const int64_t MUL64(int a, int b) -{ - union { uint64_t x; unsigned hl[2]; } x; - __asm__ ("smull %0, %1, %2, %3" - : "=r"(x.hl[0]), "=r"(x.hl[1]) : "r"(a), "r"(b)); - return x.x; -} -#define MUL64 MUL64 - -static inline av_const int64_t MAC64(int64_t d, int a, int b) -{ - union { uint64_t x; unsigned hl[2]; } x = { d }; - __asm__ ("smlal %0, %1, %2, %3" - : "+r"(x.hl[0]), "+r"(x.hl[1]) : "r"(a), "r"(b)); - return x.x; -} -#define MAC64(d, a, b) ((d) = MAC64(d, a, b)) -#define MLS64(d, a, b) MAC64(d, -(a), b) - -#if defined(HAVE_ARMV5TE) - -/* signed 16x16 -> 32 multiply add accumulate */ -# define MAC16(rt, ra, rb) \ - __asm__ ("smlabb %0, %1, %2, %0" : "+r"(rt) : "r"(ra), "r"(rb)); - -/* signed 16x16 -> 32 multiply */ -# define MUL16 MUL16 -static inline av_const MUL16(int ra, int rb) -{ - int rt; - __asm__ ("smulbb %0, %1, %2" : "=r"(rt) : "r"(ra), "r"(rb)); - return rt; -} - -#endif - -#endif /* AVCODEC_ARMV4L_MATHOPS_H */ diff -r c30b92cf446b -r 9281a8a9387a armv4l/mpegvideo_arm.c --- 
a/armv4l/mpegvideo_arm.c Wed Dec 17 00:39:45 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2002 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavcodec/avcodec.h" -#include "libavcodec/dsputil.h" -#include "libavcodec/mpegvideo.h" - -void MPV_common_init_iwmmxt(MpegEncContext *s); -void MPV_common_init_armv5te(MpegEncContext *s); - -void MPV_common_init_armv4l(MpegEncContext *s) -{ - /* IWMMXT support is a superset of armv5te, so - * allow optimized functions for armv5te unless - * a better iwmmxt function exists - */ -#ifdef HAVE_ARMV5TE - MPV_common_init_armv5te(s); -#endif -#ifdef HAVE_IWMMXT - MPV_common_init_iwmmxt(s); -#endif -} diff -r c30b92cf446b -r 9281a8a9387a armv4l/mpegvideo_armv5te.c --- a/armv4l/mpegvideo_armv5te.c Wed Dec 17 00:39:45 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,100 +0,0 @@ -/* - * Optimization of some functions from mpegvideo.c for armv5te - * Copyright (c) 2007 Siarhei Siamashka - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavcodec/avcodec.h" -#include "libavcodec/dsputil.h" -#include "libavcodec/mpegvideo.h" - -void ff_dct_unquantize_h263_armv5te(DCTELEM *block, int qmul, int qadd, int count); - -#ifdef ENABLE_ARM_TESTS -/** - * h263 dequantizer supplementary function, it is performance critical and needs to - * have optimized implementations for each architecture. 
Is also used as a reference - * implementation in regression tests - */ -static inline void dct_unquantize_h263_helper_c(DCTELEM *block, int qmul, int qadd, int count) -{ - int i, level; - for (i = 0; i < count; i++) { - level = block[i]; - if (level) { - if (level < 0) { - level = level * qmul - qadd; - } else { - level = level * qmul + qadd; - } - block[i] = level; - } - } -} -#endif - -static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s, - DCTELEM *block, int n, int qscale) -{ - int level, qmul, qadd; - int nCoeffs; - - assert(s->block_last_index[n]>=0); - - qmul = qscale << 1; - - if (!s->h263_aic) { - if (n < 4) - level = block[0] * s->y_dc_scale; - else - level = block[0] * s->c_dc_scale; - qadd = (qscale - 1) | 1; - }else{ - qadd = 0; - level = block[0]; - } - if(s->ac_pred) - nCoeffs=63; - else - nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; - - ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1); - block[0] = level; -} - -static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s, - DCTELEM *block, int n, int qscale) -{ - int qmul, qadd; - int nCoeffs; - - assert(s->block_last_index[n]>=0); - - qadd = (qscale - 1) | 1; - qmul = qscale << 1; - - nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; - - ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1); -} - -void MPV_common_init_armv5te(MpegEncContext *s) -{ - s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_armv5te; - s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_armv5te; -} diff -r c30b92cf446b -r 9281a8a9387a armv4l/mpegvideo_armv5te_s.S --- a/armv4l/mpegvideo_armv5te_s.S Wed Dec 17 00:39:45 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,117 +0,0 @@ -/* - * Optimization of some functions from mpegvideo.c for armv5te - * Copyright (c) 2007 Siarhei Siamashka - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "asm.S" - -/* - * Special optimized version of dct_unquantize_h263_helper_c, it - * requires the block to be at least 8 bytes aligned, and may process - * more elements than requested. But it is guaranteed to never - * process more than 64 elements provided that count argument is <= 64, - * so it is safe. This function is optimized for a common distribution - * of values for nCoeffs (they are mostly multiple of 8 plus one or - * two extra elements). So this function processes data as 8 elements - * per loop iteration and contains optional 2 elements processing in - * the end. 
- * - * Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770) - */ -function ff_dct_unquantize_h263_armv5te, export=1 - push {r4-r9,lr} - mov ip, #0 - subs r3, r3, #2 - ble 2f - ldrd r4, [r0, #0] -1: - ldrd r6, [r0, #8] - - rsbs r9, ip, r4, asr #16 - addgt r9, r2, #0 - rsblt r9, r2, #0 - smlatbne r9, r4, r1, r9 - - rsbs lr, ip, r5, asr #16 - addgt lr, r2, #0 - rsblt lr, r2, #0 - smlatbne lr, r5, r1, lr - - rsbs r8, ip, r4, asl #16 - addgt r8, r2, #0 - rsblt r8, r2, #0 - smlabbne r4, r4, r1, r8 - - rsbs r8, ip, r5, asl #16 - addgt r8, r2, #0 - rsblt r8, r2, #0 - smlabbne r5, r5, r1, r8 - - strh r4, [r0], #2 - strh r9, [r0], #2 - strh r5, [r0], #2 - strh lr, [r0], #2 - - rsbs r9, ip, r6, asr #16 - addgt r9, r2, #0 - rsblt r9, r2, #0 - smlatbne r9, r6, r1, r9 - - rsbs lr, ip, r7, asr #16 - addgt lr, r2, #0 - rsblt lr, r2, #0 - smlatbne lr, r7, r1, lr - - rsbs r8, ip, r6, asl #16 - addgt r8, r2, #0 - rsblt r8, r2, #0 - smlabbne r6, r6, r1, r8 - - rsbs r8, ip, r7, asl #16 - addgt r8, r2, #0 - rsblt r8, r2, #0 - smlabbne r7, r7, r1, r8 - - strh r6, [r0], #2 - strh r9, [r0], #2 - strh r7, [r0], #2 - strh lr, [r0], #2 - - subs r3, r3, #8 - ldrgtd r4, [r0, #0] /* load data early to avoid load/use pipeline stall */ - bgt 1b - - adds r3, r3, #2 - pople {r4-r9,pc} -2: - ldrsh r9, [r0, #0] - ldrsh lr, [r0, #2] - mov r8, r2 - cmp r9, #0 - rsblt r8, r2, #0 - smlabbne r9, r9, r1, r8 - mov r8, r2 - cmp lr, #0 - rsblt r8, r2, #0 - smlabbne lr, lr, r1, r8 - strh r9, [r0], #2 - strh lr, [r0], #2 - pop {r4-r9,pc} - .endfunc diff -r c30b92cf446b -r 9281a8a9387a armv4l/mpegvideo_iwmmxt.c --- a/armv4l/mpegvideo_iwmmxt.c Wed Dec 17 00:39:45 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,119 +0,0 @@ -/* - * copyright (c) 2004 AGAWA Koji - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavcodec/avcodec.h" -#include "libavcodec/dsputil.h" -#include "libavcodec/mpegvideo.h" - -static void dct_unquantize_h263_intra_iwmmxt(MpegEncContext *s, - DCTELEM *block, int n, int qscale) -{ - int level, qmul, qadd; - int nCoeffs; - DCTELEM *block_orig = block; - - assert(s->block_last_index[n]>=0); - - qmul = qscale << 1; - - if (!s->h263_aic) { - if (n < 4) - level = block[0] * s->y_dc_scale; - else - level = block[0] * s->c_dc_scale; - qadd = (qscale - 1) | 1; - }else{ - qadd = 0; - level = block[0]; - } - if(s->ac_pred) - nCoeffs=63; - else - nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; - - __asm__ volatile ( -/* "movd %1, %%mm6 \n\t" //qmul */ -/* "packssdw %%mm6, %%mm6 \n\t" */ -/* "packssdw %%mm6, %%mm6 \n\t" */ - "tbcsth wr6, %[qmul] \n\t" -/* "movd %2, %%mm5 \n\t" //qadd */ -/* "packssdw %%mm5, %%mm5 \n\t" */ -/* "packssdw %%mm5, %%mm5 \n\t" */ - "tbcsth wr5, %[qadd] \n\t" - "wzero wr7 \n\t" /* "pxor %%mm7, %%mm7 \n\t" */ - "wzero wr4 \n\t" /* "pxor %%mm4, %%mm4 \n\t" */ - "wsubh wr7, wr5, wr7 \n\t" /* "psubw %%mm5, %%mm7 \n\t" */ - "1: \n\t" - "wldrd wr2, [%[block]] \n\t" /* "movq (%0, %3), %%mm0 \n\t" */ - "wldrd wr3, [%[block], #8] \n\t" /* "movq 8(%0, %3), %%mm1 \n\t" */ - "wmulsl wr0, 
wr6, wr2 \n\t" /* "pmullw %%mm6, %%mm0 \n\t" */ - "wmulsl wr1, wr6, wr3 \n\t" /* "pmullw %%mm6, %%mm1 \n\t" */ -/* "movq (%0, %3), %%mm2 \n\t" */ -/* "movq 8(%0, %3), %%mm3 \n\t" */ - "wcmpgtsh wr2, wr4, wr2 \n\t" /* "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 */ - "wcmpgtsh wr3, wr4, wr2 \n\t" /* "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 */ - "wxor wr0, wr2, wr0 \n\t" /* "pxor %%mm2, %%mm0 \n\t" */ - "wxor wr1, wr3, wr1 \n\t" /* "pxor %%mm3, %%mm1 \n\t" */ - "waddh wr0, wr7, wr0 \n\t" /* "paddw %%mm7, %%mm0 \n\t" */ - "waddh wr1, wr7, wr1 \n\t" /* "paddw %%mm7, %%mm1 \n\t" */ - "wxor wr2, wr0, wr2 \n\t" /* "pxor %%mm0, %%mm2 \n\t" */ - "wxor wr3, wr1, wr3 \n\t" /* "pxor %%mm1, %%mm3 \n\t" */ - "wcmpeqh wr0, wr7, wr0 \n\t" /* "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 */ - "wcmpeqh wr1, wr7, wr1 \n\t" /* "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 */ - "wandn wr0, wr2, wr0 \n\t" /* "pandn %%mm2, %%mm0 \n\t" */ - "wandn wr1, wr3, wr1 \n\t" /* "pandn %%mm3, %%mm1 \n\t" */ - "wstrd wr0, [%[block]] \n\t" /* "movq %%mm0, (%0, %3) \n\t" */ - "wstrd wr1, [%[block], #8] \n\t" /* "movq %%mm1, 8(%0, %3) \n\t" */ - "add %[block], %[block], #16 \n\t" /* "addl $16, %3 \n\t" */ - "subs %[i], %[i], #1 \n\t" - "bne 1b \n\t" /* "jng 1b \n\t" */ - :[block]"+r"(block) - :[i]"r"((nCoeffs + 8) / 8), [qmul]"r"(qmul), [qadd]"r"(qadd) - :"memory"); - - block_orig[0] = level; -} - -#if 0 -static void dct_unquantize_h263_inter_iwmmxt(MpegEncContext *s, - DCTELEM *block, int n, int qscale) -{ - int nCoeffs; - - assert(s->block_last_index[n]>=0); - - if(s->ac_pred) - nCoeffs=63; - else - nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; - - ippiQuantInvInter_Compact_H263_16s_I(block, nCoeffs+1, qscale); -} -#endif - -void MPV_common_init_iwmmxt(MpegEncContext *s) -{ - if (!(mm_flags & FF_MM_IWMMXT)) return; - - s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_iwmmxt; -#if 0 - s->dct_unquantize_h263_inter = 
dct_unquantize_h263_inter_iwmmxt; -#endif -} diff -r c30b92cf446b -r 9281a8a9387a armv4l/simple_idct_arm.S --- a/armv4l/simple_idct_arm.S Wed Dec 17 00:39:45 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,486 +0,0 @@ -/* - * simple_idct_arm.S - * Copyright (C) 2002 Frederic 'dilb' Boulay. - * - * Author: Frederic Boulay - * - * The function defined in this file is derived from the simple_idct function - * from the libavcodec library part of the FFmpeg project. - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "asm.S" - -/* useful constants for the algorithm, they are save in __constant_ptr__ at */ -/* the end of the source code.*/ -#define W1 22725 -#define W2 21407 -#define W3 19266 -#define W4 16383 -#define W5 12873 -#define W6 8867 -#define W7 4520 -#define MASK_MSHW 0xFFFF0000 - -/* offsets of the constants in the vector */ -#define offW1 0 -#define offW2 4 -#define offW3 8 -#define offW4 12 -#define offW5 16 -#define offW6 20 -#define offW7 24 -#define offMASK_MSHW 28 - -#define ROW_SHIFT 11 -#define ROW_SHIFT2MSHW (16-11) -#define COL_SHIFT 20 -#define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1) */ -#define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */ - - - .text - -function simple_idct_ARM, export=1 - @@ void simple_idct_ARM(int16_t *block) - @@ save stack for reg needed (take all of them), - @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block - @@ so it must not be overwritten, if it is not saved!! - @@ R12 is another scratch register, so it should not be saved too - @@ save all registers - stmfd sp!, {r4-r11, r14} @ R14 is also called LR - @@ at this point, R0=block, other registers are free. - add r14, r0, #112 @ R14=&block[8*7], better start from the last row, and decrease the value until row=0, i.e. R12=block. 
- add r12, pc, #(__constant_ptr__-.-8) @ R12=__constant_ptr__, the vector containing the constants, probably not necessary to reserve a register for it - @@ add 2 temporary variables in the stack: R0 and R14 - sub sp, sp, #8 @ allow 2 local variables - str r0, [sp, #0] @ save block in sp[0] - @@ stack status - @@ sp+4 free - @@ sp+0 R0 (block) - - - @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free - - -__row_loop: - @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimize ldr accesses (i.e. split 32bits in 2 16bits words), at least it gives more usable registers :) - ldr r1, [r14, #0] @ R1=(int32)(R12)[0]=ROWr32[0] (relative row cast to a 32b pointer) - ldr r2, [r14, #4] @ R2=(int32)(R12)[1]=ROWr32[1] - ldr r3, [r14, #8] @ R3=ROWr32[2] - ldr r4, [r14, #12] @ R4=ROWr32[3] - @@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop), - @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row) - @@ else follow the complete algorithm. 
- @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1], - @@ R3=ROWr32[2], R4=ROWr32[3], R5-R11 free - orr r5, r4, r3 @ R5=R4 | R3 - orr r5, r5, r2 @ R5=R4 | R3 | R2 - orrs r6, r5, r1 @ Test R5 | R1 (the aim is to check if everything is null) - beq __end_row_loop - mov r7, r1, asr #16 @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later) - ldrsh r6, [r14, #0] @ R6=ROWr16[0] - orrs r5, r5, r7 @ R5=R4 | R3 | R2 | R7 - beq __almost_empty_row - -__b_evaluation: - @@ at this point, R0=block (temp), R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3], - @@ R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free, - @@ R12=__const_ptr_, R14=&block[n] - @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3 - - @@ MUL16(b0, W1, row[1]); - @@ MUL16(b1, W3, row[1]); - @@ MUL16(b2, W5, row[1]); - @@ MUL16(b3, W7, row[1]); - @@ MAC16(b0, W3, row[3]); - @@ MAC16(b1, -W7, row[3]); - @@ MAC16(b2, -W1, row[3]); - @@ MAC16(b3, -W5, row[3]); - ldr r8, [r12, #offW1] @ R8=W1 - mov r2, r2, asr #16 @ R2=ROWr16[3] - mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - ldr r9, [r12, #offW3] @ R9=W3 - ldr r10, [r12, #offW5] @ R10=W5 - mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - ldr r11, [r12, #offW7] @ R11=W7 - mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - teq r2, #0 @ if null avoid muls - mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) - rsbne r2, r2, #0 @ R2=-ROWr16[3] - mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) - mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] 
must be the second arg, to have the possibility to save 1 cycle) - mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) - - @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3], - @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7, - @@ R12=__const_ptr_, R14=&block[n] - @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3]; - @@ if (temp != 0) {} - orrs r2, r3, r4 @ R2=ROWr32[2] | ROWr32[3] - beq __end_b_evaluation - - @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3], - @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7, - @@ R12=__const_ptr_, R14=&block[n] - @@ MAC16(b0, W5, row[5]); - @@ MAC16(b2, W7, row[5]); - @@ MAC16(b3, W3, row[5]); - @@ MAC16(b1, -W1, row[5]); - @@ MAC16(b0, W7, row[7]); - @@ MAC16(b2, W3, row[7]); - @@ MAC16(b3, -W1, row[7]); - @@ MAC16(b1, -W5, row[7]); - mov r3, r3, asr #16 @ R3=ROWr16[5] - teq r3, #0 @ if null avoid muls - mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0 - mov r4, r4, asr #16 @ R4=ROWr16[7] - mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2 - mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3 - rsbne r3, r3, #0 @ R3=-ROWr16[5] - mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5]=b1 - @@ R3 is free now - teq r4, #0 @ if null avoid muls - mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0 - mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2 - rsbne r4, r4, #0 @ R4=-ROWr16[7] - mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3 - mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1 - @@ R4 is free now -__end_b_evaluation: - @@ at this point, R0=b0, R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free), - @@ R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), - @@ R12=__const_ptr_, R14=&block[n] - -__a_evaluation: - @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1)); - @@ a1 = a0 + W6 * row[2]; - @@ a2 = a0 - W6 * row[2]; - @@ a3 = a0 - W2 * row[2]; - @@ a0 = a0 + W2 * row[2]; - ldr r9, [r12, #offW4] @ R9=W4 - mul r6, r9, r6 @ 
R6=W4*ROWr16[0] - ldr r10, [r12, #offW6] @ R10=W6 - ldrsh r4, [r14, #4] @ R4=ROWr16[2] (a3 not defined yet) - add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0) - - mul r11, r10, r4 @ R11=W6*ROWr16[2] - ldr r8, [r12, #offW2] @ R8=W2 - sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2) - @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3]; - @@ if (temp != 0) {} - teq r2, #0 - beq __end_bef_a_evaluation - - add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) - mul r11, r8, r4 @ R11=W2*ROWr16[2] - sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) - add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) - - - @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, - @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free), - @@ R12=__const_ptr_, R14=&block[n] - - - @@ a0 += W4*row[4] - @@ a1 -= W4*row[4] - @@ a2 -= W4*row[4] - @@ a3 += W4*row[4] - ldrsh r11, [r14, #8] @ R11=ROWr16[4] - teq r11, #0 @ if null avoid muls - mulne r11, r9, r11 @ R11=W4*ROWr16[4] - @@ R9 is free now - ldrsh r9, [r14, #12] @ R9=ROWr16[6] - addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) - subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) - subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) - addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) - @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead - teq r9, #0 @ if null avoid muls - mulne r11, r10, r9 @ R11=W6*ROWr16[6] - addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) - mulne r10, r8, r9 @ R10=W2*ROWr16[6] - @@ a0 += W6*row[6]; - @@ a3 -= W6*row[6]; - @@ a1 -= W2*row[6]; - @@ a2 += W2*row[6]; - subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3) - subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1) - addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2) - -__end_a_evaluation: - @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, - @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), - @@ R12=__const_ptr_, R14=&block[n] - @@ row[0] = (a0 + b0) >> ROW_SHIFT; - @@ row[1] = (a1 + b1) >> ROW_SHIFT; - @@ row[2] = (a2 + b2) >> ROW_SHIFT; - @@ row[3] = (a3 + b3) >> ROW_SHIFT; - @@ row[4] = (a3 - b3) >> ROW_SHIFT; - 
@@ row[5] = (a2 - b2) >> ROW_SHIFT; - @@ row[6] = (a1 - b1) >> ROW_SHIFT; - @@ row[7] = (a0 - b0) >> ROW_SHIFT; - add r8, r6, r0 @ R8=a0+b0 - add r9, r2, r1 @ R9=a1+b1 - @@ put 2 16 bits half-words in a 32bits word - @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only Little Endian compliant then!!!) - ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000 - and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5) - mvn r11, r10 @ R11= NOT R10= 0x0000FFFF - and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11) - orr r8, r8, r9 - str r8, [r14, #0] - - add r8, r3, r5 @ R8=a2+b2 - add r9, r4, r7 @ R9=a3+b3 - and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5) - and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11) - orr r8, r8, r9 - str r8, [r14, #4] - - sub r8, r4, r7 @ R8=a3-b3 - sub r9, r3, r5 @ R9=a2-b2 - and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5) - and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11) - orr r8, r8, r9 - str r8, [r14, #8] - - sub r8, r2, r1 @ R8=a1-b1 - sub r9, r6, r0 @ R9=a0-b0 - and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5) - and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11) - orr r8, r8, r9 - str r8, [r14, #12] - - bal __end_row_loop - -__almost_empty_row: - @@ the row was empty, except ROWr16[0], now, management of this special case - @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1], - @@ R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1], - @@ R8=0xFFFF (temp), R9-R11 free - mov r8, #0x10000 @ R8=0xFFFF (2 steps needed!) it saves a ldr call (because of delay run). - sub r8, r8, #1 @ R8 is now ready. 
- and r5, r8, r6, lsl #3 @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF - orr r5, r5, r5, lsl #16 @ R5=R5 | (R5<<16) - str r5, [r14, #0] @ R14[0]=ROWr32[0]=R5 - str r5, [r14, #4] @ R14[4]=ROWr32[1]=R5 - str r5, [r14, #8] @ R14[8]=ROWr32[2]=R5 - str r5, [r14, #12] @ R14[12]=ROWr32[3]=R5 - -__end_row_loop: - @@ at this point, R0-R11 (free) - @@ R12=__const_ptr_, R14=&block[n] - ldr r0, [sp, #0] @ R0=block - teq r0, r14 @ compare current &block[8*n] to block, when block is reached, the loop is finished. - sub r14, r14, #16 - bne __row_loop - - - - @@ at this point, R0=block, R1-R11 (free) - @@ R12=__const_ptr_, R14=&block[n] - add r14, r0, #14 @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block. -__col_loop: - -__b_evaluation2: - @@ at this point, R0=block (temp), R1-R11 (free) - @@ R12=__const_ptr_, R14=&block[n] - @@ proceed with b0-b3 first, followed by a0-a3 - @@ MUL16(b0, W1, col[8x1]); - @@ MUL16(b1, W3, col[8x1]); - @@ MUL16(b2, W5, col[8x1]); - @@ MUL16(b3, W7, col[8x1]); - @@ MAC16(b0, W3, col[8x3]); - @@ MAC16(b1, -W7, col[8x3]); - @@ MAC16(b2, -W1, col[8x3]); - @@ MAC16(b3, -W5, col[8x3]); - ldr r8, [r12, #offW1] @ R8=W1 - ldrsh r7, [r14, #16] - mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - ldr r9, [r12, #offW3] @ R9=W3 - ldr r10, [r12, #offW5] @ R10=W5 - mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - ldr r11, [r12, #offW7] @ R11=W7 - mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - ldrsh r2, [r14, #48] - mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - teq r2, #0 @ if 0, then avoid muls - mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) - rsbne r2, r2, #0 @ R2=-ROWr16[3] - mlane r1, 
r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) - mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) - mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) - - @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free), - @@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7, - @@ R12=__const_ptr_, R14=&block[n] - @@ MAC16(b0, W5, col[5x8]); - @@ MAC16(b2, W7, col[5x8]); - @@ MAC16(b3, W3, col[5x8]); - @@ MAC16(b1, -W1, col[5x8]); - @@ MAC16(b0, W7, col[7x8]); - @@ MAC16(b2, W3, col[7x8]); - @@ MAC16(b3, -W1, col[7x8]); - @@ MAC16(b1, -W5, col[7x8]); - ldrsh r3, [r14, #80] @ R3=COLr16[5x8] - teq r3, #0 @ if 0 then avoid muls - mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0 - mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2 - mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3 - rsbne r3, r3, #0 @ R3=-ROWr16[5x8] - ldrsh r4, [r14, #112] @ R4=COLr16[7x8] - mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5x8]=b1 - @@ R3 is free now - teq r4, #0 @ if 0 then avoid muls - mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0 - mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2 - rsbne r4, r4, #0 @ R4=-ROWr16[7x8] - mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3 - mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1 - @@ R4 is free now -__end_b_evaluation2: - @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free), - @@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), - @@ R12=__const_ptr_, R14=&block[n] - -__a_evaluation2: - @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1)); - @@ a1 = a0 + W6 * row[2]; - @@ a2 = a0 - W6 * row[2]; - @@ a3 = a0 - W2 * row[2]; - @@ a0 = a0 + W2 * row[2]; - ldrsh r6, [r14, #0] - ldr r9, [r12, #offW4] @ R9=W4 - mul r6, r9, r6 @ R6=W4*ROWr16[0] - ldr r10, [r12, #offW6] @ R10=W6 - ldrsh r4, [r14, #32] @ R4=ROWr16[2] (a3 not defined yet) - add r6, r6, 
#COL_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0) - mul r11, r10, r4 @ R11=W6*ROWr16[2] - ldr r8, [r12, #offW2] @ R8=W2 - add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) - sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2) - mul r11, r8, r4 @ R11=W2*ROWr16[2] - sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) - add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) - - @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, - @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free), - @@ R12=__const_ptr_, R14=&block[n] - @@ a0 += W4*row[4] - @@ a1 -= W4*row[4] - @@ a2 -= W4*row[4] - @@ a3 += W4*row[4] - ldrsh r11, [r14, #64] @ R11=ROWr16[4] - teq r11, #0 @ if null avoid muls - mulne r11, r9, r11 @ R11=W4*ROWr16[4] - @@ R9 is free now - addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) - subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) - subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) - ldrsh r9, [r14, #96] @ R9=ROWr16[6] - addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) - @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead - teq r9, #0 @ if null avoid muls - mulne r11, r10, r9 @ R11=W6*ROWr16[6] - addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) - mulne r10, r8, r9 @ R10=W2*ROWr16[6] - @@ a0 += W6*row[6]; - @@ a3 -= W6*row[6]; - @@ a1 -= W2*row[6]; - @@ a2 += W2*row[6]; - subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3) - subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1) - addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2) -__end_a_evaluation2: - @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, - @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), - @@ R12=__const_ptr_, R14=&block[n] - @@ col[0 ] = ((a0 + b0) >> COL_SHIFT); - @@ col[8 ] = ((a1 + b1) >> COL_SHIFT); - @@ col[16] = ((a2 + b2) >> COL_SHIFT); - @@ col[24] = ((a3 + b3) >> COL_SHIFT); - @@ col[32] = ((a3 - b3) >> COL_SHIFT); - @@ col[40] = ((a2 - b2) >> COL_SHIFT); - @@ col[48] = ((a1 - b1) >> COL_SHIFT); - @@ col[56] = ((a0 - b0) >> COL_SHIFT); - @@@@@ no optimization here @@@@@ - add r8, r6, r0 @ R8=a0+b0 - add r9, r2, r1 @ R9=a1+b1 - mov r8, r8, asr 
#COL_SHIFT - mov r9, r9, asr #COL_SHIFT - strh r8, [r14, #0] - strh r9, [r14, #16] - add r8, r3, r5 @ R8=a2+b2 - add r9, r4, r7 @ R9=a3+b3 - mov r8, r8, asr #COL_SHIFT - mov r9, r9, asr #COL_SHIFT - strh r8, [r14, #32] - strh r9, [r14, #48] - sub r8, r4, r7 @ R8=a3-b3 - sub r9, r3, r5 @ R9=a2-b2 - mov r8, r8, asr #COL_SHIFT - mov r9, r9, asr #COL_SHIFT - strh r8, [r14, #64] - strh r9, [r14, #80] - sub r8, r2, r1 @ R8=a1-b1 - sub r9, r6, r0 @ R9=a0-b0 - mov r8, r8, asr #COL_SHIFT - mov r9, r9, asr #COL_SHIFT - strh r8, [r14, #96] - strh r9, [r14, #112] - -__end_col_loop: - @@ at this point, R0-R11 (free) - @@ R12=__const_ptr_, R14=&block[n] - ldr r0, [sp, #0] @ R0=block - teq r0, r14 @ compare current &block[n] to block, when block is reached, the loop is finished. - sub r14, r14, #2 - bne __col_loop - - - - -__end_simple_idct_ARM: - @@ restore registers to previous status! - add sp, sp, #8 @@ the local variables! - ldmfd sp!, {r4-r11, r15} @@ update PC with LR content. - - - -@@ kind of sub-function, here not to overload the common case. -__end_bef_a_evaluation: - add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) - mul r11, r8, r4 @ R11=W2*ROWr16[2] - sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) - add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) - bal __end_a_evaluation - - -__constant_ptr__: @@ see #defines at the beginning of the source code for values. - .align - .word W1 - .word W2 - .word W3 - .word W4 - .word W5 - .word W6 - .word W7 - .word MASK_MSHW diff -r c30b92cf446b -r 9281a8a9387a armv4l/simple_idct_armv5te.S --- a/armv4l/simple_idct_armv5te.S Wed Dec 17 00:39:45 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,703 +0,0 @@ -/* - * Simple IDCT - * - * Copyright (c) 2001 Michael Niedermayer - * Copyright (c) 2006 Mans Rullgard - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "asm.S" - -#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define ROW_SHIFT 11 -#define COL_SHIFT 20 - -#define W13 (W1 | (W3 << 16)) -#define W26 (W2 | (W6 << 16)) -#define W57 (W5 | (W7 << 16)) - - .text - .align -w13: .long W13 -w26: .long W26 -w57: .long W57 - -function idct_row_armv5te - str lr, [sp, #-4]! 
- - ldrd v1, [a1, #8] - ldrd a3, [a1] /* a3 = row[1:0], a4 = row[3:2] */ - orrs v1, v1, v2 - cmpeq v1, a4 - cmpeq v1, a3, lsr #16 - beq row_dc_only - - mov v1, #(1<<(ROW_SHIFT-1)) - mov ip, #16384 - sub ip, ip, #1 /* ip = W4 */ - smlabb v1, ip, a3, v1 /* v1 = W4*row[0]+(1<<(RS-1)) */ - ldr ip, [pc, #(w26-.-8)] /* ip = W2 | (W6 << 16) */ - smultb a2, ip, a4 - smulbb lr, ip, a4 - add v2, v1, a2 - sub v3, v1, a2 - sub v4, v1, lr - add v1, v1, lr - - ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */ - ldr lr, [pc, #(w57-.-8)] /* lr = W5 | (W7 << 16) */ - smulbt v5, ip, a3 - smultt v6, lr, a4 - smlatt v5, ip, a4, v5 - smultt a2, ip, a3 - smulbt v7, lr, a3 - sub v6, v6, a2 - smulbt a2, ip, a4 - smultt fp, lr, a3 - sub v7, v7, a2 - smulbt a2, lr, a4 - ldrd a3, [a1, #8] /* a3=row[5:4] a4=row[7:6] */ - sub fp, fp, a2 - - orrs a2, a3, a4 - beq 1f - - smlabt v5, lr, a3, v5 - smlabt v6, ip, a3, v6 - smlatt v5, lr, a4, v5 - smlabt v6, lr, a4, v6 - smlatt v7, lr, a3, v7 - smlatt fp, ip, a3, fp - smulbt a2, ip, a4 - smlatt v7, ip, a4, v7 - sub fp, fp, a2 - - ldr ip, [pc, #(w26-.-8)] /* ip = W2 | (W6 << 16) */ - mov a2, #16384 - sub a2, a2, #1 /* a2 = W4 */ - smulbb a2, a2, a3 /* a2 = W4*row[4] */ - smultb lr, ip, a4 /* lr = W6*row[6] */ - add v1, v1, a2 /* v1 += W4*row[4] */ - add v1, v1, lr /* v1 += W6*row[6] */ - add v4, v4, a2 /* v4 += W4*row[4] */ - sub v4, v4, lr /* v4 -= W6*row[6] */ - smulbb lr, ip, a4 /* lr = W2*row[6] */ - sub v2, v2, a2 /* v2 -= W4*row[4] */ - sub v2, v2, lr /* v2 -= W2*row[6] */ - sub v3, v3, a2 /* v3 -= W4*row[4] */ - add v3, v3, lr /* v3 += W2*row[6] */ - -1: add a2, v1, v5 - mov a3, a2, lsr #11 - bic a3, a3, #0x1f0000 - sub a2, v2, v6 - mov a2, a2, lsr #11 - add a3, a3, a2, lsl #16 - add a2, v3, v7 - mov a4, a2, lsr #11 - bic a4, a4, #0x1f0000 - add a2, v4, fp - mov a2, a2, lsr #11 - add a4, a4, a2, lsl #16 - strd a3, [a1] - - sub a2, v4, fp - mov a3, a2, lsr #11 - bic a3, a3, #0x1f0000 - sub a2, v3, v7 - mov a2, a2, lsr #11 - add a3, a3, a2, lsl 
#16 - add a2, v2, v6 - mov a4, a2, lsr #11 - bic a4, a4, #0x1f0000 - sub a2, v1, v5 - mov a2, a2, lsr #11 - add a4, a4, a2, lsl #16 - strd a3, [a1, #8] - - ldr pc, [sp], #4 - -row_dc_only: - orr a3, a3, a3, lsl #16 - bic a3, a3, #0xe000 - mov a3, a3, lsl #3 - mov a4, a3 - strd a3, [a1] - strd a3, [a1, #8] - - ldr pc, [sp], #4 - .endfunc - - .macro idct_col - ldr a4, [a1] /* a4 = col[1:0] */ - mov ip, #16384 - sub ip, ip, #1 /* ip = W4 */ -#if 0 - mov v1, #(1<<(COL_SHIFT-1)) - smlabt v2, ip, a4, v1 /* v2 = W4*col[1] + (1<<(COL_SHIFT-1)) */ - smlabb v1, ip, a4, v1 /* v1 = W4*col[0] + (1<<(COL_SHIFT-1)) */ - ldr a4, [a1, #(16*4)] -#else - mov v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */ - add v2, v1, a4, asr #16 - rsb v2, v2, v2, lsl #14 - mov a4, a4, lsl #16 - add v1, v1, a4, asr #16 - ldr a4, [a1, #(16*4)] - rsb v1, v1, v1, lsl #14 -#endif - - smulbb lr, ip, a4 - smulbt a3, ip, a4 - sub v3, v1, lr - sub v5, v1, lr - add v7, v1, lr - add v1, v1, lr - sub v4, v2, a3 - sub v6, v2, a3 - add fp, v2, a3 - ldr ip, [pc, #(w26-.-8)] - ldr a4, [a1, #(16*2)] - add v2, v2, a3 - - smulbb lr, ip, a4 - smultb a3, ip, a4 - add v1, v1, lr - sub v7, v7, lr - add v3, v3, a3 - sub v5, v5, a3 - smulbt lr, ip, a4 - smultt a3, ip, a4 - add v2, v2, lr - sub fp, fp, lr - add v4, v4, a3 - ldr a4, [a1, #(16*6)] - sub v6, v6, a3 - - smultb lr, ip, a4 - smulbb a3, ip, a4 - add v1, v1, lr - sub v7, v7, lr - sub v3, v3, a3 - add v5, v5, a3 - smultt lr, ip, a4 - smulbt a3, ip, a4 - add v2, v2, lr - sub fp, fp, lr - sub v4, v4, a3 - add v6, v6, a3 - - stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp} - - ldr ip, [pc, #(w13-.-8)] - ldr a4, [a1, #(16*1)] - ldr lr, [pc, #(w57-.-8)] - smulbb v1, ip, a4 - smultb v3, ip, a4 - smulbb v5, lr, a4 - smultb v7, lr, a4 - smulbt v2, ip, a4 - smultt v4, ip, a4 - smulbt v6, lr, a4 - smultt fp, lr, a4 - rsb v4, v4, #0 - ldr a4, [a1, #(16*3)] - rsb v3, v3, #0 - - smlatb v1, ip, a4, v1 - smlatb v3, lr, a4, v3 - smulbb a3, ip, a4 - smulbb a2, lr, a4 - sub 
v5, v5, a3 - sub v7, v7, a2 - smlatt v2, ip, a4, v2 - smlatt v4, lr, a4, v4 - smulbt a3, ip, a4 - smulbt a2, lr, a4 - sub v6, v6, a3 - ldr a4, [a1, #(16*5)] - sub fp, fp, a2 - - smlabb v1, lr, a4, v1 - smlabb v3, ip, a4, v3 - smlatb v5, lr, a4, v5 - smlatb v7, ip, a4, v7 - smlabt v2, lr, a4, v2 - smlabt v4, ip, a4, v4 - smlatt v6, lr, a4, v6 - ldr a3, [a1, #(16*7)] - smlatt fp, ip, a4, fp - - smlatb v1, lr, a3, v1 - smlabb v3, lr, a3, v3 - smlatb v5, ip, a3, v5 - smulbb a4, ip, a3 - smlatt v2, lr, a3, v2 - sub v7, v7, a4 - smlabt v4, lr, a3, v4 - smulbt a4, ip, a3 - smlatt v6, ip, a3, v6 - sub fp, fp, a4 - .endm - -function idct_col_armv5te - str lr, [sp, #-4]! - - idct_col - - ldmfd sp!, {a3, a4} - adds a2, a3, v1 - mov a2, a2, lsr #20 - orrmi a2, a2, #0xf000 - add ip, a4, v2 - mov ip, ip, asr #20 - orr a2, a2, ip, lsl #16 - str a2, [a1] - subs a3, a3, v1 - mov a2, a3, lsr #20 - orrmi a2, a2, #0xf000 - sub a4, a4, v2 - mov a4, a4, asr #20 - orr a2, a2, a4, lsl #16 - ldmfd sp!, {a3, a4} - str a2, [a1, #(16*7)] - - subs a2, a3, v3 - mov a2, a2, lsr #20 - orrmi a2, a2, #0xf000 - sub ip, a4, v4 - mov ip, ip, asr #20 - orr a2, a2, ip, lsl #16 - str a2, [a1, #(16*1)] - adds a3, a3, v3 - mov a2, a3, lsr #20 - orrmi a2, a2, #0xf000 - add a4, a4, v4 - mov a4, a4, asr #20 - orr a2, a2, a4, lsl #16 - ldmfd sp!, {a3, a4} - str a2, [a1, #(16*6)] - - adds a2, a3, v5 - mov a2, a2, lsr #20 - orrmi a2, a2, #0xf000 - add ip, a4, v6 - mov ip, ip, asr #20 - orr a2, a2, ip, lsl #16 - str a2, [a1, #(16*2)] - subs a3, a3, v5 - mov a2, a3, lsr #20 - orrmi a2, a2, #0xf000 - sub a4, a4, v6 - mov a4, a4, asr #20 - orr a2, a2, a4, lsl #16 - ldmfd sp!, {a3, a4} - str a2, [a1, #(16*5)] - - adds a2, a3, v7 - mov a2, a2, lsr #20 - orrmi a2, a2, #0xf000 - add ip, a4, fp - mov ip, ip, asr #20 - orr a2, a2, ip, lsl #16 - str a2, [a1, #(16*3)] - subs a3, a3, v7 - mov a2, a3, lsr #20 - orrmi a2, a2, #0xf000 - sub a4, a4, fp - mov a4, a4, asr #20 - orr a2, a2, a4, lsl #16 - str a2, [a1, #(16*4)] - - 
ldr pc, [sp], #4 - .endfunc - -function idct_col_put_armv5te - str lr, [sp, #-4]! - - idct_col - - ldmfd sp!, {a3, a4} - ldr lr, [sp, #32] - add a2, a3, v1 - movs a2, a2, asr #20 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - add ip, a4, v2 - movs ip, ip, asr #20 - movmi ip, #0 - cmp ip, #255 - movgt ip, #255 - orr a2, a2, ip, lsl #8 - sub a3, a3, v1 - movs a3, a3, asr #20 - movmi a3, #0 - cmp a3, #255 - movgt a3, #255 - sub a4, a4, v2 - movs a4, a4, asr #20 - movmi a4, #0 - cmp a4, #255 - ldr v1, [sp, #28] - movgt a4, #255 - strh a2, [v1] - add a2, v1, #2 - str a2, [sp, #28] - orr a2, a3, a4, lsl #8 - rsb v2, lr, lr, lsl #3 - ldmfd sp!, {a3, a4} - strh a2, [v2, v1]! - - sub a2, a3, v3 - movs a2, a2, asr #20 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - sub ip, a4, v4 - movs ip, ip, asr #20 - movmi ip, #0 - cmp ip, #255 - movgt ip, #255 - orr a2, a2, ip, lsl #8 - strh a2, [v1, lr]! - add a3, a3, v3 - movs a2, a3, asr #20 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - add a4, a4, v4 - movs a4, a4, asr #20 - movmi a4, #0 - cmp a4, #255 - movgt a4, #255 - orr a2, a2, a4, lsl #8 - ldmfd sp!, {a3, a4} - strh a2, [v2, -lr]! - - add a2, a3, v5 - movs a2, a2, asr #20 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - add ip, a4, v6 - movs ip, ip, asr #20 - movmi ip, #0 - cmp ip, #255 - movgt ip, #255 - orr a2, a2, ip, lsl #8 - strh a2, [v1, lr]! - sub a3, a3, v5 - movs a2, a3, asr #20 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - sub a4, a4, v6 - movs a4, a4, asr #20 - movmi a4, #0 - cmp a4, #255 - movgt a4, #255 - orr a2, a2, a4, lsl #8 - ldmfd sp!, {a3, a4} - strh a2, [v2, -lr]! 
- - add a2, a3, v7 - movs a2, a2, asr #20 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - add ip, a4, fp - movs ip, ip, asr #20 - movmi ip, #0 - cmp ip, #255 - movgt ip, #255 - orr a2, a2, ip, lsl #8 - strh a2, [v1, lr] - sub a3, a3, v7 - movs a2, a3, asr #20 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - sub a4, a4, fp - movs a4, a4, asr #20 - movmi a4, #0 - cmp a4, #255 - movgt a4, #255 - orr a2, a2, a4, lsl #8 - strh a2, [v2, -lr] - - ldr pc, [sp], #4 - .endfunc - -function idct_col_add_armv5te - str lr, [sp, #-4]! - - idct_col - - ldr lr, [sp, #36] - - ldmfd sp!, {a3, a4} - ldrh ip, [lr] - add a2, a3, v1 - mov a2, a2, asr #20 - sub a3, a3, v1 - and v1, ip, #255 - adds a2, a2, v1 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - add v1, a4, v2 - mov v1, v1, asr #20 - adds v1, v1, ip, lsr #8 - movmi v1, #0 - cmp v1, #255 - movgt v1, #255 - orr a2, a2, v1, lsl #8 - ldr v1, [sp, #32] - sub a4, a4, v2 - rsb v2, v1, v1, lsl #3 - ldrh ip, [v2, lr]! - strh a2, [lr] - mov a3, a3, asr #20 - and a2, ip, #255 - adds a3, a3, a2 - movmi a3, #0 - cmp a3, #255 - movgt a3, #255 - mov a4, a4, asr #20 - adds a4, a4, ip, lsr #8 - movmi a4, #0 - cmp a4, #255 - movgt a4, #255 - add a2, lr, #2 - str a2, [sp, #28] - orr a2, a3, a4, lsl #8 - strh a2, [v2] - - ldmfd sp!, {a3, a4} - ldrh ip, [lr, v1]! - sub a2, a3, v3 - mov a2, a2, asr #20 - add a3, a3, v3 - and v3, ip, #255 - adds a2, a2, v3 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - sub v3, a4, v4 - mov v3, v3, asr #20 - adds v3, v3, ip, lsr #8 - movmi v3, #0 - cmp v3, #255 - movgt v3, #255 - orr a2, a2, v3, lsl #8 - add a4, a4, v4 - ldrh ip, [v2, -v1]! - strh a2, [lr] - mov a3, a3, asr #20 - and a2, ip, #255 - adds a3, a3, a2 - movmi a3, #0 - cmp a3, #255 - movgt a3, #255 - mov a4, a4, asr #20 - adds a4, a4, ip, lsr #8 - movmi a4, #0 - cmp a4, #255 - movgt a4, #255 - orr a2, a3, a4, lsl #8 - strh a2, [v2] - - ldmfd sp!, {a3, a4} - ldrh ip, [lr, v1]! 
- add a2, a3, v5 - mov a2, a2, asr #20 - sub a3, a3, v5 - and v3, ip, #255 - adds a2, a2, v3 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - add v3, a4, v6 - mov v3, v3, asr #20 - adds v3, v3, ip, lsr #8 - movmi v3, #0 - cmp v3, #255 - movgt v3, #255 - orr a2, a2, v3, lsl #8 - sub a4, a4, v6 - ldrh ip, [v2, -v1]! - strh a2, [lr] - mov a3, a3, asr #20 - and a2, ip, #255 - adds a3, a3, a2 - movmi a3, #0 - cmp a3, #255 - movgt a3, #255 - mov a4, a4, asr #20 - adds a4, a4, ip, lsr #8 - movmi a4, #0 - cmp a4, #255 - movgt a4, #255 - orr a2, a3, a4, lsl #8 - strh a2, [v2] - - ldmfd sp!, {a3, a4} - ldrh ip, [lr, v1]! - add a2, a3, v7 - mov a2, a2, asr #20 - sub a3, a3, v7 - and v3, ip, #255 - adds a2, a2, v3 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - add v3, a4, fp - mov v3, v3, asr #20 - adds v3, v3, ip, lsr #8 - movmi v3, #0 - cmp v3, #255 - movgt v3, #255 - orr a2, a2, v3, lsl #8 - sub a4, a4, fp - ldrh ip, [v2, -v1]! - strh a2, [lr] - mov a3, a3, asr #20 - and a2, ip, #255 - adds a3, a3, a2 - movmi a3, #0 - cmp a3, #255 - movgt a3, #255 - mov a4, a4, asr #20 - adds a4, a4, ip, lsr #8 - movmi a4, #0 - cmp a4, #255 - movgt a4, #255 - orr a2, a3, a4, lsl #8 - strh a2, [v2] - - ldr pc, [sp], #4 - .endfunc - -function simple_idct_armv5te, export=1 - stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr} - - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - - sub a1, a1, #(16*7) - - bl idct_col_armv5te - add a1, a1, #4 - bl idct_col_armv5te - add a1, a1, #4 - bl idct_col_armv5te - add a1, a1, #4 - bl idct_col_armv5te - - ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} - .endfunc - -function simple_idct_add_armv5te, export=1 - stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} - - mov a1, a3 - - bl idct_row_armv5te - add 
a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - - sub a1, a1, #(16*7) - - bl idct_col_add_armv5te - add a1, a1, #4 - bl idct_col_add_armv5te - add a1, a1, #4 - bl idct_col_add_armv5te - add a1, a1, #4 - bl idct_col_add_armv5te - - add sp, sp, #8 - ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} - .endfunc - -function simple_idct_put_armv5te, export=1 - stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} - - mov a1, a3 - - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - - sub a1, a1, #(16*7) - - bl idct_col_put_armv5te - add a1, a1, #4 - bl idct_col_put_armv5te - add a1, a1, #4 - bl idct_col_put_armv5te - add a1, a1, #4 - bl idct_col_put_armv5te - - add sp, sp, #8 - ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} - .endfunc diff -r c30b92cf446b -r 9281a8a9387a armv4l/simple_idct_armv6.S --- a/armv4l/simple_idct_armv6.S Wed Dec 17 00:39:45 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,433 +0,0 @@ -/* - * Simple IDCT - * - * Copyright (c) 2001 Michael Niedermayer - * Copyright (c) 2007 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "asm.S" - -#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define ROW_SHIFT 11 -#define COL_SHIFT 20 - -#define W13 (W1 | (W3 << 16)) -#define W26 (W2 | (W6 << 16)) -#define W42 (W4 | (W2 << 16)) -#define W42n (-W4&0xffff | (-W2 << 16)) -#define W46 (W4 | (W6 << 16)) -#define W57 (W5 | (W7 << 16)) - - .text - .align -w13: .long W13 -w26: .long W26 -w42: .long W42 -w42n: .long W42n -w46: .long W46 -w57: .long W57 - -/* - Compute partial IDCT of single row. 
- shift = left-shift amount - a1 = source address - a3 = row[2,0] <= 2 cycles - a4 = row[3,1] - ip = w42 <= 2 cycles - - Output in registers v1--v8 -*/ - .macro idct_row shift - ldr lr, [pc, #(w46-.-8)] /* lr = W4 | (W6 << 16) */ - mov a2, #(1<<(\shift-1)) - smlad v1, a3, ip, a2 - smlsd v4, a3, ip, a2 - ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */ - ldr v7, [pc, #(w57-.-8)] /* v7 = W5 | (W7 << 16) */ - smlad v2, a3, lr, a2 - smlsd v3, a3, lr, a2 - - smuad v5, a4, ip /* v5 = B0 = W1*row[1] + W3*row[3] */ - smusdx fp, a4, v7 /* fp = B3 = W7*row[1] - W5*row[3] */ - ldr lr, [a1, #12] /* lr = row[7,5] */ - pkhtb a3, ip, v7, asr #16 /* a4 = W7 | (W3 << 16) */ - pkhbt a2, ip, v7, lsl #16 /* a2 = W1 | (W5 << 16) */ - smusdx v6, a3, a4 /* v6 = -B1 = W7*row[3] - W3*row[1] */ - smlad v5, lr, v7, v5 /* B0 += W5*row[5] + W7*row[7] */ - smusdx v7, a4, a2 /* v7 = B2 = W5*row[1] - W1*row[3] */ - - ldr a4, [pc, #(w42n-.-8)] /* a4 = -W4 | (-W2 << 16) */ - smlad v7, lr, a3, v7 /* B2 += W7*row[5] + W3*row[7] */ - ldr a3, [a1, #4] /* a3 = row[6,4] */ - smlsdx fp, lr, ip, fp /* B3 += W3*row[5] - W1*row[7] */ - ldr ip, [pc, #(w46-.-8)] /* ip = W4 | (W6 << 16) */ - smlad v6, lr, a2, v6 /* B1 -= W1*row[5] + W5*row[7] */ - - smlad v2, a3, a4, v2 /* A1 += -W4*row[4] - W2*row[6] */ - smlsd v3, a3, a4, v3 /* A2 += -W4*row[4] + W2*row[6] */ - smlad v1, a3, ip, v1 /* A0 += W4*row[4] + W6*row[6] */ - smlsd v4, a3, ip, v4 /* A3 += W4*row[4] - W6*row[6] */ - .endm - -/* - Compute partial IDCT of half row. 
- shift = left-shift amount - a3 = row[2,0] - a4 = row[3,1] - ip = w42 - - Output in registers v1--v8 -*/ - .macro idct_row4 shift - ldr lr, [pc, #(w46-.-8)] /* lr = W4 | (W6 << 16) */ - ldr v7, [pc, #(w57-.-8)] /* v7 = W5 | (W7 << 16) */ - mov a2, #(1<<(\shift-1)) - smlad v1, a3, ip, a2 - smlsd v4, a3, ip, a2 - ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */ - smlad v2, a3, lr, a2 - smlsd v3, a3, lr, a2 - smusdx fp, a4, v7 /* fp = B3 = W7*row[1] - W5*row[3] */ - smuad v5, a4, ip /* v5 = B0 = W1*row[1] + W3*row[3] */ - pkhtb a3, ip, v7, asr #16 /* a4 = W7 | (W3 << 16) */ - pkhbt a2, ip, v7, lsl #16 /* a2 = W1 | (W5 << 16) */ - smusdx v6, a3, a4 /* v6 = -B1 = W7*row[3] - W3*row[1] */ - smusdx v7, a4, a2 /* v7 = B2 = W5*row[1] - W1*row[3] */ - .endm - -/* - Compute final part of IDCT single row without shift. - Input in registers v1--v8 - Output in registers ip, v1--v3, lr, v5--v7 -*/ - .macro idct_finish - add ip, v1, v5 /* a2 = A0 + B0 */ - sub lr, v1, v5 /* a3 = A0 - B0 */ - sub v1, v2, v6 /* a3 = A1 + B1 */ - add v5, v2, v6 /* a3 = A1 - B1 */ - add v2, v3, v7 /* a2 = A2 + B2 */ - sub v6, v3, v7 /* a2 = A2 - B2 */ - add v3, v4, fp /* a3 = A3 + B3 */ - sub v7, v4, fp /* a3 = A3 - B3 */ - .endm - -/* - Compute final part of IDCT single row. - shift = right-shift amount - Input/output in registers v1--v8 -*/ - .macro idct_finish_shift shift - add a4, v1, v5 /* a4 = A0 + B0 */ - sub a3, v1, v5 /* a3 = A0 - B0 */ - mov v1, a4, asr #\shift - mov v5, a3, asr #\shift - - sub a4, v2, v6 /* a4 = A1 + B1 */ - add a3, v2, v6 /* a3 = A1 - B1 */ - mov v2, a4, asr #\shift - mov v6, a3, asr #\shift - - add a4, v3, v7 /* a4 = A2 + B2 */ - sub a3, v3, v7 /* a3 = A2 - B2 */ - mov v3, a4, asr #\shift - mov v7, a3, asr #\shift - - add a4, v4, fp /* a4 = A3 + B3 */ - sub a3, v4, fp /* a3 = A3 - B3 */ - mov v4, a4, asr #\shift - mov fp, a3, asr #\shift - .endm - -/* - Compute final part of IDCT single row, saturating results at 8 bits. 
- shift = right-shift amount - Input/output in registers v1--v8 -*/ - .macro idct_finish_shift_sat shift - add a4, v1, v5 /* a4 = A0 + B0 */ - sub ip, v1, v5 /* ip = A0 - B0 */ - usat v1, #8, a4, asr #\shift - usat v5, #8, ip, asr #\shift - - sub a4, v2, v6 /* a4 = A1 + B1 */ - add ip, v2, v6 /* ip = A1 - B1 */ - usat v2, #8, a4, asr #\shift - usat v6, #8, ip, asr #\shift - - add a4, v3, v7 /* a4 = A2 + B2 */ - sub ip, v3, v7 /* ip = A2 - B2 */ - usat v3, #8, a4, asr #\shift - usat v7, #8, ip, asr #\shift - - add a4, v4, fp /* a4 = A3 + B3 */ - sub ip, v4, fp /* ip = A3 - B3 */ - usat v4, #8, a4, asr #\shift - usat fp, #8, ip, asr #\shift - .endm - -/* - Compute IDCT of single row, storing as column. - a1 = source - a2 = dest -*/ -function idct_row_armv6 - str lr, [sp, #-4]! - - ldr lr, [a1, #12] /* lr = row[7,5] */ - ldr ip, [a1, #4] /* ip = row[6,4] */ - ldr a4, [a1, #8] /* a4 = row[3,1] */ - ldr a3, [a1] /* a3 = row[2,0] */ - orrs lr, lr, ip - cmpeq lr, a4 - cmpeq lr, a3, lsr #16 - beq 1f - str a2, [sp, #-4]! - ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ - cmp lr, #0 - beq 2f - - idct_row ROW_SHIFT - b 3f - -2: idct_row4 ROW_SHIFT - -3: ldr a2, [sp], #4 - idct_finish_shift ROW_SHIFT - - strh v1, [a2] - strh v2, [a2, #(16*2)] - strh v3, [a2, #(16*4)] - strh v4, [a2, #(16*6)] - strh fp, [a2, #(16*1)] - strh v7, [a2, #(16*3)] - strh v6, [a2, #(16*5)] - strh v5, [a2, #(16*7)] - - ldr pc, [sp], #4 - -1: mov a3, a3, lsl #3 - strh a3, [a2] - strh a3, [a2, #(16*2)] - strh a3, [a2, #(16*4)] - strh a3, [a2, #(16*6)] - strh a3, [a2, #(16*1)] - strh a3, [a2, #(16*3)] - strh a3, [a2, #(16*5)] - strh a3, [a2, #(16*7)] - ldr pc, [sp], #4 - .endfunc - -/* - Compute IDCT of single column, read as row. 
- a1 = source - a2 = dest -*/ -function idct_col_armv6 - stmfd sp!, {a2, lr} - - ldr a3, [a1] /* a3 = row[2,0] */ - ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ - ldr a4, [a1, #8] /* a4 = row[3,1] */ - idct_row COL_SHIFT - ldr a2, [sp], #4 - idct_finish_shift COL_SHIFT - - strh v1, [a2] - strh v2, [a2, #(16*1)] - strh v3, [a2, #(16*2)] - strh v4, [a2, #(16*3)] - strh fp, [a2, #(16*4)] - strh v7, [a2, #(16*5)] - strh v6, [a2, #(16*6)] - strh v5, [a2, #(16*7)] - - ldr pc, [sp], #4 - .endfunc - -/* - Compute IDCT of single column, read as row, store saturated 8-bit. - a1 = source - a2 = dest - a3 = line size -*/ -function idct_col_put_armv6 - stmfd sp!, {a2, a3, lr} - - ldr a3, [a1] /* a3 = row[2,0] */ - ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ - ldr a4, [a1, #8] /* a4 = row[3,1] */ - idct_row COL_SHIFT - ldmfd sp!, {a2, a3} - idct_finish_shift_sat COL_SHIFT - - strb v1, [a2], a3 - strb v2, [a2], a3 - strb v3, [a2], a3 - strb v4, [a2], a3 - strb fp, [a2], a3 - strb v7, [a2], a3 - strb v6, [a2], a3 - strb v5, [a2], a3 - - sub a2, a2, a3, lsl #3 - - ldr pc, [sp], #4 - .endfunc - -/* - Compute IDCT of single column, read as row, add/store saturated 8-bit. 
- a1 = source - a2 = dest - a3 = line size -*/ -function idct_col_add_armv6 - stmfd sp!, {a2, a3, lr} - - ldr a3, [a1] /* a3 = row[2,0] */ - ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */ - ldr a4, [a1, #8] /* a4 = row[3,1] */ - idct_row COL_SHIFT - ldmfd sp!, {a2, a3} - idct_finish - - ldrb a4, [a2] - ldrb v4, [a2, a3] - ldrb fp, [a2, a3, lsl #2] - add ip, a4, ip, asr #COL_SHIFT - usat ip, #8, ip - add v1, v4, v1, asr #COL_SHIFT - strb ip, [a2], a3 - ldrb ip, [a2, a3] - usat v1, #8, v1 - ldrb fp, [a2, a3, lsl #2] - add v2, ip, v2, asr #COL_SHIFT - usat v2, #8, v2 - strb v1, [a2], a3 - ldrb a4, [a2, a3] - ldrb ip, [a2, a3, lsl #2] - strb v2, [a2], a3 - ldrb v4, [a2, a3] - ldrb v1, [a2, a3, lsl #2] - add v3, a4, v3, asr #COL_SHIFT - usat v3, #8, v3 - add v7, v4, v7, asr #COL_SHIFT - usat v7, #8, v7 - add v6, fp, v6, asr #COL_SHIFT - usat v6, #8, v6 - add v5, ip, v5, asr #COL_SHIFT - usat v5, #8, v5 - add lr, v1, lr, asr #COL_SHIFT - usat lr, #8, lr - strb v3, [a2], a3 - strb v7, [a2], a3 - strb v6, [a2], a3 - strb v5, [a2], a3 - strb lr, [a2], a3 - - sub a2, a2, a3, lsl #3 - - ldr pc, [sp], #4 - .endfunc - -/* - Compute 8 IDCT row transforms. 
- func = IDCT row->col function - width = width of columns in bytes -*/ - .macro idct_rows func width - bl \func - add a1, a1, #(16*2) - add a2, a2, #\width - bl \func - add a1, a1, #(16*2) - add a2, a2, #\width - bl \func - add a1, a1, #(16*2) - add a2, a2, #\width - bl \func - sub a1, a1, #(16*5) - add a2, a2, #\width - bl \func - add a1, a1, #(16*2) - add a2, a2, #\width - bl \func - add a1, a1, #(16*2) - add a2, a2, #\width - bl \func - add a1, a1, #(16*2) - add a2, a2, #\width - bl \func - - sub a1, a1, #(16*7) - .endm - -/* void ff_simple_idct_armv6(DCTELEM *data); */ -function ff_simple_idct_armv6, export=1 - stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr} - sub sp, sp, #128 - - mov a2, sp - idct_rows idct_row_armv6, 2 - mov a2, a1 - mov a1, sp - idct_rows idct_col_armv6, 2 - - add sp, sp, #128 - ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} - .endfunc - -/* ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data); */ -function ff_simple_idct_add_armv6, export=1 - stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} - sub sp, sp, #128 - - mov a1, a3 - mov a2, sp - idct_rows idct_row_armv6, 2 - mov a1, sp - ldr a2, [sp, #128] - ldr a3, [sp, #(128+4)] - idct_rows idct_col_add_armv6, 1 - - add sp, sp, #(128+8) - ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} - .endfunc - -/* ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data); */ -function ff_simple_idct_put_armv6, export=1 - stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} - sub sp, sp, #128 - - mov a1, a3 - mov a2, sp - idct_rows idct_row_armv6, 2 - mov a1, sp - ldr a2, [sp, #128] - ldr a3, [sp, #(128+4)] - idct_rows idct_col_put_armv6, 1 - - add sp, sp, #(128+8) - ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} - .endfunc diff -r c30b92cf446b -r 9281a8a9387a armv4l/simple_idct_neon.S --- a/armv4l/simple_idct_neon.S Wed Dec 17 00:39:45 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,362 +0,0 @@ -/* - * ARM NEON IDCT - * - * Copyright (c) 2008 Mans 
Rullgard - * - * Based on Simple IDCT - * Copyright (c) 2001 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "asm.S" - -#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W4c ((1<<(COL_SHIFT-1))/W4) -#define ROW_SHIFT 11 -#define COL_SHIFT 20 - -#define w1 d0[0] -#define w2 d0[1] -#define w3 d0[2] -#define w4 d0[3] -#define w5 d1[0] -#define w6 d1[1] -#define w7 d1[2] -#define w4c d1[3] - - .fpu neon - - .macro idct_col4_top - vmull.s16 q7, d6, w2 /* q9 = W2 * col[2] */ - vmull.s16 q8, d6, w6 /* q10 = W6 * col[2] */ - vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */ - vadd.i32 q11, q15, q7 - vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */ - vadd.i32 q12, q15, q8 - vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */ - vsub.i32 q13, q15, q8 - vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */ - vsub.i32 q14, q15, q7 - - vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */ - vmlsl.s16 q10, d8, w7 /* 
q10 -= W7 * col[3] */ - vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */ - vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */ - .endm - - .text - .align 6 - -function idct_row4_neon - vmov.i32 q15, #(1<<(ROW_SHIFT-1)) - vld1.64 {d2-d5}, [r2,:128]! - vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */ - vld1.64 {d6,d7}, [r2,:128]! - vorr d10, d3, d5 - vld1.64 {d8,d9}, [r2,:128]! - add r2, r2, #-64 - - vorr d11, d7, d9 - vorr d10, d10, d11 - vmov r3, r4, d10 - - idct_col4_top - - orrs r3, r3, r4 - beq 1f - - vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ - vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ - vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ - vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ - vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ - vadd.i32 q11, q11, q7 - vsub.i32 q12, q12, q7 - vsub.i32 q13, q13, q7 - vadd.i32 q14, q14, q7 - vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ - vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ - vmlal.s16 q9, d9, w7 - vmlsl.s16 q10, d9, w5 - vmlal.s16 q5, d9, w3 - vmlsl.s16 q6, d9, w1 - vadd.i32 q11, q11, q7 - vsub.i32 q12, q12, q8 - vadd.i32 q13, q13, q8 - vsub.i32 q14, q14, q7 - -1: vadd.i32 q3, q11, q9 - vadd.i32 q4, q12, q10 - vshrn.i32 d2, q3, #ROW_SHIFT - vshrn.i32 d4, q4, #ROW_SHIFT - vadd.i32 q7, q13, q5 - vadd.i32 q8, q14, q6 - vtrn.16 d2, d4 - vshrn.i32 d6, q7, #ROW_SHIFT - vshrn.i32 d8, q8, #ROW_SHIFT - vsub.i32 q14, q14, q6 - vsub.i32 q11, q11, q9 - vtrn.16 d6, d8 - vsub.i32 q13, q13, q5 - vshrn.i32 d3, q14, #ROW_SHIFT - vtrn.32 d2, d6 - vsub.i32 q12, q12, q10 - vtrn.32 d4, d8 - vshrn.i32 d5, q13, #ROW_SHIFT - vshrn.i32 d7, q12, #ROW_SHIFT - vshrn.i32 d9, q11, #ROW_SHIFT - - vtrn.16 d3, d5 - vtrn.16 d7, d9 - vtrn.32 d3, d7 - vtrn.32 d5, d9 - - vst1.64 {d2-d5}, [r2,:128]! - vst1.64 {d6-d9}, [r2,:128]! 
- - bx lr - .endfunc - -function idct_col4_neon - mov ip, #16 - vld1.64 {d2}, [r2,:64], ip /* d2 = col[0] */ - vdup.16 d30, w4c - vld1.64 {d4}, [r2,:64], ip /* d3 = col[1] */ - vadd.i16 d30, d30, d2 - vld1.64 {d6}, [r2,:64], ip /* d4 = col[2] */ - vmull.s16 q15, d30, w4 /* q15 = W4*(col[0]+(1<avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab)); if (ENABLE_MMX) dsputil_init_mmx (c, avctx); - if (ENABLE_ARMV4L) dsputil_init_armv4l(c, avctx); + if (ENABLE_ARM) dsputil_init_arm (c, avctx); if (ENABLE_MLIB) dsputil_init_mlib (c, avctx); if (ENABLE_VIS) dsputil_init_vis (c, avctx); if (ENABLE_ALPHA) dsputil_init_alpha (c, avctx); diff -r c30b92cf446b -r 9281a8a9387a dsputil.h --- a/dsputil.h Wed Dec 17 00:39:45 2008 +0000 +++ b/dsputil.h Wed Dec 17 00:54:54 2008 +0000 @@ -558,7 +558,7 @@ int mm_support(void); void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx); -void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx); +void dsputil_init_arm(DSPContext* c, AVCodecContext *avctx); void dsputil_init_bfin(DSPContext* c, AVCodecContext *avctx); void dsputil_init_mlib(DSPContext* c, AVCodecContext *avctx); void dsputil_init_mmi(DSPContext* c, AVCodecContext *avctx); @@ -593,7 +593,7 @@ void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx); -#elif defined(ARCH_ARMV4L) +#elif defined(ARCH_ARM) extern int mm_flags; diff -r c30b92cf446b -r 9281a8a9387a mathops.h --- a/mathops.h Wed Dec 17 00:39:45 2008 +0000 +++ b/mathops.h Wed Dec 17 00:54:54 2008 +0000 @@ -28,9 +28,9 @@ #include "i386/mathops.h" -#elif defined(ARCH_ARMV4L) +#elif defined(ARCH_ARM) -#include "armv4l/mathops.h" +#include "arm/mathops.h" #elif defined(ARCH_POWERPC) diff -r c30b92cf446b -r 9281a8a9387a mpegvideo.c --- a/mpegvideo.c Wed Dec 17 00:39:45 2008 +0000 +++ b/mpegvideo.c Wed Dec 17 00:54:54 2008 +0000 @@ -129,8 +129,8 @@ MPV_common_init_mlib(s); #elif defined(HAVE_MMI) MPV_common_init_mmi(s); -#elif defined(ARCH_ARMV4L) - MPV_common_init_armv4l(s); +#elif 
defined(ARCH_ARM) + MPV_common_init_arm(s); #elif defined(HAVE_ALTIVEC) MPV_common_init_altivec(s); #elif defined(ARCH_BFIN) diff -r c30b92cf446b -r 9281a8a9387a mpegvideo.h --- a/mpegvideo.h Wed Dec 17 00:39:45 2008 +0000 +++ b/mpegvideo.h Wed Dec 17 00:54:54 2008 +0000 @@ -684,7 +684,7 @@ void MPV_common_init_axp(MpegEncContext *s); void MPV_common_init_mlib(MpegEncContext *s); void MPV_common_init_mmi(MpegEncContext *s); -void MPV_common_init_armv4l(MpegEncContext *s); +void MPV_common_init_arm(MpegEncContext *s); void MPV_common_init_altivec(MpegEncContext *s); void ff_clean_intra_table_entries(MpegEncContext *s); void ff_draw_horiz_band(MpegEncContext *s, int y, int h);