# HG changeset patch
# User mru
# Date 1254661982 0
# Node ID b72bb442a775a1900ae57e869b95c3856bfc9e8e
# Parent  48be79afc72dbfa31d5e63ce72068e21c0867085
ARM: clean up file/function naming conventions

diff -r 48be79afc72d -r b72bb442a775 Makefile
--- a/Makefile  Sun Oct 04 13:12:55 2009 +0000
+++ b/Makefile  Sun Oct 04 13:13:02 2009 +0000
@@ -480,8 +480,8 @@
                            alpha/mpegvideo_alpha.o \
                            alpha/simple_idct_alpha.o \
 
-OBJS-$(ARCH_ARM)        += arm/dsputil_arm.o \
-                           arm/dsputil_arm_s.o \
+OBJS-$(ARCH_ARM)        += arm/dsputil_init_arm.o \
+                           arm/dsputil_arm.o \
                            arm/fft_init_arm.o \
                            arm/jrevdct_arm.o \
                            arm/mpegvideo_arm.o \
@@ -496,7 +496,7 @@
                            arm/simple_idct_armv6.o \
 
 OBJS-$(HAVE_ARMVFP)     += arm/dsputil_vfp.o \
-                           arm/float_arm_vfp.o \
+                           arm/dsputil_init_vfp.o \
 
 OBJS-$(HAVE_IWMMXT)     += arm/dsputil_iwmmxt.o \
                            arm/mpegvideo_iwmmxt.o \
@@ -510,8 +510,8 @@
 NEON-OBJS-$(CONFIG_VP3_DECODER) += arm/vp3dsp_neon.o
 
-OBJS-$(HAVE_NEON)       += arm/dsputil_neon.o \
-                           arm/dsputil_neon_s.o \
+OBJS-$(HAVE_NEON)       += arm/dsputil_init_neon.o \
+                           arm/dsputil_neon.o \
                            arm/simple_idct_neon.o \
                            $(NEON-OBJS-yes)
diff -r 48be79afc72d -r b72bb442a775 arm/dsputil_arm.S
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/arm/dsputil_arm.S Sun Oct 04 13:13:02 2009 +0000
@@ -0,0 +1,726 @@
+@
+@ ARMv4 optimized DSP utils
+@ Copyright (c) 2004 AGAWA Koji
+@
+@ This file is part of FFmpeg.
+@
+@ FFmpeg is free software; you can redistribute it and/or
+@ modify it under the terms of the GNU Lesser General Public
+@ License as published by the Free Software Foundation; either
+@ version 2.1 of the License, or (at your option) any later version.
+@
+@ FFmpeg is distributed in the hope that it will be useful,
+@ but WITHOUT ANY WARRANTY; without even the implied warranty of
+@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+@ Lesser General Public License for more details.
+@
+@ You should have received a copy of the GNU Lesser General Public
+@ License along with FFmpeg; if not, write to the Free Software
+@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+@
+
+#include "config.h"
+#include "asm.S"
+
+        preserve8
+
+#if !HAVE_PLD
+.macro pld reg
+.endm
+#endif
+
+#if HAVE_ARMV5TE
+function ff_prefetch_arm, export=1
+        subs            r2, r2, #1
+        pld             [r0]
+        add             r0, r0, r1
+        bne             ff_prefetch_arm
+        bx              lr
+        .endfunc
+#endif
+
+.macro  ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
+        mov             \Rd0, \Rn0, lsr #(\shift * 8)
+        mov             \Rd1, \Rn1, lsr #(\shift * 8)
+        mov             \Rd2, \Rn2, lsr #(\shift * 8)
+        mov             \Rd3, \Rn3, lsr #(\shift * 8)
+        orr             \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
+        orr             \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
+        orr             \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
+        orr             \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
+.endm
+.macro  ALIGN_DWORD shift, R0, R1, R2
+        mov             \R0, \R0, lsr #(\shift * 8)
+        orr             \R0, \R0, \R1, lsl #(32 - \shift * 8)
+        mov             \R1, \R1, lsr #(\shift * 8)
+        orr             \R1, \R1, \R2, lsl #(32 - \shift * 8)
+.endm
+.macro  ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
+        mov             \Rdst0, \Rsrc0, lsr #(\shift * 8)
+        mov             \Rdst1, \Rsrc1, lsr #(\shift * 8)
+        orr             \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
+        orr             \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
+.endm
+
+.macro  RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
+        @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
+        @ Rmask = 0xFEFEFEFE
+        @ Rn = destroy
+        eor             \Rd0, \Rn0, \Rm0
+        eor             \Rd1, \Rn1, \Rm1
+        orr             \Rn0, \Rn0, \Rm0
+        orr             \Rn1, \Rn1, \Rm1
+        and             \Rd0, \Rd0, \Rmask
+        and             \Rd1, \Rd1, \Rmask
+        sub             \Rd0, \Rn0, \Rd0, lsr #1
+        sub             \Rd1, \Rn1, \Rd1, lsr #1
+.endm
+
+.macro  NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
+        @ Rd = (Rn & Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
+        @ Rmask = 0xFEFEFEFE
+        @ Rn = destroy
+        eor             \Rd0, \Rn0, \Rm0
+        eor             \Rd1, \Rn1, \Rm1
+        and             \Rn0, \Rn0, \Rm0
+        and             \Rn1, \Rn1, \Rm1
+        and             \Rd0, \Rd0, \Rmask
+        and             \Rd1, \Rd1, \Rmask
+        add             \Rd0, \Rn0, \Rd0, lsr #1
+        add             \Rd1, \Rn1, \Rd1, lsr #1
+.endm
+
+.macro  JMP_ALIGN tmp, reg
+        ands            \tmp, \reg, #3
+        bic             \reg, \reg, #3
+        beq             1f
+        subs            \tmp, \tmp, #1
+        beq             2f
+        subs            \tmp, \tmp, #1
+        beq             3f
+        b               4f
+.endm
+
+@ ----------------------------------------------------------------
+        .align 5
+function put_pixels16_arm, export=1
+        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+        @ block = word aligned, pixles = unaligned
+        pld             [r1]
+        push            {r4-r11, lr}
+        JMP_ALIGN       r5, r1
+1:
+        ldm             r1, {r4-r7}
+        add             r1, r1, r2
+        stm             r0, {r4-r7}
+        pld             [r1]
+        subs            r3, r3, #1
+        add             r0, r0, r2
+        bne             1b
+        pop             {r4-r11, pc}
+        .align 5
+2:
+        ldm             r1, {r4-r8}
+        add             r1, r1, r2
+        ALIGN_QWORD_D   1, r9, r10, r11, r12, r4, r5, r6, r7, r8
+        pld             [r1]
+        subs            r3, r3, #1
+        stm             r0, {r9-r12}
+        add             r0, r0, r2
+        bne             2b
+        pop             {r4-r11, pc}
+        .align 5
+3:
+        ldm             r1, {r4-r8}
+        add             r1, r1, r2
+        ALIGN_QWORD_D   2, r9, r10, r11, r12, r4, r5, r6, r7, r8
+        pld             [r1]
+        subs            r3, r3, #1
+        stm             r0, {r9-r12}
+        add             r0, r0, r2
+        bne             3b
+        pop             {r4-r11, pc}
+        .align 5
+4:
+        ldm             r1, {r4-r8}
+        add             r1, r1, r2
+        ALIGN_QWORD_D   3, r9, r10, r11, r12, r4, r5, r6, r7, r8
+        pld             [r1]
+        subs            r3, r3, #1
+        stm             r0, {r9-r12}
+        add             r0, r0, r2
+        bne             4b
+        pop             {r4-r11,pc}
+        .endfunc
+
+@ ----------------------------------------------------------------
+        .align 5
+function put_pixels8_arm, export=1
+        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+        @ block = word aligned, pixles = unaligned
+        pld             [r1]
+        push            {r4-r5,lr}
+        JMP_ALIGN       r5, r1
+1:
+        ldm
r1, {r4-r5} + add r1, r1, r2 + subs r3, r3, #1 + pld [r1] + stm r0, {r4-r5} + add r0, r0, r2 + bne 1b + pop {r4-r5,pc} + .align 5 +2: + ldm r1, {r4-r5, r12} + add r1, r1, r2 + ALIGN_DWORD 1, r4, r5, r12 + pld [r1] + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 2b + pop {r4-r5,pc} + .align 5 +3: + ldm r1, {r4-r5, r12} + add r1, r1, r2 + ALIGN_DWORD 2, r4, r5, r12 + pld [r1] + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 3b + pop {r4-r5,pc} + .align 5 +4: + ldm r1, {r4-r5, r12} + add r1, r1, r2 + ALIGN_DWORD 3, r4, r5, r12 + pld [r1] + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 4b + pop {r4-r5,pc} + .endfunc + +@ ---------------------------------------------------------------- + .align 5 +function put_pixels8_x2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r10,lr} + ldr r12, =0xfefefefe + JMP_ALIGN r5, r1 +1: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 + pld [r1] + RND_AVG32 r8, r9, r4, r5, r6, r7, r12 + subs r3, r3, #1 + stm r0, {r8-r9} + add r0, r0, r2 + bne 1b + pop {r4-r10,pc} + .align 5 +2: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 + ALIGN_DWORD_D 2, r8, r9, r4, r5, r10 + pld [r1] + RND_AVG32 r4, r5, r6, r7, r8, r9, r12 + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 2b + pop {r4-r10,pc} + .align 5 +3: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 2, r6, r7, r4, r5, r10 + ALIGN_DWORD_D 3, r8, r9, r4, r5, r10 + pld [r1] + RND_AVG32 r4, r5, r6, r7, r8, r9, r12 + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 3b + pop {r4-r10,pc} + .align 5 +4: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 3, r6, r7, r4, r5, r10 + pld [r1] + RND_AVG32 r8, r9, r6, r7, r5, r10, r12 + subs r3, r3, #1 + stm r0, {r8-r9} + add r0, r0, r2 + bne 4b + pop {r4-r10,pc} + .endfunc + + .align 5 +function put_no_rnd_pixels8_x2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r10,lr} + ldr r12, =0xfefefefe + JMP_ALIGN r5, r1 +1: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 + pld [r1] + NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 + subs r3, r3, #1 + stm r0, {r8-r9} + add r0, r0, r2 + bne 1b + pop {r4-r10,pc} + .align 5 +2: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 + ALIGN_DWORD_D 2, r8, r9, r4, r5, r10 + pld [r1] + NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 2b + pop {r4-r10,pc} + .align 5 +3: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 2, r6, r7, r4, r5, r10 + ALIGN_DWORD_D 3, r8, r9, r4, r5, r10 + pld [r1] + NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 3b + pop {r4-r10,pc} + .align 5 +4: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 3, r6, r7, r4, r5, r10 + pld [r1] + NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12 + subs r3, r3, #1 + stm r0, {r8-r9} + add r0, r0, r2 + bne 4b + pop {r4-r10,pc} + .endfunc + + +@ ---------------------------------------------------------------- + .align 5 +function put_pixels8_y2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r11,lr} + mov r3, r3, lsr #1 + ldr r12, =0xfefefefe + JMP_ALIGN r5, r1 +1: + ldm r1, {r4-r5} + add r1, 
r1, r2 +6: ldm r1, {r6-r7} + add r1, r1, r2 + pld [r1] + RND_AVG32 r8, r9, r4, r5, r6, r7, r12 + ldm r1, {r4-r5} + add r1, r1, r2 + stm r0, {r8-r9} + add r0, r0, r2 + pld [r1] + RND_AVG32 r8, r9, r6, r7, r4, r5, r12 + subs r3, r3, #1 + stm r0, {r8-r9} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} + .align 5 +2: + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 1, r4, r5, r6 +6: ldm r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 1, r7, r8, r9 + RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stm r0, {r10-r11} + add r0, r0, r2 + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 1, r4, r5, r6 + subs r3, r3, #1 + RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stm r0, {r10-r11} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} + .align 5 +3: + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 2, r4, r5, r6 +6: ldm r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 2, r7, r8, r9 + RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stm r0, {r10-r11} + add r0, r0, r2 + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 2, r4, r5, r6 + subs r3, r3, #1 + RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stm r0, {r10-r11} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} + .align 5 +4: + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 3, r4, r5, r6 +6: ldm r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 3, r7, r8, r9 + RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stm r0, {r10-r11} + add r0, r0, r2 + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 3, r4, r5, r6 + subs r3, r3, #1 + RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stm r0, {r10-r11} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} + .endfunc + + .align 5 +function put_no_rnd_pixels8_y2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r11,lr} + mov r3, r3, lsr #1 + ldr r12, =0xfefefefe + JMP_ALIGN r5, r1 +1: + ldm r1, {r4-r5} + add r1, r1, r2 +6: ldm r1, {r6-r7} + add r1, r1, r2 + pld [r1] + NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 + ldm r1, {r4-r5} + add r1, r1, r2 + stm r0, {r8-r9} + add r0, r0, r2 + pld [r1] + NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12 + subs r3, r3, #1 + stm r0, {r8-r9} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} + .align 5 +2: + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 1, r4, r5, r6 +6: ldm r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 1, r7, r8, r9 + NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stm r0, {r10-r11} + add r0, r0, r2 + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 1, r4, r5, r6 + subs r3, r3, #1 + NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stm r0, {r10-r11} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} + .align 5 +3: + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 2, r4, r5, r6 +6: ldm r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 2, r7, r8, r9 + NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stm r0, {r10-r11} + add r0, r0, r2 + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 2, r4, r5, r6 + subs r3, r3, #1 + NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stm r0, {r10-r11} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} + .align 5 +4: + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 3, r4, r5, r6 +6: ldm r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 3, r7, r8, r9 + NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stm r0, {r10-r11} + add r0, r0, r2 + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 3, r4, r5, r6 + subs r3, r3, #1 + NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stm r0, {r10-r11} + add r0, r0, 
r2 + bne 6b + pop {r4-r11,pc} + .endfunc + + .ltorg + +@ ---------------------------------------------------------------- +.macro RND_XY2_IT align, rnd + @ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202) + @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2) +.if \align == 0 + ldm r1, {r6-r8} +.elseif \align == 3 + ldm r1, {r5-r7} +.else + ldm r1, {r8-r10} +.endif + add r1, r1, r2 + pld [r1] +.if \align == 0 + ALIGN_DWORD_D 1, r4, r5, r6, r7, r8 +.elseif \align == 1 + ALIGN_DWORD_D 1, r4, r5, r8, r9, r10 + ALIGN_DWORD_D 2, r6, r7, r8, r9, r10 +.elseif \align == 2 + ALIGN_DWORD_D 2, r4, r5, r8, r9, r10 + ALIGN_DWORD_D 3, r6, r7, r8, r9, r10 +.elseif \align == 3 + ALIGN_DWORD_D 3, r4, r5, r5, r6, r7 +.endif + ldr r14, =0x03030303 + tst r3, #1 + and r8, r4, r14 + and r9, r5, r14 + and r10, r6, r14 + and r11, r7, r14 + andeq r14, r14, r14, \rnd #1 + add r8, r8, r10 + add r9, r9, r11 + ldr r12, =0xfcfcfcfc >> 2 + addeq r8, r8, r14 + addeq r9, r9, r14 + and r4, r12, r4, lsr #2 + and r5, r12, r5, lsr #2 + and r6, r12, r6, lsr #2 + and r7, r12, r7, lsr #2 + add r10, r4, r6 + add r11, r5, r7 + subs r3, r3, #1 +.endm + +.macro RND_XY2_EXPAND align, rnd + RND_XY2_IT \align, \rnd +6: push {r8-r11} + RND_XY2_IT \align, \rnd + pop {r4-r7} + add r4, r4, r8 + add r5, r5, r9 + ldr r14, =0x0f0f0f0f + add r6, r6, r10 + add r7, r7, r11 + and r4, r14, r4, lsr #2 + and r5, r14, r5, lsr #2 + add r4, r4, r6 + add r5, r5, r7 + stm r0, {r4-r5} + add r0, r0, r2 + bge 6b + pop {r4-r11,pc} +.endm + + .align 5 +function put_pixels8_xy2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r11,lr} @ R14 is also called LR + JMP_ALIGN r5, r1 +1: + RND_XY2_EXPAND 0, lsl + + .align 5 +2: + RND_XY2_EXPAND 1, lsl + + .align 5 +3: + RND_XY2_EXPAND 2, lsl + + .align 5 +4: + RND_XY2_EXPAND 3, lsl + .endfunc + + .align 5 +function put_no_rnd_pixels8_xy2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r11,lr} + JMP_ALIGN r5, r1 +1: + RND_XY2_EXPAND 0, lsr + + .align 5 +2: + RND_XY2_EXPAND 1, lsr + + .align 5 +3: + RND_XY2_EXPAND 2, lsr + + .align 5 +4: + RND_XY2_EXPAND 3, lsr + .endfunc + + .align 5 +@ void ff_add_pixels_clamped_ARM(int16_t *block, uint8_t *dest, int stride) +function ff_add_pixels_clamped_ARM, export=1 + push {r4-r10} + mov r10, #8 +1: + ldr r4, [r1] /* load dest */ + /* block[0] and block[1]*/ + ldrsh r5, [r0] + ldrsh r7, [r0, #2] + and r6, r4, #0xFF + and r8, r4, #0xFF00 + add r6, r5, r6 + add r8, r7, r8, lsr #8 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + movne r6, r5, lsr #24 + tst r8, #0x100 + movne r8, r7, lsr #24 + mov r9, r6 + ldrsh r5, [r0, #4] /* moved form [A] */ + orr r9, r9, r8, lsl #8 + /* block[2] and block[3] */ + /* [A] */ + ldrsh r7, [r0, #6] + and r6, r4, #0xFF0000 + and r8, r4, #0xFF000000 + add r6, r5, r6, lsr #16 + add r8, r7, r8, lsr #24 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + movne r6, r5, lsr #24 + tst r8, #0x100 + movne r8, r7, lsr #24 + orr r9, r9, r6, lsl #16 + ldr r4, [r1, #4] /* moved form [B] */ + orr r9, r9, r8, lsl #24 + /* store dest */ + ldrsh r5, [r0, #8] /* moved form [C] */ + str r9, [r1] + + /* load dest */ + /* [B] */ + /* block[4] and block[5] */ + /* [C] */ + ldrsh r7, [r0, #10] + and r6, r4, #0xFF + and r8, r4, #0xFF00 + add r6, r5, r6 + add r8, r7, r8, lsr #8 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + movne r6, r5, lsr #24 + tst r8, #0x100 + 
movne r8, r7, lsr #24 + mov r9, r6 + ldrsh r5, [r0, #12] /* moved from [D] */ + orr r9, r9, r8, lsl #8 + /* block[6] and block[7] */ + /* [D] */ + ldrsh r7, [r0, #14] + and r6, r4, #0xFF0000 + and r8, r4, #0xFF000000 + add r6, r5, r6, lsr #16 + add r8, r7, r8, lsr #24 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + movne r6, r5, lsr #24 + tst r8, #0x100 + movne r8, r7, lsr #24 + orr r9, r9, r6, lsl #16 + add r0, r0, #16 /* moved from [E] */ + orr r9, r9, r8, lsl #24 + subs r10, r10, #1 /* moved from [F] */ + /* store dest */ + str r9, [r1, #4] + + /* [E] */ + /* [F] */ + add r1, r1, r2 + bne 1b + + pop {r4-r10} + bx lr + .endfunc diff -r 48be79afc72d -r b72bb442a775 arm/dsputil_arm.c --- a/arm/dsputil_arm.c Sun Oct 04 13:12:55 2009 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,130 +0,0 @@ -/* - * ARM optimized DSP utils - * Copyright (c) 2001 Lionel Ulmer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavcodec/dsputil.h" -#include "dsputil_arm.h" - -void j_rev_dct_ARM(DCTELEM *data); -void simple_idct_ARM(DCTELEM *data); - -/* XXX: local hack */ -static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); -static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); - -void put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); -void put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); -void put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); -void put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); - -void put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); -void put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); -void put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); - -void put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); - -CALL_2X_PIXELS(put_pixels16_x2_arm , put_pixels8_x2_arm , 8) -CALL_2X_PIXELS(put_pixels16_y2_arm , put_pixels8_y2_arm , 8) -CALL_2X_PIXELS(put_pixels16_xy2_arm, put_pixels8_xy2_arm, 8) -CALL_2X_PIXELS(put_no_rnd_pixels16_x2_arm , put_no_rnd_pixels8_x2_arm , 8) -CALL_2X_PIXELS(put_no_rnd_pixels16_y2_arm , put_no_rnd_pixels8_y2_arm , 8) -CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_arm, put_no_rnd_pixels8_xy2_arm, 8) - -void ff_add_pixels_clamped_ARM(short *block, unsigned char *dest, - int line_size); - -/* XXX: those functions should be suppressed ASAP when all IDCTs are - converted */ -static void j_rev_dct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block) -{ - j_rev_dct_ARM (block); - ff_put_pixels_clamped(block, dest, line_size); -} -static void j_rev_dct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block) -{ - 
j_rev_dct_ARM (block); - ff_add_pixels_clamped(block, dest, line_size); -} -static void simple_idct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block) -{ - simple_idct_ARM (block); - ff_put_pixels_clamped(block, dest, line_size); -} -static void simple_idct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block) -{ - simple_idct_ARM (block); - ff_add_pixels_clamped(block, dest, line_size); -} - -int mm_support(void) -{ - return HAVE_IWMMXT * FF_MM_IWMMXT; -} - -void dsputil_init_arm(DSPContext* c, AVCodecContext *avctx) -{ - ff_put_pixels_clamped = c->put_pixels_clamped; - ff_add_pixels_clamped = c->add_pixels_clamped; - - if (avctx->lowres == 0) { - if(avctx->idct_algo == FF_IDCT_AUTO || - avctx->idct_algo == FF_IDCT_ARM){ - c->idct_put= j_rev_dct_ARM_put; - c->idct_add= j_rev_dct_ARM_add; - c->idct = j_rev_dct_ARM; - c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; - } else if (avctx->idct_algo==FF_IDCT_SIMPLEARM){ - c->idct_put= simple_idct_ARM_put; - c->idct_add= simple_idct_ARM_add; - c->idct = simple_idct_ARM; - c->idct_permutation_type= FF_NO_IDCT_PERM; - } - } - - c->put_pixels_tab[0][0] = put_pixels16_arm; - c->put_pixels_tab[0][1] = put_pixels16_x2_arm; - c->put_pixels_tab[0][2] = put_pixels16_y2_arm; - c->put_pixels_tab[0][3] = put_pixels16_xy2_arm; - c->put_no_rnd_pixels_tab[0][0] = put_pixels16_arm; - c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_arm; - c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_arm; - c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_arm; - c->put_pixels_tab[1][0] = put_pixels8_arm; - c->put_pixels_tab[1][1] = put_pixels8_x2_arm; - c->put_pixels_tab[1][2] = put_pixels8_y2_arm; - c->put_pixels_tab[1][3] = put_pixels8_xy2_arm; - c->put_no_rnd_pixels_tab[1][0] = put_pixels8_arm; - c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_arm; - c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_arm; - c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_arm; - - if (HAVE_ARMV5TE) ff_dsputil_init_armv5te(c, avctx); - if (HAVE_ARMV6) ff_dsputil_init_armv6(c, avctx); - -#if HAVE_IWMMXT - dsputil_init_iwmmxt(c, avctx); -#endif -#if HAVE_ARMVFP - ff_float_init_arm_vfp(c, avctx); -#endif -#if HAVE_NEON - ff_dsputil_init_neon(c, avctx); -#endif -} diff -r 48be79afc72d -r b72bb442a775 arm/dsputil_arm.h --- a/arm/dsputil_arm.h Sun Oct 04 13:12:55 2009 +0000 +++ b/arm/dsputil_arm.h Sun Oct 04 13:13:02 2009 +0000 @@ -23,7 +23,7 @@ void ff_dsputil_init_armv5te(DSPContext* c, AVCodecContext *avctx); void ff_dsputil_init_armv6(DSPContext* c, AVCodecContext *avctx); -void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx); +void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx); void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx); void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx); diff -r 48be79afc72d -r b72bb442a775 arm/dsputil_arm_s.S --- a/arm/dsputil_arm_s.S Sun Oct 04 13:12:55 2009 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,726 +0,0 @@ -@ -@ ARMv4 optimized DSP utils -@ Copyright (c) 2004 AGAWA Koji -@ -@ This file is part of FFmpeg. -@ -@ FFmpeg is free software; you can redistribute it and/or -@ modify it under the terms of the GNU Lesser General Public -@ License as published by the Free Software Foundation; either -@ version 2.1 of the License, or (at your option) any later version. -@ -@ FFmpeg is distributed in the hope that it will be useful, -@ but WITHOUT ANY WARRANTY; without even the implied warranty of -@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU -@ Lesser General Public License for more details. -@ -@ You should have received a copy of the GNU Lesser General Public -@ License along with FFmpeg; if not, write to the Free Software -@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -@ - -#include "config.h" -#include "asm.S" - - preserve8 - -#if !HAVE_PLD -.macro pld reg -.endm -#endif - -#if HAVE_ARMV5TE -function ff_prefetch_arm, export=1 - subs r2, r2, #1 - pld [r0] - add r0, r0, r1 - bne ff_prefetch_arm - bx lr - .endfunc -#endif - -.macro ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4 - mov \Rd0, \Rn0, lsr #(\shift * 8) - mov \Rd1, \Rn1, lsr #(\shift * 8) - mov \Rd2, \Rn2, lsr #(\shift * 8) - mov \Rd3, \Rn3, lsr #(\shift * 8) - orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8) - orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8) - orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8) - orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8) -.endm -.macro ALIGN_DWORD shift, R0, R1, R2 - mov \R0, \R0, lsr #(\shift * 8) - orr \R0, \R0, \R1, lsl #(32 - \shift * 8) - mov \R1, \R1, lsr #(\shift * 8) - orr \R1, \R1, \R2, lsl #(32 - \shift * 8) -.endm -.macro ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2 - mov \Rdst0, \Rsrc0, lsr #(\shift * 8) - mov \Rdst1, \Rsrc1, lsr #(\shift * 8) - orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8)) - orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8)) -.endm - -.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask - @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1) - @ Rmask = 0xFEFEFEFE - @ Rn = destroy - eor \Rd0, \Rn0, \Rm0 - eor \Rd1, \Rn1, \Rm1 - orr \Rn0, \Rn0, \Rm0 - orr \Rn1, \Rn1, \Rm1 - and \Rd0, \Rd0, \Rmask - and \Rd1, \Rd1, \Rmask - sub \Rd0, \Rn0, \Rd0, lsr #1 - sub \Rd1, \Rn1, \Rd1, lsr #1 -.endm - -.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask - @ Rd = (Rn & Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1) - @ Rmask = 0xFEFEFEFE - @ Rn = destroy - eor \Rd0, \Rn0, \Rm0 - eor \Rd1, \Rn1, \Rm1 - and \Rn0, \Rn0, \Rm0 - and \Rn1, \Rn1, \Rm1 - and \Rd0, \Rd0, \Rmask - and \Rd1, \Rd1, \Rmask - add \Rd0, \Rn0, \Rd0, lsr #1 - add \Rd1, \Rn1, \Rd1, lsr #1 -.endm - -.macro JMP_ALIGN tmp, reg - ands \tmp, \reg, #3 - bic \reg, \reg, #3 - beq 1f - subs \tmp, \tmp, #1 - beq 2f - subs \tmp, \tmp, #1 - beq 3f - b 4f -.endm - -@ ---------------------------------------------------------------- - .align 5 -function put_pixels16_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r11, lr} - JMP_ALIGN r5, r1 -1: - ldm r1, {r4-r7} - add r1, r1, r2 - stm r0, {r4-r7} - pld [r1] - subs r3, r3, #1 - add r0, r0, r2 - bne 1b - pop {r4-r11, pc} - .align 5 -2: - ldm r1, {r4-r8} - add r1, r1, r2 - ALIGN_QWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8 - pld [r1] - subs r3, r3, #1 - stm r0, {r9-r12} - add r0, r0, r2 - bne 2b - pop {r4-r11, pc} - .align 5 -3: - ldm r1, {r4-r8} - add r1, r1, r2 - ALIGN_QWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8 - pld [r1] - subs r3, r3, #1 - stm r0, {r9-r12} - add r0, r0, r2 - bne 3b - pop {r4-r11, pc} - .align 5 -4: - ldm r1, {r4-r8} - add r1, r1, r2 - ALIGN_QWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8 - pld [r1] - subs r3, r3, #1 - stm r0, {r9-r12} - add r0, r0, r2 - bne 4b - pop {r4-r11,pc} - .endfunc - -@ ---------------------------------------------------------------- - .align 5 -function put_pixels8_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = 
unaligned - pld [r1] - push {r4-r5,lr} - JMP_ALIGN r5, r1 -1: - ldm r1, {r4-r5} - add r1, r1, r2 - subs r3, r3, #1 - pld [r1] - stm r0, {r4-r5} - add r0, r0, r2 - bne 1b - pop {r4-r5,pc} - .align 5 -2: - ldm r1, {r4-r5, r12} - add r1, r1, r2 - ALIGN_DWORD 1, r4, r5, r12 - pld [r1] - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 2b - pop {r4-r5,pc} - .align 5 -3: - ldm r1, {r4-r5, r12} - add r1, r1, r2 - ALIGN_DWORD 2, r4, r5, r12 - pld [r1] - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 3b - pop {r4-r5,pc} - .align 5 -4: - ldm r1, {r4-r5, r12} - add r1, r1, r2 - ALIGN_DWORD 3, r4, r5, r12 - pld [r1] - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 4b - pop {r4-r5,pc} - .endfunc - -@ ---------------------------------------------------------------- - .align 5 -function put_pixels8_x2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r10,lr} - ldr r12, =0xfefefefe - JMP_ALIGN r5, r1 -1: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 - pld [r1] - RND_AVG32 r8, r9, r4, r5, r6, r7, r12 - subs r3, r3, #1 - stm r0, {r8-r9} - add r0, r0, r2 - bne 1b - pop {r4-r10,pc} - .align 5 -2: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 - ALIGN_DWORD_D 2, r8, r9, r4, r5, r10 - pld [r1] - RND_AVG32 r4, r5, r6, r7, r8, r9, r12 - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 2b - pop {r4-r10,pc} - .align 5 -3: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 2, r6, r7, r4, r5, r10 - ALIGN_DWORD_D 3, r8, r9, r4, r5, r10 - pld [r1] - RND_AVG32 r4, r5, r6, r7, r8, r9, r12 - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 3b - pop {r4-r10,pc} - .align 5 -4: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 3, r6, r7, r4, r5, r10 - pld [r1] - RND_AVG32 r8, r9, r6, r7, r5, r10, r12 - subs r3, r3, #1 - stm r0, {r8-r9} - add r0, r0, r2 - bne 4b - pop {r4-r10,pc} - .endfunc - - .align 5 -function put_no_rnd_pixels8_x2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r10,lr} - ldr r12, =0xfefefefe - JMP_ALIGN r5, r1 -1: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 - pld [r1] - NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 - subs r3, r3, #1 - stm r0, {r8-r9} - add r0, r0, r2 - bne 1b - pop {r4-r10,pc} - .align 5 -2: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 - ALIGN_DWORD_D 2, r8, r9, r4, r5, r10 - pld [r1] - NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 2b - pop {r4-r10,pc} - .align 5 -3: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 2, r6, r7, r4, r5, r10 - ALIGN_DWORD_D 3, r8, r9, r4, r5, r10 - pld [r1] - NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 3b - pop {r4-r10,pc} - .align 5 -4: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 3, r6, r7, r4, r5, r10 - pld [r1] - NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12 - subs r3, r3, #1 - stm r0, {r8-r9} - add r0, r0, r2 - bne 4b - pop {r4-r10,pc} - .endfunc - - -@ ---------------------------------------------------------------- - .align 5 -function put_pixels8_y2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r11,lr} - mov r3, r3, lsr #1 - ldr 
r12, =0xfefefefe - JMP_ALIGN r5, r1 -1: - ldm r1, {r4-r5} - add r1, r1, r2 -6: ldm r1, {r6-r7} - add r1, r1, r2 - pld [r1] - RND_AVG32 r8, r9, r4, r5, r6, r7, r12 - ldm r1, {r4-r5} - add r1, r1, r2 - stm r0, {r8-r9} - add r0, r0, r2 - pld [r1] - RND_AVG32 r8, r9, r6, r7, r4, r5, r12 - subs r3, r3, #1 - stm r0, {r8-r9} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} - .align 5 -2: - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 1, r4, r5, r6 -6: ldm r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 1, r7, r8, r9 - RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stm r0, {r10-r11} - add r0, r0, r2 - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 1, r4, r5, r6 - subs r3, r3, #1 - RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stm r0, {r10-r11} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} - .align 5 -3: - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 2, r4, r5, r6 -6: ldm r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 2, r7, r8, r9 - RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stm r0, {r10-r11} - add r0, r0, r2 - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 2, r4, r5, r6 - subs r3, r3, #1 - RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stm r0, {r10-r11} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} - .align 5 -4: - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 3, r4, r5, r6 -6: ldm r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 3, r7, r8, r9 - RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stm r0, {r10-r11} - add r0, r0, r2 - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 3, r4, r5, r6 - subs r3, r3, #1 - RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stm r0, {r10-r11} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} - .endfunc - - .align 5 -function put_no_rnd_pixels8_y2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r11,lr} - mov r3, r3, lsr #1 - ldr r12, =0xfefefefe - JMP_ALIGN r5, r1 -1: - ldm r1, {r4-r5} - add r1, r1, r2 -6: ldm r1, {r6-r7} - add r1, r1, r2 - pld [r1] - NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 - ldm r1, {r4-r5} - add r1, r1, r2 - stm r0, {r8-r9} - add r0, r0, r2 - pld [r1] - NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12 - subs r3, r3, #1 - stm r0, {r8-r9} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} - .align 5 -2: - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 1, r4, r5, r6 -6: ldm r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 1, r7, r8, r9 - NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stm r0, {r10-r11} - add r0, r0, r2 - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 1, r4, r5, r6 - subs r3, r3, #1 - NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stm r0, {r10-r11} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} - .align 5 -3: - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 2, r4, r5, r6 -6: ldm r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 2, r7, r8, r9 - NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stm r0, {r10-r11} - add r0, r0, r2 - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 2, r4, r5, r6 - subs r3, r3, #1 - NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stm r0, {r10-r11} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} - .align 5 -4: - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 3, r4, r5, r6 -6: ldm r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 3, r7, r8, r9 - NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stm r0, {r10-r11} - add r0, r0, r2 - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 3, r4, r5, r6 - subs r3, r3, #1 - 
NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stm r0, {r10-r11} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} - .endfunc - - .ltorg - -@ ---------------------------------------------------------------- -.macro RND_XY2_IT align, rnd - @ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202) - @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2) -.if \align == 0 - ldm r1, {r6-r8} -.elseif \align == 3 - ldm r1, {r5-r7} -.else - ldm r1, {r8-r10} -.endif - add r1, r1, r2 - pld [r1] -.if \align == 0 - ALIGN_DWORD_D 1, r4, r5, r6, r7, r8 -.elseif \align == 1 - ALIGN_DWORD_D 1, r4, r5, r8, r9, r10 - ALIGN_DWORD_D 2, r6, r7, r8, r9, r10 -.elseif \align == 2 - ALIGN_DWORD_D 2, r4, r5, r8, r9, r10 - ALIGN_DWORD_D 3, r6, r7, r8, r9, r10 -.elseif \align == 3 - ALIGN_DWORD_D 3, r4, r5, r5, r6, r7 -.endif - ldr r14, =0x03030303 - tst r3, #1 - and r8, r4, r14 - and r9, r5, r14 - and r10, r6, r14 - and r11, r7, r14 - andeq r14, r14, r14, \rnd #1 - add r8, r8, r10 - add r9, r9, r11 - ldr r12, =0xfcfcfcfc >> 2 - addeq r8, r8, r14 - addeq r9, r9, r14 - and r4, r12, r4, lsr #2 - and r5, r12, r5, lsr #2 - and r6, r12, r6, lsr #2 - and r7, r12, r7, lsr #2 - add r10, r4, r6 - add r11, r5, r7 - subs r3, r3, #1 -.endm - -.macro RND_XY2_EXPAND align, rnd - RND_XY2_IT \align, \rnd -6: push {r8-r11} - RND_XY2_IT \align, \rnd - pop {r4-r7} - add r4, r4, r8 - add r5, r5, r9 - ldr r14, =0x0f0f0f0f - add r6, r6, r10 - add r7, r7, r11 - and r4, r14, r4, lsr #2 - and r5, r14, r5, lsr #2 - add r4, r4, r6 - add r5, r5, r7 - stm r0, {r4-r5} - add r0, r0, r2 - bge 6b - pop {r4-r11,pc} -.endm - - .align 5 -function put_pixels8_xy2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r11,lr} @ R14 is also called LR - JMP_ALIGN r5, r1 -1: - RND_XY2_EXPAND 0, lsl - - .align 5 -2: - RND_XY2_EXPAND 1, lsl - - .align 5 -3: - RND_XY2_EXPAND 2, lsl - - .align 5 -4: - RND_XY2_EXPAND 3, lsl - .endfunc - - .align 5 -function put_no_rnd_pixels8_xy2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r11,lr} - JMP_ALIGN r5, r1 -1: - RND_XY2_EXPAND 0, lsr - - .align 5 -2: - RND_XY2_EXPAND 1, lsr - - .align 5 -3: - RND_XY2_EXPAND 2, lsr - - .align 5 -4: - RND_XY2_EXPAND 3, lsr - .endfunc - - .align 5 -@ void ff_add_pixels_clamped_ARM(int16_t *block, uint8_t *dest, int stride) -function ff_add_pixels_clamped_ARM, export=1 - push {r4-r10} - mov r10, #8 -1: - ldr r4, [r1] /* load dest */ - /* block[0] and block[1]*/ - ldrsh r5, [r0] - ldrsh r7, [r0, #2] - and r6, r4, #0xFF - and r8, r4, #0xFF00 - add r6, r5, r6 - add r8, r7, r8, lsr #8 - mvn r5, r5 - mvn r7, r7 - tst r6, #0x100 - movne r6, r5, lsr #24 - tst r8, #0x100 - movne r8, r7, lsr #24 - mov r9, r6 - ldrsh r5, [r0, #4] /* moved form [A] */ - orr r9, r9, r8, lsl #8 - /* block[2] and block[3] */ - /* [A] */ - ldrsh r7, [r0, #6] - and r6, r4, #0xFF0000 - and r8, r4, #0xFF000000 - add r6, r5, r6, lsr #16 - add r8, r7, r8, lsr #24 - mvn r5, r5 - mvn r7, r7 - tst r6, #0x100 - movne r6, r5, lsr #24 - tst r8, #0x100 - movne r8, r7, lsr #24 - orr r9, r9, r6, lsl #16 - ldr r4, [r1, #4] /* moved form [B] */ - orr r9, r9, r8, lsl #24 - /* store dest */ - ldrsh r5, [r0, #8] /* moved form [C] */ - str r9, [r1] - - /* load dest */ - /* [B] */ - /* block[4] and block[5] */ - /* [C] */ - ldrsh r7, [r0, #10] - and r6, r4, #0xFF - and r8, r4, #0xFF00 - add r6, r5, r6 - add r8, r7, r8, lsr #8 - mvn r5, 
r5 - mvn r7, r7 - tst r6, #0x100 - movne r6, r5, lsr #24 - tst r8, #0x100 - movne r8, r7, lsr #24 - mov r9, r6 - ldrsh r5, [r0, #12] /* moved from [D] */ - orr r9, r9, r8, lsl #8 - /* block[6] and block[7] */ - /* [D] */ - ldrsh r7, [r0, #14] - and r6, r4, #0xFF0000 - and r8, r4, #0xFF000000 - add r6, r5, r6, lsr #16 - add r8, r7, r8, lsr #24 - mvn r5, r5 - mvn r7, r7 - tst r6, #0x100 - movne r6, r5, lsr #24 - tst r8, #0x100 - movne r8, r7, lsr #24 - orr r9, r9, r6, lsl #16 - add r0, r0, #16 /* moved from [E] */ - orr r9, r9, r8, lsl #24 - subs r10, r10, #1 /* moved from [F] */ - /* store dest */ - str r9, [r1, #4] - - /* [E] */ - /* [F] */ - add r1, r1, r2 - bne 1b - - pop {r4-r10} - bx lr - .endfunc diff -r 48be79afc72d -r b72bb442a775 arm/dsputil_init_arm.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arm/dsputil_init_arm.c Sun Oct 04 13:13:02 2009 +0000 @@ -0,0 +1,130 @@ +/* + * ARM optimized DSP utils + * Copyright (c) 2001 Lionel Ulmer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/dsputil.h" +#include "dsputil_arm.h" + +void j_rev_dct_ARM(DCTELEM *data); +void simple_idct_ARM(DCTELEM *data); + +/* XXX: local hack */ +static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); +static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); + +void put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); +void put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); +void put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); +void put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); + +void put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); +void put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); +void put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); + +void put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); + +CALL_2X_PIXELS(put_pixels16_x2_arm , put_pixels8_x2_arm , 8) +CALL_2X_PIXELS(put_pixels16_y2_arm , put_pixels8_y2_arm , 8) +CALL_2X_PIXELS(put_pixels16_xy2_arm, put_pixels8_xy2_arm, 8) +CALL_2X_PIXELS(put_no_rnd_pixels16_x2_arm , put_no_rnd_pixels8_x2_arm , 8) +CALL_2X_PIXELS(put_no_rnd_pixels16_y2_arm , put_no_rnd_pixels8_y2_arm , 8) +CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_arm, put_no_rnd_pixels8_xy2_arm, 8) + +void ff_add_pixels_clamped_ARM(short *block, unsigned char *dest, + int line_size); + +/* XXX: those functions should be suppressed ASAP when all IDCTs are + converted */ +static void j_rev_dct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block) +{ + j_rev_dct_ARM (block); + ff_put_pixels_clamped(block, dest, line_size); 
+} +static void j_rev_dct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block) +{ + j_rev_dct_ARM (block); + ff_add_pixels_clamped(block, dest, line_size); +} +static void simple_idct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block) +{ + simple_idct_ARM (block); + ff_put_pixels_clamped(block, dest, line_size); +} +static void simple_idct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block) +{ + simple_idct_ARM (block); + ff_add_pixels_clamped(block, dest, line_size); +} + +int mm_support(void) +{ + return HAVE_IWMMXT * FF_MM_IWMMXT; +} + +void dsputil_init_arm(DSPContext* c, AVCodecContext *avctx) +{ + ff_put_pixels_clamped = c->put_pixels_clamped; + ff_add_pixels_clamped = c->add_pixels_clamped; + + if (avctx->lowres == 0) { + if(avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_ARM){ + c->idct_put= j_rev_dct_ARM_put; + c->idct_add= j_rev_dct_ARM_add; + c->idct = j_rev_dct_ARM; + c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; + } else if (avctx->idct_algo==FF_IDCT_SIMPLEARM){ + c->idct_put= simple_idct_ARM_put; + c->idct_add= simple_idct_ARM_add; + c->idct = simple_idct_ARM; + c->idct_permutation_type= FF_NO_IDCT_PERM; + } + } + + c->put_pixels_tab[0][0] = put_pixels16_arm; + c->put_pixels_tab[0][1] = put_pixels16_x2_arm; + c->put_pixels_tab[0][2] = put_pixels16_y2_arm; + c->put_pixels_tab[0][3] = put_pixels16_xy2_arm; + c->put_no_rnd_pixels_tab[0][0] = put_pixels16_arm; + c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_arm; + c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_arm; + c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_arm; + c->put_pixels_tab[1][0] = put_pixels8_arm; + c->put_pixels_tab[1][1] = put_pixels8_x2_arm; + c->put_pixels_tab[1][2] = put_pixels8_y2_arm; + c->put_pixels_tab[1][3] = put_pixels8_xy2_arm; + c->put_no_rnd_pixels_tab[1][0] = put_pixels8_arm; + c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_arm; + c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_arm; + c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_arm; + + if (HAVE_ARMV5TE) ff_dsputil_init_armv5te(c, avctx); + if (HAVE_ARMV6) ff_dsputil_init_armv6(c, avctx); + +#if HAVE_IWMMXT + dsputil_init_iwmmxt(c, avctx); +#endif +#if HAVE_ARMVFP + ff_dsputil_init_vfp(c, avctx); +#endif +#if HAVE_NEON + ff_dsputil_init_neon(c, avctx); +#endif +} diff -r 48be79afc72d -r b72bb442a775 arm/dsputil_init_neon.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arm/dsputil_init_neon.c Sun Oct 04 13:13:02 2009 +0000 @@ -0,0 +1,340 @@ +/* + * ARM NEON optimised DSP functions + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "libavcodec/avcodec.h" +#include "libavcodec/dsputil.h" +#include "dsputil_arm.h" + +void ff_simple_idct_neon(DCTELEM *data); +void ff_simple_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data); +void ff_simple_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data); + +void ff_vp3_idct_neon(DCTELEM *data); +void ff_vp3_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data); +void ff_vp3_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data); + +void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels8_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); + +void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, int, int); + +void ff_add_pixels_clamped_neon(const DCTELEM *, uint8_t *, int); +void ff_put_pixels_clamped_neon(const DCTELEM *, uint8_t *, int); +void ff_put_signed_pixels_clamped_neon(const DCTELEM *, uint8_t *, int); + +void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, int); + +void ff_put_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc02_neon(uint8_t *, uint8_t 
*, int); +void ff_put_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, int); + +void ff_avg_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int); + +void ff_put_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); +void ff_put_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); + +void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); +void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); + +void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, + int beta, int8_t *tc0); + +void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den, + int weight, int offset); +void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den, + int weight, int offset); +void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den, + int weight, int offset); +void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den, + int weight, int offset); +void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den, + int weight, int offset); +void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den, + int weight, int offset); +void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den, + int weight, int offset); +void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den, + int weight, int offset); + +void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride, + int log2_den, int weightd, int weights, + int offset); +void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride, + int log2_den, int weightd, int weights, + int offset); +void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride, + int log2_den, int weightd, int weights, + int offset); +void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride, + int log2_den, int weightd, int weights, + int offset); +void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride, + int log2_den, int weightd, int weights, + int offset); +void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride, + int log2_den, int weightd, int weights, + int offset); +void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride, + int log2_den, int weightd, int weights, + int offset); +void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride, + int log2_den, int weightd, int weights, + int offset); + +void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride); +void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride); +void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset, + DCTELEM *block, int stride, + const uint8_t nnzc[6*8]); +void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset, + DCTELEM *block, int stride, + const uint8_t 
nnzc[6*8]); +void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset, + DCTELEM *block, int stride, + const uint8_t nnzc[6*8]); + +void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *); +void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *); + +void ff_vector_fmul_neon(float *dst, const float *src, int len); +void ff_vector_fmul_window_neon(float *dst, const float *src0, + const float *src1, const float *win, + float add_bias, int len); +void ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul, + int len); +void ff_vector_fmul_sv_scalar_2_neon(float *dst, const float *src, + const float **vp, float mul, int len); +void ff_vector_fmul_sv_scalar_4_neon(float *dst, const float *src, + const float **vp, float mul, int len); +void ff_sv_fmul_scalar_2_neon(float *dst, const float **vp, float mul, + int len); +void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul, + int len); +void ff_butterflies_float_neon(float *v1, float *v2, int len); +float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len); +void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src, + float mul, int len); +void ff_vector_fmul_reverse_neon(float *dst, const float *src0, + const float *src1, int len); +void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1, + const float *src2, int len); + +void ff_vector_clipf_neon(float *dst, const float *src, float min, float max, + int len); +void ff_float_to_int16_neon(int16_t *, const float *, long); +void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); + +void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize); + +void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) +{ + if (!avctx->lowres) { + if (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLENEON) { + c->idct_put= ff_simple_idct_put_neon; + c->idct_add= ff_simple_idct_add_neon; + c->idct = ff_simple_idct_neon; + c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM; + } else if ((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || + CONFIG_VP6_DECODER) && + avctx->idct_algo == FF_IDCT_VP3) { + c->idct_put= ff_vp3_idct_put_neon; + c->idct_add= ff_vp3_idct_add_neon; + c->idct = ff_vp3_idct_neon; + c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; + } + } + + c->put_pixels_tab[0][0] = ff_put_pixels16_neon; + c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon; + c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon; + c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon; + c->put_pixels_tab[1][0] = ff_put_pixels8_neon; + c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon; + c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon; + c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon; + + c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon; + c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon; + c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon; + c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon; + c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon; + c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon; + c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon; + c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon; + + c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon; + + c->add_pixels_clamped = ff_add_pixels_clamped_neon; + c->put_pixels_clamped = ff_put_pixels_clamped_neon; + c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon; + + if (CONFIG_H264_DECODER) { + c->put_h264_chroma_pixels_tab[0] = 
ff_put_h264_chroma_mc8_neon; + c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon; + + c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon; + c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon; + + c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon; + c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon; + c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon; + c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon; + c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon; + c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon; + c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon; + c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon; + c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon; + c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon; + c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon; + c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon; + c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon; + c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon; + c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon; + c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon; + + c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon; + c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon; + c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon; + c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon; + c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon; + c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon; + c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon; + c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon; + c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon; + c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon; + c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon; + c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon; + c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon; + c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon; + c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_neon; + c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon; + + c->avg_h264_qpel_pixels_tab[0][ 0] = ff_avg_h264_qpel16_mc00_neon; + + c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon; + c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon; + c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; + c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; + + c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon; + c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon; + c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon; + c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon; + c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon; + c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon; + c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon; + c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon; + + c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon; + c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon; + c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon; + c->biweight_h264_pixels_tab[3] = 
ff_biweight_h264_pixels_8x8_neon; + c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon; + c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon; + c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon; + c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon; + + c->h264_idct_add = ff_h264_idct_add_neon; + c->h264_idct_dc_add = ff_h264_idct_dc_add_neon; + c->h264_idct_add16 = ff_h264_idct_add16_neon; + c->h264_idct_add16intra = ff_h264_idct_add16intra_neon; + c->h264_idct_add8 = ff_h264_idct_add8_neon; + } + + if (CONFIG_VP3_DECODER) { + c->vp3_v_loop_filter = ff_vp3_v_loop_filter_neon; + c->vp3_h_loop_filter = ff_vp3_h_loop_filter_neon; + } + + c->vector_fmul = ff_vector_fmul_neon; + c->vector_fmul_window = ff_vector_fmul_window_neon; + c->vector_fmul_scalar = ff_vector_fmul_scalar_neon; + c->butterflies_float = ff_butterflies_float_neon; + c->scalarproduct_float = ff_scalarproduct_float_neon; + c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon; + c->vector_fmul_reverse = ff_vector_fmul_reverse_neon; + c->vector_fmul_add = ff_vector_fmul_add_neon; + + c->vector_fmul_sv_scalar[0] = ff_vector_fmul_sv_scalar_2_neon; + c->vector_fmul_sv_scalar[1] = ff_vector_fmul_sv_scalar_4_neon; + + c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon; + c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon; + + c->vector_clipf = ff_vector_clipf_neon; + + if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { + c->float_to_int16 = ff_float_to_int16_neon; + c->float_to_int16_interleave = ff_float_to_int16_interleave_neon; + } + + if (CONFIG_VORBIS_DECODER) + c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon; +} diff -r 48be79afc72d -r b72bb442a775 arm/dsputil_init_vfp.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arm/dsputil_init_vfp.c Sun Oct 04 13:13:02 2009 +0000 @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2008 Siarhei Siamashka + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/dsputil.h" +#include "dsputil_arm.h" + +void ff_vector_fmul_vfp(float *dst, const float *src, int len); +void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, + const float *src1, int len); +void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len); + +void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx) +{ + c->vector_fmul = ff_vector_fmul_vfp; + c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp; +#if HAVE_ARMV6 + c->float_to_int16 = ff_float_to_int16_vfp; +#endif +} diff -r 48be79afc72d -r b72bb442a775 arm/dsputil_neon.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arm/dsputil_neon.S Sun Oct 04 13:13:02 2009 +0000 @@ -0,0 +1,1129 @@ +/* + * ARM NEON optimised DSP functions + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "asm.S" + + preserve8 + .text + + .macro pixels16 avg=0 +.if \avg + mov ip, r0 +.endif +1: vld1.64 {d0, d1}, [r1], r2 + vld1.64 {d2, d3}, [r1], r2 + vld1.64 {d4, d5}, [r1], r2 + pld [r1, r2, lsl #2] + vld1.64 {d6, d7}, [r1], r2 + pld [r1] + pld [r1, r2] + pld [r1, r2, lsl #1] +.if \avg + vld1.64 {d16,d17}, [ip,:128], r2 + vrhadd.u8 q0, q0, q8 + vld1.64 {d18,d19}, [ip,:128], r2 + vrhadd.u8 q1, q1, q9 + vld1.64 {d20,d21}, [ip,:128], r2 + vrhadd.u8 q2, q2, q10 + vld1.64 {d22,d23}, [ip,:128], r2 + vrhadd.u8 q3, q3, q11 +.endif + subs r3, r3, #4 + vst1.64 {d0, d1}, [r0,:128], r2 + vst1.64 {d2, d3}, [r0,:128], r2 + vst1.64 {d4, d5}, [r0,:128], r2 + vst1.64 {d6, d7}, [r0,:128], r2 + bne 1b + bx lr + .endm + + .macro pixels16_x2 vhadd=vrhadd.u8 +1: vld1.64 {d0-d2}, [r1], r2 + vld1.64 {d4-d6}, [r1], r2 + pld [r1] + pld [r1, r2] + subs r3, r3, #2 + vext.8 q1, q0, q1, #1 + \vhadd q0, q0, q1 + vext.8 q3, q2, q3, #1 + \vhadd q2, q2, q3 + vst1.64 {d0, d1}, [r0,:128], r2 + vst1.64 {d4, d5}, [r0,:128], r2 + bne 1b + bx lr + .endm + + .macro pixels16_y2 vhadd=vrhadd.u8 + vld1.64 {d0, d1}, [r1], r2 + vld1.64 {d2, d3}, [r1], r2 +1: subs r3, r3, #2 + \vhadd q2, q0, q1 + vld1.64 {d0, d1}, [r1], r2 + \vhadd q3, q0, q1 + vld1.64 {d2, d3}, [r1], r2 + pld [r1] + pld [r1, r2] + vst1.64 {d4, d5}, [r0,:128], r2 + vst1.64 {d6, d7}, [r0,:128], r2 + bne 1b + bx lr + .endm + + .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0 + vld1.64 {d0-d2}, [r1], r2 + vld1.64 {d4-d6}, [r1], r2 +.if \no_rnd + vmov.i16 q13, #1 +.endif + pld [r1] + pld [r1, r2] + vext.8 q1, q0, q1, #1 + vext.8 q3, q2, q3, #1 + vaddl.u8 q8, d0, d2 + vaddl.u8 q10, d1, d3 + vaddl.u8 q9, d4, d6 + vaddl.u8 q11, d5, d7 +1: subs r3, r3, #2 + vld1.64 {d0-d2}, [r1], r2 + vadd.u16 q12, q8, q9 + pld [r1] +.if \no_rnd + vadd.u16 q12, q12, q13 +.endif + vext.8 q15, q0, q1, #1 + vadd.u16 q1 , q10, q11 + \vshrn d28, q12, #2 +.if \no_rnd + vadd.u16 q1, q1, q13 +.endif + \vshrn d29, q1, #2 + vaddl.u8 q8, d0, d30 + vld1.64 {d2-d4}, [r1], r2 + vaddl.u8 q10, d1, d31 + vst1.64 {d28,d29}, [r0,:128], r2 + vadd.u16 q12, q8, q9 + pld [r1, r2] +.if \no_rnd + vadd.u16 q12, q12, q13 +.endif + vext.8 q2, q1, q2, #1 + vadd.u16 q0, q10, q11 + \vshrn d30, q12, #2 +.if \no_rnd + vadd.u16 q0, q0, q13 +.endif + \vshrn d31, q0, #2 + vaddl.u8 q9, d2, d4 + vaddl.u8 q11, d3, d5 + vst1.64 {d30,d31}, [r0,:128], r2 + bgt 1b + bx lr + .endm + + .macro pixels8 +1: vld1.64 {d0}, [r1], r2 + vld1.64 {d1}, [r1], r2 + vld1.64 {d2}, [r1], r2 + pld [r1, r2, lsl #2] + vld1.64 {d3}, [r1], r2 + pld [r1] + pld [r1, r2] + pld [r1, r2, lsl #1] + subs r3, r3, #4 + vst1.64 {d0}, [r0,:64], r2 + vst1.64 {d1}, [r0,:64], r2 + vst1.64 {d2}, [r0,:64], r2 + vst1.64 {d3}, [r0,:64], r2 + bne 1b + bx lr + .endm + + .macro pixels8_x2 vhadd=vrhadd.u8 +1: vld1.64 {d0, 
d1}, [r1], r2 + vext.8 d1, d0, d1, #1 + vld1.64 {d2, d3}, [r1], r2 + vext.8 d3, d2, d3, #1 + pld [r1] + pld [r1, r2] + subs r3, r3, #2 + vswp d1, d2 + \vhadd q0, q0, q1 + vst1.64 {d0}, [r0,:64], r2 + vst1.64 {d1}, [r0,:64], r2 + bne 1b + bx lr + .endm + + .macro pixels8_y2 vhadd=vrhadd.u8 + vld1.64 {d0}, [r1], r2 + vld1.64 {d1}, [r1], r2 +1: subs r3, r3, #2 + \vhadd d4, d0, d1 + vld1.64 {d0}, [r1], r2 + \vhadd d5, d0, d1 + vld1.64 {d1}, [r1], r2 + pld [r1] + pld [r1, r2] + vst1.64 {d4}, [r0,:64], r2 + vst1.64 {d5}, [r0,:64], r2 + bne 1b + bx lr + .endm + + .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0 + vld1.64 {d0, d1}, [r1], r2 + vld1.64 {d2, d3}, [r1], r2 +.if \no_rnd + vmov.i16 q11, #1 +.endif + pld [r1] + pld [r1, r2] + vext.8 d4, d0, d1, #1 + vext.8 d6, d2, d3, #1 + vaddl.u8 q8, d0, d4 + vaddl.u8 q9, d2, d6 +1: subs r3, r3, #2 + vld1.64 {d0, d1}, [r1], r2 + pld [r1] + vadd.u16 q10, q8, q9 + vext.8 d4, d0, d1, #1 +.if \no_rnd + vadd.u16 q10, q10, q11 +.endif + vaddl.u8 q8, d0, d4 + \vshrn d5, q10, #2 + vld1.64 {d2, d3}, [r1], r2 + vadd.u16 q10, q8, q9 + pld [r1, r2] +.if \no_rnd + vadd.u16 q10, q10, q11 +.endif + vst1.64 {d5}, [r0,:64], r2 + \vshrn d7, q10, #2 + vext.8 d6, d2, d3, #1 + vaddl.u8 q9, d2, d6 + vst1.64 {d7}, [r0,:64], r2 + bgt 1b + bx lr + .endm + + .macro pixfunc pfx name suf rnd_op args:vararg +function ff_\pfx\name\suf\()_neon, export=1 + \name \rnd_op \args + .endfunc + .endm + + .macro pixfunc2 pfx name args:vararg + pixfunc \pfx \name + pixfunc \pfx \name \args + .endm + +function ff_put_h264_qpel16_mc00_neon, export=1 + mov r3, #16 + .endfunc + + pixfunc put_ pixels16 + pixfunc2 put_ pixels16_x2, _no_rnd, vhadd.u8 + pixfunc2 put_ pixels16_y2, _no_rnd, vhadd.u8 + pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1 + +function ff_avg_h264_qpel16_mc00_neon, export=1 + mov r3, #16 + .endfunc + + pixfunc avg_ pixels16,, 1 + +function ff_put_h264_qpel8_mc00_neon, export=1 + mov r3, #8 + .endfunc + + pixfunc put_ pixels8 + pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8 + pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8 + pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1 + +function ff_put_pixels_clamped_neon, export=1 + vld1.64 {d16-d19}, [r0,:128]! + vqmovun.s16 d0, q8 + vld1.64 {d20-d23}, [r0,:128]! + vqmovun.s16 d1, q9 + vld1.64 {d24-d27}, [r0,:128]! + vqmovun.s16 d2, q10 + vld1.64 {d28-d31}, [r0,:128]! + vqmovun.s16 d3, q11 + vst1.64 {d0}, [r1,:64], r2 + vqmovun.s16 d4, q12 + vst1.64 {d1}, [r1,:64], r2 + vqmovun.s16 d5, q13 + vst1.64 {d2}, [r1,:64], r2 + vqmovun.s16 d6, q14 + vst1.64 {d3}, [r1,:64], r2 + vqmovun.s16 d7, q15 + vst1.64 {d4}, [r1,:64], r2 + vst1.64 {d5}, [r1,:64], r2 + vst1.64 {d6}, [r1,:64], r2 + vst1.64 {d7}, [r1,:64], r2 + bx lr + .endfunc + +function ff_put_signed_pixels_clamped_neon, export=1 + vmov.u8 d31, #128 + vld1.64 {d16-d17}, [r0,:128]! + vqmovn.s16 d0, q8 + vld1.64 {d18-d19}, [r0,:128]! + vqmovn.s16 d1, q9 + vld1.64 {d16-d17}, [r0,:128]! + vqmovn.s16 d2, q8 + vld1.64 {d18-d19}, [r0,:128]! + vadd.u8 d0, d0, d31 + vld1.64 {d20-d21}, [r0,:128]! + vadd.u8 d1, d1, d31 + vld1.64 {d22-d23}, [r0,:128]! + vadd.u8 d2, d2, d31 + vst1.64 {d0}, [r1,:64], r2 + vqmovn.s16 d3, q9 + vst1.64 {d1}, [r1,:64], r2 + vqmovn.s16 d4, q10 + vst1.64 {d2}, [r1,:64], r2 + vqmovn.s16 d5, q11 + vld1.64 {d24-d25}, [r0,:128]! + vadd.u8 d3, d3, d31 + vld1.64 {d26-d27}, [r0,:128]! 
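
The pixels16/pixels8 macro family above implements the half-pel motion-compensation primitives: vrhadd.u8 is a per-byte rounded average, vhadd.u8 is the truncating variant used for the no_rnd table entries, and the _xy2 forms average four neighbours via widening adds and a narrowing shift by 2. As a rough scalar reference (the _sketch names below are illustrative, not symbols from this patch):

    #include <stdint.h>

    /* Rounded horizontal half-pel average, the scalar counterpart of the
     * vrhadd.u8 in the pixels*_x2 macros; the no_rnd variants use the
     * truncating (a + b) >> 1 instead (vhadd.u8). */
    void put_pixels8_x2_sketch(uint8_t *block, const uint8_t *pixels,
                               int line_size, int h)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < 8; x++)
                block[x] = (pixels[x] + pixels[x + 1] + 1) >> 1;
            block  += line_size;
            pixels += line_size;
        }
    }

    /* Two-dimensional half-pel average of four neighbours, matching the
     * widening adds plus the narrowing shift by 2 in the pixels*_xy2 macros. */
    void put_pixels8_xy2_sketch(uint8_t *block, const uint8_t *pixels,
                                int line_size, int h)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < 8; x++)
                block[x] = (pixels[x] + pixels[x + 1] +
                            pixels[x + line_size] + pixels[x + line_size + 1] + 2) >> 2;
            block  += line_size;
            pixels += line_size;
        }
    }
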
+ vadd.u8 d4, d4, d31 + vadd.u8 d5, d5, d31 + vst1.64 {d3}, [r1,:64], r2 + vqmovn.s16 d6, q12 + vst1.64 {d4}, [r1,:64], r2 + vqmovn.s16 d7, q13 + vst1.64 {d5}, [r1,:64], r2 + vadd.u8 d6, d6, d31 + vadd.u8 d7, d7, d31 + vst1.64 {d6}, [r1,:64], r2 + vst1.64 {d7}, [r1,:64], r2 + bx lr + .endfunc + +function ff_add_pixels_clamped_neon, export=1 + mov r3, r1 + vld1.64 {d16}, [r1,:64], r2 + vld1.64 {d0-d1}, [r0,:128]! + vaddw.u8 q0, q0, d16 + vld1.64 {d17}, [r1,:64], r2 + vld1.64 {d2-d3}, [r0,:128]! + vqmovun.s16 d0, q0 + vld1.64 {d18}, [r1,:64], r2 + vaddw.u8 q1, q1, d17 + vld1.64 {d4-d5}, [r0,:128]! + vaddw.u8 q2, q2, d18 + vst1.64 {d0}, [r3,:64], r2 + vqmovun.s16 d2, q1 + vld1.64 {d19}, [r1,:64], r2 + vld1.64 {d6-d7}, [r0,:128]! + vaddw.u8 q3, q3, d19 + vqmovun.s16 d4, q2 + vst1.64 {d2}, [r3,:64], r2 + vld1.64 {d16}, [r1,:64], r2 + vqmovun.s16 d6, q3 + vld1.64 {d0-d1}, [r0,:128]! + vaddw.u8 q0, q0, d16 + vst1.64 {d4}, [r3,:64], r2 + vld1.64 {d17}, [r1,:64], r2 + vld1.64 {d2-d3}, [r0,:128]! + vaddw.u8 q1, q1, d17 + vst1.64 {d6}, [r3,:64], r2 + vqmovun.s16 d0, q0 + vld1.64 {d18}, [r1,:64], r2 + vld1.64 {d4-d5}, [r0,:128]! + vaddw.u8 q2, q2, d18 + vst1.64 {d0}, [r3,:64], r2 + vqmovun.s16 d2, q1 + vld1.64 {d19}, [r1,:64], r2 + vqmovun.s16 d4, q2 + vld1.64 {d6-d7}, [r0,:128]! + vaddw.u8 q3, q3, d19 + vst1.64 {d2}, [r3,:64], r2 + vqmovun.s16 d6, q3 + vst1.64 {d4}, [r3,:64], r2 + vst1.64 {d6}, [r3,:64], r2 + bx lr + .endfunc + +function ff_float_to_int16_neon, export=1 + subs r2, r2, #8 + vld1.64 {d0-d1}, [r1,:128]! + vcvt.s32.f32 q8, q0, #16 + vld1.64 {d2-d3}, [r1,:128]! + vcvt.s32.f32 q9, q1, #16 + beq 3f + bics ip, r2, #15 + beq 2f +1: subs ip, ip, #16 + vshrn.s32 d4, q8, #16 + vld1.64 {d0-d1}, [r1,:128]! + vcvt.s32.f32 q0, q0, #16 + vshrn.s32 d5, q9, #16 + vld1.64 {d2-d3}, [r1,:128]! + vcvt.s32.f32 q1, q1, #16 + vshrn.s32 d6, q0, #16 + vst1.64 {d4-d5}, [r0,:128]! + vshrn.s32 d7, q1, #16 + vld1.64 {d16-d17},[r1,:128]! + vcvt.s32.f32 q8, q8, #16 + vld1.64 {d18-d19},[r1,:128]! + vcvt.s32.f32 q9, q9, #16 + vst1.64 {d6-d7}, [r0,:128]! + bne 1b + ands r2, r2, #15 + beq 3f +2: vld1.64 {d0-d1}, [r1,:128]! + vshrn.s32 d4, q8, #16 + vcvt.s32.f32 q0, q0, #16 + vld1.64 {d2-d3}, [r1,:128]! + vshrn.s32 d5, q9, #16 + vcvt.s32.f32 q1, q1, #16 + vshrn.s32 d6, q0, #16 + vst1.64 {d4-d5}, [r0,:128]! + vshrn.s32 d7, q1, #16 + vst1.64 {d6-d7}, [r0,:128]! + bx lr +3: vshrn.s32 d4, q8, #16 + vshrn.s32 d5, q9, #16 + vst1.64 {d4-d5}, [r0,:128]! + bx lr + .endfunc + +function ff_float_to_int16_interleave_neon, export=1 + cmp r3, #2 + ldrlt r1, [r1] + blt ff_float_to_int16_neon + bne 4f + + ldr r3, [r1] + ldr r1, [r1, #4] + + subs r2, r2, #8 + vld1.64 {d0-d1}, [r3,:128]! + vcvt.s32.f32 q8, q0, #16 + vld1.64 {d2-d3}, [r3,:128]! + vcvt.s32.f32 q9, q1, #16 + vld1.64 {d20-d21},[r1,:128]! + vcvt.s32.f32 q10, q10, #16 + vld1.64 {d22-d23},[r1,:128]! + vcvt.s32.f32 q11, q11, #16 + beq 3f + bics ip, r2, #15 + beq 2f +1: subs ip, ip, #16 + vld1.64 {d0-d1}, [r3,:128]! + vcvt.s32.f32 q0, q0, #16 + vsri.32 q10, q8, #16 + vld1.64 {d2-d3}, [r3,:128]! + vcvt.s32.f32 q1, q1, #16 + vld1.64 {d24-d25},[r1,:128]! + vcvt.s32.f32 q12, q12, #16 + vld1.64 {d26-d27},[r1,:128]! + vsri.32 q11, q9, #16 + vst1.64 {d20-d21},[r0,:128]! + vcvt.s32.f32 q13, q13, #16 + vst1.64 {d22-d23},[r0,:128]! + vsri.32 q12, q0, #16 + vld1.64 {d16-d17},[r3,:128]! + vsri.32 q13, q1, #16 + vst1.64 {d24-d25},[r0,:128]! + vcvt.s32.f32 q8, q8, #16 + vld1.64 {d18-d19},[r3,:128]! + vcvt.s32.f32 q9, q9, #16 + vld1.64 {d20-d21},[r1,:128]! 
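
ff_float_to_int16_neon above converts by way of a saturating float-to-Q16 fixed-point convert (vcvt.s32.f32 ..., #16) followed by a 16-bit narrowing shift, so no separate clamp is needed. A scalar sketch of the intended result is below; the fixed-point route can differ from lrintf() in the last bit, which is presumably why the init code only installs these pointers when CODEC_FLAG_BITEXACT is unset:

    #include <stdint.h>
    #include <math.h>

    /* Scalar sketch: the caller is assumed to have scaled the floats into
     * int16_t range already, so each sample only needs rounding and clamping. */
    void float_to_int16_sketch(int16_t *dst, const float *src, long len)
    {
        for (long i = 0; i < len; i++) {
            long v = lrintf(src[i]);
            if (v < -32768) v = -32768;
            if (v >  32767) v =  32767;
            dst[i] = (int16_t)v;
        }
    }
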
+ vcvt.s32.f32 q10, q10, #16 + vld1.64 {d22-d23},[r1,:128]! + vcvt.s32.f32 q11, q11, #16 + vst1.64 {d26-d27},[r0,:128]! + bne 1b + ands r2, r2, #15 + beq 3f +2: vsri.32 q10, q8, #16 + vld1.64 {d0-d1}, [r3,:128]! + vcvt.s32.f32 q0, q0, #16 + vld1.64 {d2-d3}, [r3,:128]! + vcvt.s32.f32 q1, q1, #16 + vld1.64 {d24-d25},[r1,:128]! + vcvt.s32.f32 q12, q12, #16 + vsri.32 q11, q9, #16 + vld1.64 {d26-d27},[r1,:128]! + vcvt.s32.f32 q13, q13, #16 + vst1.64 {d20-d21},[r0,:128]! + vsri.32 q12, q0, #16 + vst1.64 {d22-d23},[r0,:128]! + vsri.32 q13, q1, #16 + vst1.64 {d24-d27},[r0,:128]! + bx lr +3: vsri.32 q10, q8, #16 + vsri.32 q11, q9, #16 + vst1.64 {d20-d23},[r0,:128]! + bx lr + +4: push {r4-r8,lr} + cmp r3, #4 + lsl ip, r3, #1 + blt 4f + + @ 4 channels +5: ldmia r1!, {r4-r7} + mov lr, r2 + mov r8, r0 + vld1.64 {d16-d17},[r4,:128]! + vcvt.s32.f32 q8, q8, #16 + vld1.64 {d18-d19},[r5,:128]! + vcvt.s32.f32 q9, q9, #16 + vld1.64 {d20-d21},[r6,:128]! + vcvt.s32.f32 q10, q10, #16 + vld1.64 {d22-d23},[r7,:128]! + vcvt.s32.f32 q11, q11, #16 +6: subs lr, lr, #8 + vld1.64 {d0-d1}, [r4,:128]! + vcvt.s32.f32 q0, q0, #16 + vsri.32 q9, q8, #16 + vld1.64 {d2-d3}, [r5,:128]! + vcvt.s32.f32 q1, q1, #16 + vsri.32 q11, q10, #16 + vld1.64 {d4-d5}, [r6,:128]! + vcvt.s32.f32 q2, q2, #16 + vzip.32 d18, d22 + vld1.64 {d6-d7}, [r7,:128]! + vcvt.s32.f32 q3, q3, #16 + vzip.32 d19, d23 + vst1.64 {d18}, [r8], ip + vsri.32 q1, q0, #16 + vst1.64 {d22}, [r8], ip + vsri.32 q3, q2, #16 + vst1.64 {d19}, [r8], ip + vzip.32 d2, d6 + vst1.64 {d23}, [r8], ip + vzip.32 d3, d7 + beq 7f + vld1.64 {d16-d17},[r4,:128]! + vcvt.s32.f32 q8, q8, #16 + vst1.64 {d2}, [r8], ip + vld1.64 {d18-d19},[r5,:128]! + vcvt.s32.f32 q9, q9, #16 + vst1.64 {d6}, [r8], ip + vld1.64 {d20-d21},[r6,:128]! + vcvt.s32.f32 q10, q10, #16 + vst1.64 {d3}, [r8], ip + vld1.64 {d22-d23},[r7,:128]! + vcvt.s32.f32 q11, q11, #16 + vst1.64 {d7}, [r8], ip + b 6b +7: vst1.64 {d2}, [r8], ip + vst1.64 {d6}, [r8], ip + vst1.64 {d3}, [r8], ip + vst1.64 {d7}, [r8], ip + subs r3, r3, #4 + popeq {r4-r8,pc} + cmp r3, #4 + add r0, r0, #8 + bge 5b + + @ 2 channels +4: cmp r3, #2 + blt 4f + ldmia r1!, {r4-r5} + mov lr, r2 + mov r8, r0 + tst lr, #8 + vld1.64 {d16-d17},[r4,:128]! + vcvt.s32.f32 q8, q8, #16 + vld1.64 {d18-d19},[r5,:128]! + vcvt.s32.f32 q9, q9, #16 + vld1.64 {d20-d21},[r4,:128]! + vcvt.s32.f32 q10, q10, #16 + vld1.64 {d22-d23},[r5,:128]! + vcvt.s32.f32 q11, q11, #16 + beq 6f + subs lr, lr, #8 + beq 7f + vsri.32 d18, d16, #16 + vsri.32 d19, d17, #16 + vld1.64 {d16-d17},[r4,:128]! + vcvt.s32.f32 q8, q8, #16 + vst1.32 {d18[0]}, [r8], ip + vsri.32 d22, d20, #16 + vst1.32 {d18[1]}, [r8], ip + vsri.32 d23, d21, #16 + vst1.32 {d19[0]}, [r8], ip + vst1.32 {d19[1]}, [r8], ip + vld1.64 {d18-d19},[r5,:128]! + vcvt.s32.f32 q9, q9, #16 + vst1.32 {d22[0]}, [r8], ip + vst1.32 {d22[1]}, [r8], ip + vld1.64 {d20-d21},[r4,:128]! + vcvt.s32.f32 q10, q10, #16 + vst1.32 {d23[0]}, [r8], ip + vst1.32 {d23[1]}, [r8], ip + vld1.64 {d22-d23},[r5,:128]! + vcvt.s32.f32 q11, q11, #16 +6: subs lr, lr, #16 + vld1.64 {d0-d1}, [r4,:128]! + vcvt.s32.f32 q0, q0, #16 + vsri.32 d18, d16, #16 + vld1.64 {d2-d3}, [r5,:128]! + vcvt.s32.f32 q1, q1, #16 + vsri.32 d19, d17, #16 + vld1.64 {d4-d5}, [r4,:128]! + vcvt.s32.f32 q2, q2, #16 + vld1.64 {d6-d7}, [r5,:128]! 
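
The interleave variant above takes an array of per-channel planar float buffers and writes channel-interleaved int16 samples; the dedicated 1-, 2- and 4-channel paths essentially only change how the stores are strided. A scalar sketch (helper name illustrative):

    #include <stdint.h>
    #include <math.h>

    /* src holds 'channels' pointers to planar float data; output is written
     * interleaved, one sample per channel per output frame. */
    void float_to_int16_interleave_sketch(int16_t *dst, const float **src,
                                          long len, int channels)
    {
        for (long i = 0; i < len; i++)
            for (int ch = 0; ch < channels; ch++) {
                long v = lrintf(src[ch][i]);
                if (v < -32768) v = -32768;
                if (v >  32767) v =  32767;
                dst[i * channels + ch] = (int16_t)v;
            }
    }
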
+ vcvt.s32.f32 q3, q3, #16 + vst1.32 {d18[0]}, [r8], ip + vsri.32 d22, d20, #16 + vst1.32 {d18[1]}, [r8], ip + vsri.32 d23, d21, #16 + vst1.32 {d19[0]}, [r8], ip + vsri.32 d2, d0, #16 + vst1.32 {d19[1]}, [r8], ip + vsri.32 d3, d1, #16 + vst1.32 {d22[0]}, [r8], ip + vsri.32 d6, d4, #16 + vst1.32 {d22[1]}, [r8], ip + vsri.32 d7, d5, #16 + vst1.32 {d23[0]}, [r8], ip + vst1.32 {d23[1]}, [r8], ip + beq 6f + vld1.64 {d16-d17},[r4,:128]! + vcvt.s32.f32 q8, q8, #16 + vst1.32 {d2[0]}, [r8], ip + vst1.32 {d2[1]}, [r8], ip + vld1.64 {d18-d19},[r5,:128]! + vcvt.s32.f32 q9, q9, #16 + vst1.32 {d3[0]}, [r8], ip + vst1.32 {d3[1]}, [r8], ip + vld1.64 {d20-d21},[r4,:128]! + vcvt.s32.f32 q10, q10, #16 + vst1.32 {d6[0]}, [r8], ip + vst1.32 {d6[1]}, [r8], ip + vld1.64 {d22-d23},[r5,:128]! + vcvt.s32.f32 q11, q11, #16 + vst1.32 {d7[0]}, [r8], ip + vst1.32 {d7[1]}, [r8], ip + bgt 6b +6: vst1.32 {d2[0]}, [r8], ip + vst1.32 {d2[1]}, [r8], ip + vst1.32 {d3[0]}, [r8], ip + vst1.32 {d3[1]}, [r8], ip + vst1.32 {d6[0]}, [r8], ip + vst1.32 {d6[1]}, [r8], ip + vst1.32 {d7[0]}, [r8], ip + vst1.32 {d7[1]}, [r8], ip + b 8f +7: vsri.32 d18, d16, #16 + vsri.32 d19, d17, #16 + vst1.32 {d18[0]}, [r8], ip + vsri.32 d22, d20, #16 + vst1.32 {d18[1]}, [r8], ip + vsri.32 d23, d21, #16 + vst1.32 {d19[0]}, [r8], ip + vst1.32 {d19[1]}, [r8], ip + vst1.32 {d22[0]}, [r8], ip + vst1.32 {d22[1]}, [r8], ip + vst1.32 {d23[0]}, [r8], ip + vst1.32 {d23[1]}, [r8], ip +8: subs r3, r3, #2 + add r0, r0, #4 + popeq {r4-r8,pc} + + @ 1 channel +4: ldr r4, [r1],#4 + tst r2, #8 + mov lr, r2 + mov r5, r0 + vld1.64 {d0-d1}, [r4,:128]! + vcvt.s32.f32 q0, q0, #16 + vld1.64 {d2-d3}, [r4,:128]! + vcvt.s32.f32 q1, q1, #16 + bne 8f +6: subs lr, lr, #16 + vld1.64 {d4-d5}, [r4,:128]! + vcvt.s32.f32 q2, q2, #16 + vld1.64 {d6-d7}, [r4,:128]! + vcvt.s32.f32 q3, q3, #16 + vst1.16 {d0[1]}, [r5,:16], ip + vst1.16 {d0[3]}, [r5,:16], ip + vst1.16 {d1[1]}, [r5,:16], ip + vst1.16 {d1[3]}, [r5,:16], ip + vst1.16 {d2[1]}, [r5,:16], ip + vst1.16 {d2[3]}, [r5,:16], ip + vst1.16 {d3[1]}, [r5,:16], ip + vst1.16 {d3[3]}, [r5,:16], ip + beq 7f + vld1.64 {d0-d1}, [r4,:128]! + vcvt.s32.f32 q0, q0, #16 + vld1.64 {d2-d3}, [r4,:128]! + vcvt.s32.f32 q1, q1, #16 +7: vst1.16 {d4[1]}, [r5,:16], ip + vst1.16 {d4[3]}, [r5,:16], ip + vst1.16 {d5[1]}, [r5,:16], ip + vst1.16 {d5[3]}, [r5,:16], ip + vst1.16 {d6[1]}, [r5,:16], ip + vst1.16 {d6[3]}, [r5,:16], ip + vst1.16 {d7[1]}, [r5,:16], ip + vst1.16 {d7[3]}, [r5,:16], ip + bgt 6b + pop {r4-r8,pc} +8: subs lr, lr, #8 + vst1.16 {d0[1]}, [r5,:16], ip + vst1.16 {d0[3]}, [r5,:16], ip + vst1.16 {d1[1]}, [r5,:16], ip + vst1.16 {d1[3]}, [r5,:16], ip + vst1.16 {d2[1]}, [r5,:16], ip + vst1.16 {d2[3]}, [r5,:16], ip + vst1.16 {d3[1]}, [r5,:16], ip + vst1.16 {d3[3]}, [r5,:16], ip + popeq {r4-r8,pc} + vld1.64 {d0-d1}, [r4,:128]! + vcvt.s32.f32 q0, q0, #16 + vld1.64 {d2-d3}, [r4,:128]! + vcvt.s32.f32 q1, q1, #16 + b 6b + .endfunc + +function ff_vector_fmul_neon, export=1 + mov r3, r0 + subs r2, r2, #8 + vld1.64 {d0-d3}, [r0,:128]! + vld1.64 {d4-d7}, [r1,:128]! + vmul.f32 q8, q0, q2 + vmul.f32 q9, q1, q3 + beq 3f + bics ip, r2, #15 + beq 2f +1: subs ip, ip, #16 + vld1.64 {d0-d1}, [r0,:128]! + vld1.64 {d4-d5}, [r1,:128]! + vmul.f32 q10, q0, q2 + vld1.64 {d2-d3}, [r0,:128]! + vld1.64 {d6-d7}, [r1,:128]! + vmul.f32 q11, q1, q3 + vst1.64 {d16-d19},[r3,:128]! + vld1.64 {d0-d1}, [r0,:128]! + vld1.64 {d4-d5}, [r1,:128]! + vmul.f32 q8, q0, q2 + vld1.64 {d2-d3}, [r0,:128]! + vld1.64 {d6-d7}, [r1,:128]! + vmul.f32 q9, q1, q3 + vst1.64 {d20-d23},[r3,:128]! 
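
ff_vector_fmul_neon above is an element-wise product with dst doubling as the first operand (the loop reads from r0 and r1 and writes back through a copy of r0), processed in blocks of 8 and 16 floats, so callers are assumed to pass a length that is a multiple of 8. A scalar sketch:

    /* In-place element-wise multiply, the scalar equivalent of the
     * vmul.f32 loop above. */
    void vector_fmul_sketch(float *dst, const float *src, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] *= src[i];
    }

ff_vector_fmul_reverse_neon and ff_vector_fmul_add_neon, defined a little further down, follow the same pattern with the second source walked backwards (dst[i] = src0[i] * src1[len-1-i]) or a third addend folded in.
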
+ bne 1b + ands r2, r2, #15 + beq 3f +2: vld1.64 {d0-d1}, [r0,:128]! + vld1.64 {d4-d5}, [r1,:128]! + vst1.64 {d16-d17},[r3,:128]! + vmul.f32 q8, q0, q2 + vld1.64 {d2-d3}, [r0,:128]! + vld1.64 {d6-d7}, [r1,:128]! + vst1.64 {d18-d19},[r3,:128]! + vmul.f32 q9, q1, q3 +3: vst1.64 {d16-d19},[r3,:128]! + bx lr + .endfunc + +function ff_vector_fmul_window_neon, export=1 +VFP vdup.32 q8, d0[0] +NOVFP vld1.32 {d16[],d17[]}, [sp,:32] + push {r4,r5,lr} +VFP ldr lr, [sp, #12] +NOVFP ldr lr, [sp, #16] + sub r2, r2, #8 + sub r5, lr, #2 + add r2, r2, r5, lsl #2 + add r4, r3, r5, lsl #3 + add ip, r0, r5, lsl #3 + mov r5, #-16 + vld1.64 {d0,d1}, [r1,:128]! + vld1.64 {d2,d3}, [r2,:128], r5 + vld1.64 {d4,d5}, [r3,:128]! + vld1.64 {d6,d7}, [r4,:128], r5 +1: subs lr, lr, #4 + vmov q11, q8 + vmla.f32 d22, d0, d4 + vmov q10, q8 + vmla.f32 d23, d1, d5 + vrev64.32 q3, q3 + vmla.f32 d20, d0, d7 + vrev64.32 q1, q1 + vmla.f32 d21, d1, d6 + beq 2f + vmla.f32 d22, d3, d7 + vld1.64 {d0,d1}, [r1,:128]! + vmla.f32 d23, d2, d6 + vld1.64 {d18,d19},[r2,:128], r5 + vmls.f32 d20, d3, d4 + vld1.64 {d24,d25},[r3,:128]! + vmls.f32 d21, d2, d5 + vld1.64 {d6,d7}, [r4,:128], r5 + vmov q1, q9 + vrev64.32 q11, q11 + vmov q2, q12 + vswp d22, d23 + vst1.64 {d20,d21},[r0,:128]! + vst1.64 {d22,d23},[ip,:128], r5 + b 1b +2: vmla.f32 d22, d3, d7 + vmla.f32 d23, d2, d6 + vmls.f32 d20, d3, d4 + vmls.f32 d21, d2, d5 + vrev64.32 q11, q11 + vswp d22, d23 + vst1.64 {d20,d21},[r0,:128]! + vst1.64 {d22,d23},[ip,:128], r5 + pop {r4,r5,pc} + .endfunc + +#if CONFIG_VORBIS_DECODER +function ff_vorbis_inverse_coupling_neon, export=1 + vmov.i32 q10, #1<<31 + subs r2, r2, #4 + mov r3, r0 + mov r12, r1 + beq 3f + + vld1.32 {d24-d25},[r1,:128]! + vld1.32 {d22-d23},[r0,:128]! + vcle.s32 q8, q12, #0 + vand q9, q11, q10 + veor q12, q12, q9 + vand q2, q12, q8 + vbic q3, q12, q8 + vadd.f32 q12, q11, q2 + vsub.f32 q11, q11, q3 +1: vld1.32 {d2-d3}, [r1,:128]! + vld1.32 {d0-d1}, [r0,:128]! + vcle.s32 q8, q1, #0 + vand q9, q0, q10 + veor q1, q1, q9 + vst1.32 {d24-d25},[r3, :128]! + vst1.32 {d22-d23},[r12,:128]! + vand q2, q1, q8 + vbic q3, q1, q8 + vadd.f32 q1, q0, q2 + vsub.f32 q0, q0, q3 + subs r2, r2, #8 + ble 2f + vld1.32 {d24-d25},[r1,:128]! + vld1.32 {d22-d23},[r0,:128]! + vcle.s32 q8, q12, #0 + vand q9, q11, q10 + veor q12, q12, q9 + vst1.32 {d2-d3}, [r3, :128]! + vst1.32 {d0-d1}, [r12,:128]! + vand q2, q12, q8 + vbic q3, q12, q8 + vadd.f32 q12, q11, q2 + vsub.f32 q11, q11, q3 + b 1b + +2: vst1.32 {d2-d3}, [r3, :128]! + vst1.32 {d0-d1}, [r12,:128]! + bxlt lr + +3: vld1.32 {d2-d3}, [r1,:128] + vld1.32 {d0-d1}, [r0,:128] + vcle.s32 q8, q1, #0 + vand q9, q0, q10 + veor q1, q1, q9 + vand q2, q1, q8 + vbic q3, q1, q8 + vadd.f32 q1, q0, q2 + vsub.f32 q0, q0, q3 + vst1.32 {d2-d3}, [r0,:128]! + vst1.32 {d0-d1}, [r1,:128]! + bx lr + .endfunc +#endif + +function ff_vector_fmul_scalar_neon, export=1 +VFP len .req r2 +NOVFP len .req r3 +VFP vdup.32 q8, d0[0] +NOVFP vdup.32 q8, r2 + bics r12, len, #15 + beq 3f + vld1.32 {q0},[r1,:128]! + vld1.32 {q1},[r1,:128]! +1: vmul.f32 q0, q0, q8 + vld1.32 {q2},[r1,:128]! + vmul.f32 q1, q1, q8 + vld1.32 {q3},[r1,:128]! + vmul.f32 q2, q2, q8 + vst1.32 {q0},[r0,:128]! + vmul.f32 q3, q3, q8 + vst1.32 {q1},[r0,:128]! + subs r12, r12, #16 + beq 2f + vld1.32 {q0},[r1,:128]! + vst1.32 {q2},[r0,:128]! + vld1.32 {q1},[r1,:128]! + vst1.32 {q3},[r0,:128]! + b 1b +2: vst1.32 {q2},[r0,:128]! + vst1.32 {q3},[r0,:128]! + ands len, len, #15 + bxeq lr +3: vld1.32 {q0},[r1,:128]! + vmul.f32 q0, q0, q8 + vst1.32 {q0},[r0,:128]! 
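
ff_vector_fmul_window_neon above performs the windowed overlap-add used by the MDCT-based audio decoders: src0 is walked forwards, src1 and the second half of the window backwards, and the outputs are written from both ends towards the middle. A scalar sketch of the arithmetic as read from the assembly, assuming a 2*len-point window and 2*len output samples:

    /* src0 and src1 each supply len samples, win has 2*len points and
     * 2*len outputs are produced; add_bias is simply added to every sample. */
    void vector_fmul_window_sketch(float *dst, const float *src0,
                                   const float *src1, const float *win,
                                   float add_bias, int len)
    {
        for (int i = 0; i < len; i++) {
            float s0 = src0[i];
            float s1 = src1[len - 1 - i];
            dst[i]               = s0 * win[2 * len - 1 - i] - s1 * win[i] + add_bias;
            dst[2 * len - 1 - i] = s0 * win[i] + s1 * win[2 * len - 1 - i] + add_bias;
        }
    }
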
+ subs len, len, #4 + bgt 3b + bx lr + .unreq len + .endfunc + +function ff_vector_fmul_sv_scalar_2_neon, export=1 +VFP vdup.32 d16, d0[0] +NOVFP vdup.32 d16, r3 +NOVFP ldr r3, [sp] + vld1.32 {d0},[r1,:64]! + vld1.32 {d1},[r1,:64]! +1: subs r3, r3, #4 + vmul.f32 d4, d0, d16 + vmul.f32 d5, d1, d16 + ldr r12, [r2], #4 + vld1.32 {d2},[r12,:64] + ldr r12, [r2], #4 + vld1.32 {d3},[r12,:64] + vmul.f32 d4, d4, d2 + vmul.f32 d5, d5, d3 + beq 2f + vld1.32 {d0},[r1,:64]! + vld1.32 {d1},[r1,:64]! + vst1.32 {d4},[r0,:64]! + vst1.32 {d5},[r0,:64]! + b 1b +2: vst1.32 {d4},[r0,:64]! + vst1.32 {d5},[r0,:64]! + bx lr + .endfunc + +function ff_vector_fmul_sv_scalar_4_neon, export=1 +VFP vdup.32 q10, d0[0] +NOVFP vdup.32 q10, r3 +NOVFP ldr r3, [sp] + push {lr} + bics lr, r3, #7 + beq 3f + vld1.32 {q0},[r1,:128]! + vld1.32 {q2},[r1,:128]! +1: ldr r12, [r2], #4 + vld1.32 {q1},[r12,:128] + ldr r12, [r2], #4 + vld1.32 {q3},[r12,:128] + vmul.f32 q8, q0, q10 + vmul.f32 q8, q8, q1 + vmul.f32 q9, q2, q10 + vmul.f32 q9, q9, q3 + subs lr, lr, #8 + beq 2f + vld1.32 {q0},[r1,:128]! + vld1.32 {q2},[r1,:128]! + vst1.32 {q8},[r0,:128]! + vst1.32 {q9},[r0,:128]! + b 1b +2: vst1.32 {q8},[r0,:128]! + vst1.32 {q9},[r0,:128]! + ands r3, r3, #7 + popeq {pc} +3: vld1.32 {q0},[r1,:128]! + ldr r12, [r2], #4 + vld1.32 {q1},[r12,:128] + vmul.f32 q0, q0, q10 + vmul.f32 q0, q0, q1 + vst1.32 {q0},[r0,:128]! + subs r3, r3, #4 + bgt 3b + pop {pc} + .endfunc + +function ff_sv_fmul_scalar_2_neon, export=1 +VFP len .req r2 +NOVFP len .req r3 +VFP vdup.32 q8, d0[0] +NOVFP vdup.32 q8, r2 + ldr r12, [r1], #4 + vld1.32 {d0},[r12,:64] + ldr r12, [r1], #4 + vld1.32 {d1},[r12,:64] +1: vmul.f32 q1, q0, q8 + subs len, len, #4 + beq 2f + ldr r12, [r1], #4 + vld1.32 {d0},[r12,:64] + ldr r12, [r1], #4 + vld1.32 {d1},[r12,:64] + vst1.32 {q1},[r0,:128]! + b 1b +2: vst1.32 {q1},[r0,:128]! + bx lr + .unreq len + .endfunc + +function ff_sv_fmul_scalar_4_neon, export=1 +VFP len .req r2 +NOVFP len .req r3 +VFP vdup.32 q8, d0[0] +NOVFP vdup.32 q8, r2 +1: ldr r12, [r1], #4 + vld1.32 {q0},[r12,:128] + vmul.f32 q0, q0, q8 + vst1.32 {q0},[r0,:128]! + subs len, len, #4 + bgt 1b + bx lr + .unreq len + .endfunc + +function ff_butterflies_float_neon, export=1 +1: vld1.32 {q0},[r0,:128] + vld1.32 {q1},[r1,:128] + vsub.f32 q2, q0, q1 + vadd.f32 q1, q0, q1 + vst1.32 {q2},[r1,:128]! + vst1.32 {q1},[r0,:128]! + subs r2, r2, #4 + bgt 1b + bx lr + .endfunc + +function ff_scalarproduct_float_neon, export=1 + vmov.f32 q2, #0.0 +1: vld1.32 {q0},[r0,:128]! + vld1.32 {q1},[r1,:128]! + vmla.f32 q2, q0, q1 + subs r2, r2, #4 + bgt 1b + vadd.f32 d0, d4, d5 + vpadd.f32 d0, d0, d0 +NOVFP vmov.32 r0, d0[0] + bx lr + .endfunc + +function ff_int32_to_float_fmul_scalar_neon, export=1 +VFP vdup.32 q0, d0[0] +VFP len .req r2 +NOVFP vdup.32 q0, r2 +NOVFP len .req r3 + + vld1.32 {q1},[r1,:128]! + vcvt.f32.s32 q3, q1 + vld1.32 {q2},[r1,:128]! + vcvt.f32.s32 q8, q2 +1: subs len, len, #8 + pld [r1, #16] + vmul.f32 q9, q3, q0 + vmul.f32 q10, q8, q0 + beq 2f + vld1.32 {q1},[r1,:128]! + vcvt.f32.s32 q3, q1 + vld1.32 {q2},[r1,:128]! + vcvt.f32.s32 q8, q2 + vst1.32 {q9}, [r0,:128]! + vst1.32 {q10},[r0,:128]! + b 1b +2: vst1.32 {q9}, [r0,:128]! + vst1.32 {q10},[r0,:128]! + bx lr + .unreq len + .endfunc + +function ff_vector_fmul_reverse_neon, export=1 + add r2, r2, r3, lsl #2 + sub r2, r2, #32 + mov r12, #-32 + vld1.32 {q0-q1}, [r1,:128]! 
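
Several of the remaining float helpers defined above reduce to one-line loops; scalar sketches derived from their prototypes and loop bodies (names are illustrative only):

    /* Matches ff_butterflies_float_neon: the difference goes back to v2,
     * the sum back to v1. */
    void butterflies_float_sketch(float *v1, float *v2, int len)
    {
        for (int i = 0; i < len; i++) {
            float t = v1[i] - v2[i];
            v1[i]  += v2[i];
            v2[i]   = t;
        }
    }

    /* Matches ff_scalarproduct_float_neon; the NEON loop keeps four
     * partial sums in a q register and reduces them at the end. */
    float scalarproduct_float_sketch(const float *v1, const float *v2, int len)
    {
        float sum = 0.0f;
        for (int i = 0; i < len; i++)
            sum += v1[i] * v2[i];
        return sum;
    }

    /* Matches ff_int32_to_float_fmul_scalar_neon: vcvt.f32.s32 followed
     * by a vmul.f32 against the broadcast scalar. */
    void int32_to_float_fmul_scalar_sketch(float *dst, const int *src,
                                           float mul, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] = src[i] * mul;
    }
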
+ vld1.32 {q2-q3}, [r2,:128], r12 +1: pld [r1, #32] + vrev64.32 q3, q3 + vmul.f32 d16, d0, d7 + vmul.f32 d17, d1, d6 + pld [r2, #-32] + vrev64.32 q2, q2 + vmul.f32 d18, d2, d5 + vmul.f32 d19, d3, d4 + subs r3, r3, #8 + beq 2f + vld1.32 {q0-q1}, [r1,:128]! + vld1.32 {q2-q3}, [r2,:128], r12 + vst1.32 {q8-q9}, [r0,:128]! + b 1b +2: vst1.32 {q8-q9}, [r0,:128]! + bx lr + .endfunc + +function ff_vector_fmul_add_neon, export=1 + ldr r12, [sp] + vld1.32 {q0-q1}, [r1,:128]! + vld1.32 {q8-q9}, [r2,:128]! + vld1.32 {q2-q3}, [r3,:128]! + vmul.f32 q10, q0, q8 + vmul.f32 q11, q1, q9 +1: vadd.f32 q12, q2, q10 + vadd.f32 q13, q3, q11 + pld [r1, #16] + pld [r2, #16] + pld [r3, #16] + subs r12, r12, #8 + beq 2f + vld1.32 {q0}, [r1,:128]! + vld1.32 {q8}, [r2,:128]! + vmul.f32 q10, q0, q8 + vld1.32 {q1}, [r1,:128]! + vld1.32 {q9}, [r2,:128]! + vmul.f32 q11, q1, q9 + vld1.32 {q2-q3}, [r3,:128]! + vst1.32 {q12-q13},[r0,:128]! + b 1b +2: vst1.32 {q12-q13},[r0,:128]! + bx lr + .endfunc + +function ff_vector_clipf_neon, export=1 +VFP vdup.32 q1, d0[1] +VFP vdup.32 q0, d0[0] +NOVFP vdup.32 q0, r2 +NOVFP vdup.32 q1, r3 +NOVFP ldr r2, [sp] + vld1.f32 {q2},[r1,:128]! + vmin.f32 q10, q2, q1 + vld1.f32 {q3},[r1,:128]! + vmin.f32 q11, q3, q1 +1: vmax.f32 q8, q10, q0 + vmax.f32 q9, q11, q0 + subs r2, r2, #8 + beq 2f + vld1.f32 {q2},[r1,:128]! + vmin.f32 q10, q2, q1 + vld1.f32 {q3},[r1,:128]! + vmin.f32 q11, q3, q1 + vst1.f32 {q8},[r0,:128]! + vst1.f32 {q9},[r0,:128]! + b 1b +2: vst1.f32 {q8},[r0,:128]! + vst1.f32 {q9},[r0,:128]! + bx lr + .endfunc diff -r 48be79afc72d -r b72bb442a775 arm/dsputil_neon.c --- a/arm/dsputil_neon.c Sun Oct 04 13:12:55 2009 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,340 +0,0 @@ -/* - * ARM NEON optimised DSP functions - * Copyright (c) 2008 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include - -#include "libavcodec/avcodec.h" -#include "libavcodec/dsputil.h" -#include "dsputil_arm.h" - -void ff_simple_idct_neon(DCTELEM *data); -void ff_simple_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data); -void ff_simple_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data); - -void ff_vp3_idct_neon(DCTELEM *data); -void ff_vp3_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data); -void ff_vp3_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data); - -void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels8_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); - -void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, int, int); - -void ff_add_pixels_clamped_neon(const DCTELEM *, uint8_t *, int); -void ff_put_pixels_clamped_neon(const DCTELEM *, uint8_t *, int); -void ff_put_signed_pixels_clamped_neon(const DCTELEM *, uint8_t *, int); - -void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, int); - -void ff_put_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc02_neon(uint8_t *, uint8_t 
*, int); -void ff_put_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, int); - -void ff_avg_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int); - -void ff_put_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); -void ff_put_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); - -void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); -void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); - -void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, - int beta, int8_t *tc0); -void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, - int beta, int8_t *tc0); -void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, - int beta, int8_t *tc0); -void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, - int beta, int8_t *tc0); - -void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); - -void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); - -void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride); -void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride); -void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset, - DCTELEM *block, int stride, - const uint8_t nnzc[6*8]); -void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset, - DCTELEM *block, int stride, - const uint8_t 
nnzc[6*8]); -void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset, - DCTELEM *block, int stride, - const uint8_t nnzc[6*8]); - -void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *); -void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *); - -void ff_vector_fmul_neon(float *dst, const float *src, int len); -void ff_vector_fmul_window_neon(float *dst, const float *src0, - const float *src1, const float *win, - float add_bias, int len); -void ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul, - int len); -void ff_vector_fmul_sv_scalar_2_neon(float *dst, const float *src, - const float **vp, float mul, int len); -void ff_vector_fmul_sv_scalar_4_neon(float *dst, const float *src, - const float **vp, float mul, int len); -void ff_sv_fmul_scalar_2_neon(float *dst, const float **vp, float mul, - int len); -void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul, - int len); -void ff_butterflies_float_neon(float *v1, float *v2, int len); -float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len); -void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src, - float mul, int len); -void ff_vector_fmul_reverse_neon(float *dst, const float *src0, - const float *src1, int len); -void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1, - const float *src2, int len); - -void ff_vector_clipf_neon(float *dst, const float *src, float min, float max, - int len); -void ff_float_to_int16_neon(int16_t *, const float *, long); -void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); - -void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize); - -void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) -{ - if (!avctx->lowres) { - if (avctx->idct_algo == FF_IDCT_AUTO || - avctx->idct_algo == FF_IDCT_SIMPLENEON) { - c->idct_put= ff_simple_idct_put_neon; - c->idct_add= ff_simple_idct_add_neon; - c->idct = ff_simple_idct_neon; - c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM; - } else if ((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || - CONFIG_VP6_DECODER) && - avctx->idct_algo == FF_IDCT_VP3) { - c->idct_put= ff_vp3_idct_put_neon; - c->idct_add= ff_vp3_idct_add_neon; - c->idct = ff_vp3_idct_neon; - c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; - } - } - - c->put_pixels_tab[0][0] = ff_put_pixels16_neon; - c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon; - c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon; - c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon; - c->put_pixels_tab[1][0] = ff_put_pixels8_neon; - c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon; - c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon; - c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon; - - c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon; - c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon; - c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon; - c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon; - c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon; - c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon; - c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon; - c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon; - - c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon; - - c->add_pixels_clamped = ff_add_pixels_clamped_neon; - c->put_pixels_clamped = ff_put_pixels_clamped_neon; - c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon; - - if (CONFIG_H264_DECODER) { - c->put_h264_chroma_pixels_tab[0] = 
ff_put_h264_chroma_mc8_neon; - c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon; - - c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon; - c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon; - - c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon; - c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon; - c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon; - c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon; - c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon; - c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon; - c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon; - c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon; - c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon; - c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon; - c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon; - c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon; - c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon; - c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon; - c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon; - c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon; - - c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon; - c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon; - c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon; - c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon; - c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon; - c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon; - c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon; - c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon; - c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon; - c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon; - c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon; - c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon; - c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon; - c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon; - c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_neon; - c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon; - - c->avg_h264_qpel_pixels_tab[0][ 0] = ff_avg_h264_qpel16_mc00_neon; - - c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon; - c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon; - c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; - c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; - - c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon; - c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon; - c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon; - c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon; - c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon; - c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon; - c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon; - c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon; - - c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon; - c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon; - c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon; - c->biweight_h264_pixels_tab[3] = 
ff_biweight_h264_pixels_8x8_neon; - c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon; - c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon; - c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon; - c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon; - - c->h264_idct_add = ff_h264_idct_add_neon; - c->h264_idct_dc_add = ff_h264_idct_dc_add_neon; - c->h264_idct_add16 = ff_h264_idct_add16_neon; - c->h264_idct_add16intra = ff_h264_idct_add16intra_neon; - c->h264_idct_add8 = ff_h264_idct_add8_neon; - } - - if (CONFIG_VP3_DECODER) { - c->vp3_v_loop_filter = ff_vp3_v_loop_filter_neon; - c->vp3_h_loop_filter = ff_vp3_h_loop_filter_neon; - } - - c->vector_fmul = ff_vector_fmul_neon; - c->vector_fmul_window = ff_vector_fmul_window_neon; - c->vector_fmul_scalar = ff_vector_fmul_scalar_neon; - c->butterflies_float = ff_butterflies_float_neon; - c->scalarproduct_float = ff_scalarproduct_float_neon; - c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon; - c->vector_fmul_reverse = ff_vector_fmul_reverse_neon; - c->vector_fmul_add = ff_vector_fmul_add_neon; - - c->vector_fmul_sv_scalar[0] = ff_vector_fmul_sv_scalar_2_neon; - c->vector_fmul_sv_scalar[1] = ff_vector_fmul_sv_scalar_4_neon; - - c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon; - c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon; - - c->vector_clipf = ff_vector_clipf_neon; - - if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { - c->float_to_int16 = ff_float_to_int16_neon; - c->float_to_int16_interleave = ff_float_to_int16_interleave_neon; - } - - if (CONFIG_VORBIS_DECODER) - c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon; -} diff -r 48be79afc72d -r b72bb442a775 arm/dsputil_neon_s.S --- a/arm/dsputil_neon_s.S Sun Oct 04 13:12:55 2009 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1129 +0,0 @@ -/* - * ARM NEON optimised DSP functions - * Copyright (c) 2008 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "asm.S" - - preserve8 - .text - - .macro pixels16 avg=0 -.if \avg - mov ip, r0 -.endif -1: vld1.64 {d0, d1}, [r1], r2 - vld1.64 {d2, d3}, [r1], r2 - vld1.64 {d4, d5}, [r1], r2 - pld [r1, r2, lsl #2] - vld1.64 {d6, d7}, [r1], r2 - pld [r1] - pld [r1, r2] - pld [r1, r2, lsl #1] -.if \avg - vld1.64 {d16,d17}, [ip,:128], r2 - vrhadd.u8 q0, q0, q8 - vld1.64 {d18,d19}, [ip,:128], r2 - vrhadd.u8 q1, q1, q9 - vld1.64 {d20,d21}, [ip,:128], r2 - vrhadd.u8 q2, q2, q10 - vld1.64 {d22,d23}, [ip,:128], r2 - vrhadd.u8 q3, q3, q11 -.endif - subs r3, r3, #4 - vst1.64 {d0, d1}, [r0,:128], r2 - vst1.64 {d2, d3}, [r0,:128], r2 - vst1.64 {d4, d5}, [r0,:128], r2 - vst1.64 {d6, d7}, [r0,:128], r2 - bne 1b - bx lr - .endm - - .macro pixels16_x2 vhadd=vrhadd.u8 -1: vld1.64 {d0-d2}, [r1], r2 - vld1.64 {d4-d6}, [r1], r2 - pld [r1] - pld [r1, r2] - subs r3, r3, #2 - vext.8 q1, q0, q1, #1 - \vhadd q0, q0, q1 - vext.8 q3, q2, q3, #1 - \vhadd q2, q2, q3 - vst1.64 {d0, d1}, [r0,:128], r2 - vst1.64 {d4, d5}, [r0,:128], r2 - bne 1b - bx lr - .endm - - .macro pixels16_y2 vhadd=vrhadd.u8 - vld1.64 {d0, d1}, [r1], r2 - vld1.64 {d2, d3}, [r1], r2 -1: subs r3, r3, #2 - \vhadd q2, q0, q1 - vld1.64 {d0, d1}, [r1], r2 - \vhadd q3, q0, q1 - vld1.64 {d2, d3}, [r1], r2 - pld [r1] - pld [r1, r2] - vst1.64 {d4, d5}, [r0,:128], r2 - vst1.64 {d6, d7}, [r0,:128], r2 - bne 1b - bx lr - .endm - - .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0 - vld1.64 {d0-d2}, [r1], r2 - vld1.64 {d4-d6}, [r1], r2 -.if \no_rnd - vmov.i16 q13, #1 -.endif - pld [r1] - pld [r1, r2] - vext.8 q1, q0, q1, #1 - vext.8 q3, q2, q3, #1 - vaddl.u8 q8, d0, d2 - vaddl.u8 q10, d1, d3 - vaddl.u8 q9, d4, d6 - vaddl.u8 q11, d5, d7 -1: subs r3, r3, #2 - vld1.64 {d0-d2}, [r1], r2 - vadd.u16 q12, q8, q9 - pld [r1] -.if \no_rnd - vadd.u16 q12, q12, q13 -.endif - vext.8 q15, q0, q1, #1 - vadd.u16 q1 , q10, q11 - \vshrn d28, q12, #2 -.if \no_rnd - vadd.u16 q1, q1, q13 -.endif - \vshrn d29, q1, #2 - vaddl.u8 q8, d0, d30 - vld1.64 {d2-d4}, [r1], r2 - vaddl.u8 q10, d1, d31 - vst1.64 {d28,d29}, [r0,:128], r2 - vadd.u16 q12, q8, q9 - pld [r1, r2] -.if \no_rnd - vadd.u16 q12, q12, q13 -.endif - vext.8 q2, q1, q2, #1 - vadd.u16 q0, q10, q11 - \vshrn d30, q12, #2 -.if \no_rnd - vadd.u16 q0, q0, q13 -.endif - \vshrn d31, q0, #2 - vaddl.u8 q9, d2, d4 - vaddl.u8 q11, d3, d5 - vst1.64 {d30,d31}, [r0,:128], r2 - bgt 1b - bx lr - .endm - - .macro pixels8 -1: vld1.64 {d0}, [r1], r2 - vld1.64 {d1}, [r1], r2 - vld1.64 {d2}, [r1], r2 - pld [r1, r2, lsl #2] - vld1.64 {d3}, [r1], r2 - pld [r1] - pld [r1, r2] - pld [r1, r2, lsl #1] - subs r3, r3, #4 - vst1.64 {d0}, [r0,:64], r2 - vst1.64 {d1}, [r0,:64], r2 - vst1.64 {d2}, [r0,:64], r2 - vst1.64 {d3}, [r0,:64], r2 - bne 1b - bx lr - .endm - - .macro pixels8_x2 vhadd=vrhadd.u8 -1: vld1.64 {d0, d1}, [r1], r2 - vext.8 d1, d0, d1, #1 - vld1.64 {d2, d3}, [r1], r2 - vext.8 d3, d2, d3, #1 - pld [r1] - pld [r1, r2] - subs r3, r3, #2 - vswp d1, d2 - \vhadd q0, q0, q1 - vst1.64 {d0}, [r0,:64], r2 - vst1.64 {d1}, [r0,:64], r2 - bne 1b - bx lr - .endm - - .macro pixels8_y2 vhadd=vrhadd.u8 - vld1.64 {d0}, [r1], r2 - vld1.64 {d1}, [r1], r2 -1: subs r3, r3, #2 - \vhadd d4, d0, d1 - vld1.64 {d0}, [r1], r2 - \vhadd d5, d0, d1 - vld1.64 {d1}, [r1], r2 - pld [r1] - pld [r1, r2] - vst1.64 {d4}, [r0,:64], r2 - vst1.64 
{d5}, [r0,:64], r2
-        bne 1b
-        bx lr
-        .endm
-
-        .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
-        vld1.64 {d0, d1}, [r1], r2
-        vld1.64 {d2, d3}, [r1], r2
-.if \no_rnd
-        vmov.i16 q11, #1
-.endif
-        pld [r1]
-        pld [r1, r2]
-        vext.8 d4, d0, d1, #1
-        vext.8 d6, d2, d3, #1
-        vaddl.u8 q8, d0, d4
-        vaddl.u8 q9, d2, d6
-1:      subs r3, r3, #2
-        vld1.64 {d0, d1}, [r1], r2
-        pld [r1]
-        vadd.u16 q10, q8, q9
-        vext.8 d4, d0, d1, #1
-.if \no_rnd
-        vadd.u16 q10, q10, q11
-.endif
-        vaddl.u8 q8, d0, d4
-        \vshrn d5, q10, #2
-        vld1.64 {d2, d3}, [r1], r2
-        vadd.u16 q10, q8, q9
-        pld [r1, r2]
-.if \no_rnd
-        vadd.u16 q10, q10, q11
-.endif
-        vst1.64 {d5}, [r0,:64], r2
-        \vshrn d7, q10, #2
-        vext.8 d6, d2, d3, #1
-        vaddl.u8 q9, d2, d6
-        vst1.64 {d7}, [r0,:64], r2
-        bgt 1b
-        bx lr
-        .endm
-
-        .macro pixfunc pfx name suf rnd_op args:vararg
-function ff_\pfx\name\suf\()_neon, export=1
-        \name \rnd_op \args
-        .endfunc
-        .endm
-
-        .macro pixfunc2 pfx name args:vararg
-        pixfunc \pfx \name
-        pixfunc \pfx \name \args
-        .endm
-
-function ff_put_h264_qpel16_mc00_neon, export=1
-        mov r3, #16
-        .endfunc
-
-        pixfunc put_ pixels16
-        pixfunc2 put_ pixels16_x2, _no_rnd, vhadd.u8
-        pixfunc2 put_ pixels16_y2, _no_rnd, vhadd.u8
-        pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1
-
-function ff_avg_h264_qpel16_mc00_neon, export=1
-        mov r3, #16
-        .endfunc
-
-        pixfunc avg_ pixels16,, 1
-
-function ff_put_h264_qpel8_mc00_neon, export=1
-        mov r3, #8
-        .endfunc
-
-        pixfunc put_ pixels8
-        pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8
-        pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8
-        pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1
-
-function ff_put_pixels_clamped_neon, export=1
-        vld1.64 {d16-d19}, [r0,:128]!
-        vqmovun.s16 d0, q8
-        vld1.64 {d20-d23}, [r0,:128]!
-        vqmovun.s16 d1, q9
-        vld1.64 {d24-d27}, [r0,:128]!
-        vqmovun.s16 d2, q10
-        vld1.64 {d28-d31}, [r0,:128]!
-        vqmovun.s16 d3, q11
-        vst1.64 {d0}, [r1,:64], r2
-        vqmovun.s16 d4, q12
-        vst1.64 {d1}, [r1,:64], r2
-        vqmovun.s16 d5, q13
-        vst1.64 {d2}, [r1,:64], r2
-        vqmovun.s16 d6, q14
-        vst1.64 {d3}, [r1,:64], r2
-        vqmovun.s16 d7, q15
-        vst1.64 {d4}, [r1,:64], r2
-        vst1.64 {d5}, [r1,:64], r2
-        vst1.64 {d6}, [r1,:64], r2
-        vst1.64 {d7}, [r1,:64], r2
-        bx lr
-        .endfunc
-
-function ff_put_signed_pixels_clamped_neon, export=1
-        vmov.u8 d31, #128
-        vld1.64 {d16-d17}, [r0,:128]!
-        vqmovn.s16 d0, q8
-        vld1.64 {d18-d19}, [r0,:128]!
-        vqmovn.s16 d1, q9
-        vld1.64 {d16-d17}, [r0,:128]!
-        vqmovn.s16 d2, q8
-        vld1.64 {d18-d19}, [r0,:128]!
-        vadd.u8 d0, d0, d31
-        vld1.64 {d20-d21}, [r0,:128]!
-        vadd.u8 d1, d1, d31
-        vld1.64 {d22-d23}, [r0,:128]!
-        vadd.u8 d2, d2, d31
-        vst1.64 {d0}, [r1,:64], r2
-        vqmovn.s16 d3, q9
-        vst1.64 {d1}, [r1,:64], r2
-        vqmovn.s16 d4, q10
-        vst1.64 {d2}, [r1,:64], r2
-        vqmovn.s16 d5, q11
-        vld1.64 {d24-d25}, [r0,:128]!
-        vadd.u8 d3, d3, d31
-        vld1.64 {d26-d27}, [r0,:128]!
-        vadd.u8 d4, d4, d31
-        vadd.u8 d5, d5, d31
-        vst1.64 {d3}, [r1,:64], r2
-        vqmovn.s16 d6, q12
-        vst1.64 {d4}, [r1,:64], r2
-        vqmovn.s16 d7, q13
-        vst1.64 {d5}, [r1,:64], r2
-        vadd.u8 d6, d6, d31
-        vadd.u8 d7, d7, d31
-        vst1.64 {d6}, [r1,:64], r2
-        vst1.64 {d7}, [r1,:64], r2
-        bx lr
-        .endfunc
-
-function ff_add_pixels_clamped_neon, export=1
-        mov r3, r1
-        vld1.64 {d16}, [r1,:64], r2
-        vld1.64 {d0-d1}, [r0,:128]!
-        vaddw.u8 q0, q0, d16
-        vld1.64 {d17}, [r1,:64], r2
-        vld1.64 {d2-d3}, [r0,:128]!
-        vqmovun.s16 d0, q0
-        vld1.64 {d18}, [r1,:64], r2
-        vaddw.u8 q1, q1, d17
-        vld1.64 {d4-d5}, [r0,:128]!
-        vaddw.u8 q2, q2, d18
-        vst1.64 {d0}, [r3,:64], r2
-        vqmovun.s16 d2, q1
-        vld1.64 {d19}, [r1,:64], r2
-        vld1.64 {d6-d7}, [r0,:128]!
-        vaddw.u8 q3, q3, d19
-        vqmovun.s16 d4, q2
-        vst1.64 {d2}, [r3,:64], r2
-        vld1.64 {d16}, [r1,:64], r2
-        vqmovun.s16 d6, q3
-        vld1.64 {d0-d1}, [r0,:128]!
-        vaddw.u8 q0, q0, d16
-        vst1.64 {d4}, [r3,:64], r2
-        vld1.64 {d17}, [r1,:64], r2
-        vld1.64 {d2-d3}, [r0,:128]!
-        vaddw.u8 q1, q1, d17
-        vst1.64 {d6}, [r3,:64], r2
-        vqmovun.s16 d0, q0
-        vld1.64 {d18}, [r1,:64], r2
-        vld1.64 {d4-d5}, [r0,:128]!
-        vaddw.u8 q2, q2, d18
-        vst1.64 {d0}, [r3,:64], r2
-        vqmovun.s16 d2, q1
-        vld1.64 {d19}, [r1,:64], r2
-        vqmovun.s16 d4, q2
-        vld1.64 {d6-d7}, [r0,:128]!
-        vaddw.u8 q3, q3, d19
-        vst1.64 {d2}, [r3,:64], r2
-        vqmovun.s16 d6, q3
-        vst1.64 {d4}, [r3,:64], r2
-        vst1.64 {d6}, [r3,:64], r2
-        bx lr
-        .endfunc
-
-function ff_float_to_int16_neon, export=1
-        subs r2, r2, #8
-        vld1.64 {d0-d1}, [r1,:128]!
-        vcvt.s32.f32 q8, q0, #16
-        vld1.64 {d2-d3}, [r1,:128]!
-        vcvt.s32.f32 q9, q1, #16
-        beq 3f
-        bics ip, r2, #15
-        beq 2f
-1:      subs ip, ip, #16
-        vshrn.s32 d4, q8, #16
-        vld1.64 {d0-d1}, [r1,:128]!
-        vcvt.s32.f32 q0, q0, #16
-        vshrn.s32 d5, q9, #16
-        vld1.64 {d2-d3}, [r1,:128]!
-        vcvt.s32.f32 q1, q1, #16
-        vshrn.s32 d6, q0, #16
-        vst1.64 {d4-d5}, [r0,:128]!
-        vshrn.s32 d7, q1, #16
-        vld1.64 {d16-d17},[r1,:128]!
-        vcvt.s32.f32 q8, q8, #16
-        vld1.64 {d18-d19},[r1,:128]!
-        vcvt.s32.f32 q9, q9, #16
-        vst1.64 {d6-d7}, [r0,:128]!
-        bne 1b
-        ands r2, r2, #15
-        beq 3f
-2:      vld1.64 {d0-d1}, [r1,:128]!
-        vshrn.s32 d4, q8, #16
-        vcvt.s32.f32 q0, q0, #16
-        vld1.64 {d2-d3}, [r1,:128]!
-        vshrn.s32 d5, q9, #16
-        vcvt.s32.f32 q1, q1, #16
-        vshrn.s32 d6, q0, #16
-        vst1.64 {d4-d5}, [r0,:128]!
-        vshrn.s32 d7, q1, #16
-        vst1.64 {d6-d7}, [r0,:128]!
-        bx lr
-3:      vshrn.s32 d4, q8, #16
-        vshrn.s32 d5, q9, #16
-        vst1.64 {d4-d5}, [r0,:128]!
-        bx lr
-        .endfunc
-
-function ff_float_to_int16_interleave_neon, export=1
-        cmp r3, #2
-        ldrlt r1, [r1]
-        blt ff_float_to_int16_neon
-        bne 4f
-
-        ldr r3, [r1]
-        ldr r1, [r1, #4]
-
-        subs r2, r2, #8
-        vld1.64 {d0-d1}, [r3,:128]!
-        vcvt.s32.f32 q8, q0, #16
-        vld1.64 {d2-d3}, [r3,:128]!
-        vcvt.s32.f32 q9, q1, #16
-        vld1.64 {d20-d21},[r1,:128]!
-        vcvt.s32.f32 q10, q10, #16
-        vld1.64 {d22-d23},[r1,:128]!
-        vcvt.s32.f32 q11, q11, #16
-        beq 3f
-        bics ip, r2, #15
-        beq 2f
-1:      subs ip, ip, #16
-        vld1.64 {d0-d1}, [r3,:128]!
-        vcvt.s32.f32 q0, q0, #16
-        vsri.32 q10, q8, #16
-        vld1.64 {d2-d3}, [r3,:128]!
-        vcvt.s32.f32 q1, q1, #16
-        vld1.64 {d24-d25},[r1,:128]!
-        vcvt.s32.f32 q12, q12, #16
-        vld1.64 {d26-d27},[r1,:128]!
-        vsri.32 q11, q9, #16
-        vst1.64 {d20-d21},[r0,:128]!
-        vcvt.s32.f32 q13, q13, #16
-        vst1.64 {d22-d23},[r0,:128]!
-        vsri.32 q12, q0, #16
-        vld1.64 {d16-d17},[r3,:128]!
-        vsri.32 q13, q1, #16
-        vst1.64 {d24-d25},[r0,:128]!
-        vcvt.s32.f32 q8, q8, #16
-        vld1.64 {d18-d19},[r3,:128]!
-        vcvt.s32.f32 q9, q9, #16
-        vld1.64 {d20-d21},[r1,:128]!
-        vcvt.s32.f32 q10, q10, #16
-        vld1.64 {d22-d23},[r1,:128]!
-        vcvt.s32.f32 q11, q11, #16
-        vst1.64 {d26-d27},[r0,:128]!
-        bne 1b
-        ands r2, r2, #15
-        beq 3f
-2:      vsri.32 q10, q8, #16
-        vld1.64 {d0-d1}, [r3,:128]!
-        vcvt.s32.f32 q0, q0, #16
-        vld1.64 {d2-d3}, [r3,:128]!
-        vcvt.s32.f32 q1, q1, #16
-        vld1.64 {d24-d25},[r1,:128]!
-        vcvt.s32.f32 q12, q12, #16
-        vsri.32 q11, q9, #16
-        vld1.64 {d26-d27},[r1,:128]!
-        vcvt.s32.f32 q13, q13, #16
-        vst1.64 {d20-d21},[r0,:128]!
-        vsri.32 q12, q0, #16
-        vst1.64 {d22-d23},[r0,:128]!
-        vsri.32 q13, q1, #16
-        vst1.64 {d24-d27},[r0,:128]!
-        bx lr
-3:      vsri.32 q10, q8, #16
-        vsri.32 q11, q9, #16
-        vst1.64 {d20-d23},[r0,:128]!
-        bx lr
-
-4:      push {r4-r8,lr}
-        cmp r3, #4
-        lsl ip, r3, #1
-        blt 4f
-
-        @ 4 channels
-5:      ldmia r1!, {r4-r7}
-        mov lr, r2
-        mov r8, r0
-        vld1.64 {d16-d17},[r4,:128]!
-        vcvt.s32.f32 q8, q8, #16
-        vld1.64 {d18-d19},[r5,:128]!
-        vcvt.s32.f32 q9, q9, #16
-        vld1.64 {d20-d21},[r6,:128]!
-        vcvt.s32.f32 q10, q10, #16
-        vld1.64 {d22-d23},[r7,:128]!
-        vcvt.s32.f32 q11, q11, #16
-6:      subs lr, lr, #8
-        vld1.64 {d0-d1}, [r4,:128]!
-        vcvt.s32.f32 q0, q0, #16
-        vsri.32 q9, q8, #16
-        vld1.64 {d2-d3}, [r5,:128]!
-        vcvt.s32.f32 q1, q1, #16
-        vsri.32 q11, q10, #16
-        vld1.64 {d4-d5}, [r6,:128]!
-        vcvt.s32.f32 q2, q2, #16
-        vzip.32 d18, d22
-        vld1.64 {d6-d7}, [r7,:128]!
-        vcvt.s32.f32 q3, q3, #16
-        vzip.32 d19, d23
-        vst1.64 {d18}, [r8], ip
-        vsri.32 q1, q0, #16
-        vst1.64 {d22}, [r8], ip
-        vsri.32 q3, q2, #16
-        vst1.64 {d19}, [r8], ip
-        vzip.32 d2, d6
-        vst1.64 {d23}, [r8], ip
-        vzip.32 d3, d7
-        beq 7f
-        vld1.64 {d16-d17},[r4,:128]!
-        vcvt.s32.f32 q8, q8, #16
-        vst1.64 {d2}, [r8], ip
-        vld1.64 {d18-d19},[r5,:128]!
-        vcvt.s32.f32 q9, q9, #16
-        vst1.64 {d6}, [r8], ip
-        vld1.64 {d20-d21},[r6,:128]!
-        vcvt.s32.f32 q10, q10, #16
-        vst1.64 {d3}, [r8], ip
-        vld1.64 {d22-d23},[r7,:128]!
-        vcvt.s32.f32 q11, q11, #16
-        vst1.64 {d7}, [r8], ip
-        b 6b
-7:      vst1.64 {d2}, [r8], ip
-        vst1.64 {d6}, [r8], ip
-        vst1.64 {d3}, [r8], ip
-        vst1.64 {d7}, [r8], ip
-        subs r3, r3, #4
-        popeq {r4-r8,pc}
-        cmp r3, #4
-        add r0, r0, #8
-        bge 5b
-
-        @ 2 channels
-4:      cmp r3, #2
-        blt 4f
-        ldmia r1!, {r4-r5}
-        mov lr, r2
-        mov r8, r0
-        tst lr, #8
-        vld1.64 {d16-d17},[r4,:128]!
-        vcvt.s32.f32 q8, q8, #16
-        vld1.64 {d18-d19},[r5,:128]!
-        vcvt.s32.f32 q9, q9, #16
-        vld1.64 {d20-d21},[r4,:128]!
-        vcvt.s32.f32 q10, q10, #16
-        vld1.64 {d22-d23},[r5,:128]!
-        vcvt.s32.f32 q11, q11, #16
-        beq 6f
-        subs lr, lr, #8
-        beq 7f
-        vsri.32 d18, d16, #16
-        vsri.32 d19, d17, #16
-        vld1.64 {d16-d17},[r4,:128]!
-        vcvt.s32.f32 q8, q8, #16
-        vst1.32 {d18[0]}, [r8], ip
-        vsri.32 d22, d20, #16
-        vst1.32 {d18[1]}, [r8], ip
-        vsri.32 d23, d21, #16
-        vst1.32 {d19[0]}, [r8], ip
-        vst1.32 {d19[1]}, [r8], ip
-        vld1.64 {d18-d19},[r5,:128]!
-        vcvt.s32.f32 q9, q9, #16
-        vst1.32 {d22[0]}, [r8], ip
-        vst1.32 {d22[1]}, [r8], ip
-        vld1.64 {d20-d21},[r4,:128]!
-        vcvt.s32.f32 q10, q10, #16
-        vst1.32 {d23[0]}, [r8], ip
-        vst1.32 {d23[1]}, [r8], ip
-        vld1.64 {d22-d23},[r5,:128]!
-        vcvt.s32.f32 q11, q11, #16
-6:      subs lr, lr, #16
-        vld1.64 {d0-d1}, [r4,:128]!
-        vcvt.s32.f32 q0, q0, #16
-        vsri.32 d18, d16, #16
-        vld1.64 {d2-d3}, [r5,:128]!
-        vcvt.s32.f32 q1, q1, #16
-        vsri.32 d19, d17, #16
-        vld1.64 {d4-d5}, [r4,:128]!
-        vcvt.s32.f32 q2, q2, #16
-        vld1.64 {d6-d7}, [r5,:128]!
-        vcvt.s32.f32 q3, q3, #16
-        vst1.32 {d18[0]}, [r8], ip
-        vsri.32 d22, d20, #16
-        vst1.32 {d18[1]}, [r8], ip
-        vsri.32 d23, d21, #16
-        vst1.32 {d19[0]}, [r8], ip
-        vsri.32 d2, d0, #16
-        vst1.32 {d19[1]}, [r8], ip
-        vsri.32 d3, d1, #16
-        vst1.32 {d22[0]}, [r8], ip
-        vsri.32 d6, d4, #16
-        vst1.32 {d22[1]}, [r8], ip
-        vsri.32 d7, d5, #16
-        vst1.32 {d23[0]}, [r8], ip
-        vst1.32 {d23[1]}, [r8], ip
-        beq 6f
-        vld1.64 {d16-d17},[r4,:128]!
-        vcvt.s32.f32 q8, q8, #16
-        vst1.32 {d2[0]}, [r8], ip
-        vst1.32 {d2[1]}, [r8], ip
-        vld1.64 {d18-d19},[r5,:128]!
-        vcvt.s32.f32 q9, q9, #16
-        vst1.32 {d3[0]}, [r8], ip
-        vst1.32 {d3[1]}, [r8], ip
-        vld1.64 {d20-d21},[r4,:128]!
-        vcvt.s32.f32 q10, q10, #16
-        vst1.32 {d6[0]}, [r8], ip
-        vst1.32 {d6[1]}, [r8], ip
-        vld1.64 {d22-d23},[r5,:128]!
-        vcvt.s32.f32 q11, q11, #16
-        vst1.32 {d7[0]}, [r8], ip
-        vst1.32 {d7[1]}, [r8], ip
-        bgt 6b
-6:      vst1.32 {d2[0]}, [r8], ip
-        vst1.32 {d2[1]}, [r8], ip
-        vst1.32 {d3[0]}, [r8], ip
-        vst1.32 {d3[1]}, [r8], ip
-        vst1.32 {d6[0]}, [r8], ip
-        vst1.32 {d6[1]}, [r8], ip
-        vst1.32 {d7[0]}, [r8], ip
-        vst1.32 {d7[1]}, [r8], ip
-        b 8f
-7:      vsri.32 d18, d16, #16
-        vsri.32 d19, d17, #16
-        vst1.32 {d18[0]}, [r8], ip
-        vsri.32 d22, d20, #16
-        vst1.32 {d18[1]}, [r8], ip
-        vsri.32 d23, d21, #16
-        vst1.32 {d19[0]}, [r8], ip
-        vst1.32 {d19[1]}, [r8], ip
-        vst1.32 {d22[0]}, [r8], ip
-        vst1.32 {d22[1]}, [r8], ip
-        vst1.32 {d23[0]}, [r8], ip
-        vst1.32 {d23[1]}, [r8], ip
-8:      subs r3, r3, #2
-        add r0, r0, #4
-        popeq {r4-r8,pc}
-
-        @ 1 channel
-4:      ldr r4, [r1],#4
-        tst r2, #8
-        mov lr, r2
-        mov r5, r0
-        vld1.64 {d0-d1}, [r4,:128]!
-        vcvt.s32.f32 q0, q0, #16
-        vld1.64 {d2-d3}, [r4,:128]!
-        vcvt.s32.f32 q1, q1, #16
-        bne 8f
-6:      subs lr, lr, #16
-        vld1.64 {d4-d5}, [r4,:128]!
-        vcvt.s32.f32 q2, q2, #16
-        vld1.64 {d6-d7}, [r4,:128]!
-        vcvt.s32.f32 q3, q3, #16
-        vst1.16 {d0[1]}, [r5,:16], ip
-        vst1.16 {d0[3]}, [r5,:16], ip
-        vst1.16 {d1[1]}, [r5,:16], ip
-        vst1.16 {d1[3]}, [r5,:16], ip
-        vst1.16 {d2[1]}, [r5,:16], ip
-        vst1.16 {d2[3]}, [r5,:16], ip
-        vst1.16 {d3[1]}, [r5,:16], ip
-        vst1.16 {d3[3]}, [r5,:16], ip
-        beq 7f
-        vld1.64 {d0-d1}, [r4,:128]!
-        vcvt.s32.f32 q0, q0, #16
-        vld1.64 {d2-d3}, [r4,:128]!
-        vcvt.s32.f32 q1, q1, #16
-7:      vst1.16 {d4[1]}, [r5,:16], ip
-        vst1.16 {d4[3]}, [r5,:16], ip
-        vst1.16 {d5[1]}, [r5,:16], ip
-        vst1.16 {d5[3]}, [r5,:16], ip
-        vst1.16 {d6[1]}, [r5,:16], ip
-        vst1.16 {d6[3]}, [r5,:16], ip
-        vst1.16 {d7[1]}, [r5,:16], ip
-        vst1.16 {d7[3]}, [r5,:16], ip
-        bgt 6b
-        pop {r4-r8,pc}
-8:      subs lr, lr, #8
-        vst1.16 {d0[1]}, [r5,:16], ip
-        vst1.16 {d0[3]}, [r5,:16], ip
-        vst1.16 {d1[1]}, [r5,:16], ip
-        vst1.16 {d1[3]}, [r5,:16], ip
-        vst1.16 {d2[1]}, [r5,:16], ip
-        vst1.16 {d2[3]}, [r5,:16], ip
-        vst1.16 {d3[1]}, [r5,:16], ip
-        vst1.16 {d3[3]}, [r5,:16], ip
-        popeq {r4-r8,pc}
-        vld1.64 {d0-d1}, [r4,:128]!
-        vcvt.s32.f32 q0, q0, #16
-        vld1.64 {d2-d3}, [r4,:128]!
-        vcvt.s32.f32 q1, q1, #16
-        b 6b
-        .endfunc
-
-function ff_vector_fmul_neon, export=1
-        mov r3, r0
-        subs r2, r2, #8
-        vld1.64 {d0-d3}, [r0,:128]!
-        vld1.64 {d4-d7}, [r1,:128]!
-        vmul.f32 q8, q0, q2
-        vmul.f32 q9, q1, q3
-        beq 3f
-        bics ip, r2, #15
-        beq 2f
-1:      subs ip, ip, #16
-        vld1.64 {d0-d1}, [r0,:128]!
-        vld1.64 {d4-d5}, [r1,:128]!
-        vmul.f32 q10, q0, q2
-        vld1.64 {d2-d3}, [r0,:128]!
-        vld1.64 {d6-d7}, [r1,:128]!
-        vmul.f32 q11, q1, q3
-        vst1.64 {d16-d19},[r3,:128]!
-        vld1.64 {d0-d1}, [r0,:128]!
-        vld1.64 {d4-d5}, [r1,:128]!
-        vmul.f32 q8, q0, q2
-        vld1.64 {d2-d3}, [r0,:128]!
-        vld1.64 {d6-d7}, [r1,:128]!
-        vmul.f32 q9, q1, q3
-        vst1.64 {d20-d23},[r3,:128]!
-        bne 1b
-        ands r2, r2, #15
-        beq 3f
-2:      vld1.64 {d0-d1}, [r0,:128]!
-        vld1.64 {d4-d5}, [r1,:128]!
-        vst1.64 {d16-d17},[r3,:128]!
-        vmul.f32 q8, q0, q2
-        vld1.64 {d2-d3}, [r0,:128]!
-        vld1.64 {d6-d7}, [r1,:128]!
-        vst1.64 {d18-d19},[r3,:128]!
-        vmul.f32 q9, q1, q3
-3:      vst1.64 {d16-d19},[r3,:128]!
-        bx lr
-        .endfunc
-
-function ff_vector_fmul_window_neon, export=1
-VFP     vdup.32 q8, d0[0]
-NOVFP   vld1.32 {d16[],d17[]}, [sp,:32]
-        push {r4,r5,lr}
-VFP     ldr lr, [sp, #12]
-NOVFP   ldr lr, [sp, #16]
-        sub r2, r2, #8
-        sub r5, lr, #2
-        add r2, r2, r5, lsl #2
-        add r4, r3, r5, lsl #3
-        add ip, r0, r5, lsl #3
-        mov r5, #-16
-        vld1.64 {d0,d1}, [r1,:128]!
-        vld1.64 {d2,d3}, [r2,:128], r5
-        vld1.64 {d4,d5}, [r3,:128]!
-        vld1.64 {d6,d7}, [r4,:128], r5
-1:      subs lr, lr, #4
-        vmov q11, q8
-        vmla.f32 d22, d0, d4
-        vmov q10, q8
-        vmla.f32 d23, d1, d5
-        vrev64.32 q3, q3
-        vmla.f32 d20, d0, d7
-        vrev64.32 q1, q1
-        vmla.f32 d21, d1, d6
-        beq 2f
-        vmla.f32 d22, d3, d7
-        vld1.64 {d0,d1}, [r1,:128]!
-        vmla.f32 d23, d2, d6
-        vld1.64 {d18,d19},[r2,:128], r5
-        vmls.f32 d20, d3, d4
-        vld1.64 {d24,d25},[r3,:128]!
-        vmls.f32 d21, d2, d5
-        vld1.64 {d6,d7}, [r4,:128], r5
-        vmov q1, q9
-        vrev64.32 q11, q11
-        vmov q2, q12
-        vswp d22, d23
-        vst1.64 {d20,d21},[r0,:128]!
-        vst1.64 {d22,d23},[ip,:128], r5
-        b 1b
-2:      vmla.f32 d22, d3, d7
-        vmla.f32 d23, d2, d6
-        vmls.f32 d20, d3, d4
-        vmls.f32 d21, d2, d5
-        vrev64.32 q11, q11
-        vswp d22, d23
-        vst1.64 {d20,d21},[r0,:128]!
-        vst1.64 {d22,d23},[ip,:128], r5
-        pop {r4,r5,pc}
-        .endfunc
-
-#if CONFIG_VORBIS_DECODER
-function ff_vorbis_inverse_coupling_neon, export=1
-        vmov.i32 q10, #1<<31
-        subs r2, r2, #4
-        mov r3, r0
-        mov r12, r1
-        beq 3f
-
-        vld1.32 {d24-d25},[r1,:128]!
-        vld1.32 {d22-d23},[r0,:128]!
-        vcle.s32 q8, q12, #0
-        vand q9, q11, q10
-        veor q12, q12, q9
-        vand q2, q12, q8
-        vbic q3, q12, q8
-        vadd.f32 q12, q11, q2
-        vsub.f32 q11, q11, q3
-1:      vld1.32 {d2-d3}, [r1,:128]!
-        vld1.32 {d0-d1}, [r0,:128]!
-        vcle.s32 q8, q1, #0
-        vand q9, q0, q10
-        veor q1, q1, q9
-        vst1.32 {d24-d25},[r3, :128]!
-        vst1.32 {d22-d23},[r12,:128]!
-        vand q2, q1, q8
-        vbic q3, q1, q8
-        vadd.f32 q1, q0, q2
-        vsub.f32 q0, q0, q3
-        subs r2, r2, #8
-        ble 2f
-        vld1.32 {d24-d25},[r1,:128]!
-        vld1.32 {d22-d23},[r0,:128]!
-        vcle.s32 q8, q12, #0
-        vand q9, q11, q10
-        veor q12, q12, q9
-        vst1.32 {d2-d3}, [r3, :128]!
-        vst1.32 {d0-d1}, [r12,:128]!
-        vand q2, q12, q8
-        vbic q3, q12, q8
-        vadd.f32 q12, q11, q2
-        vsub.f32 q11, q11, q3
-        b 1b
-
-2:      vst1.32 {d2-d3}, [r3, :128]!
-        vst1.32 {d0-d1}, [r12,:128]!
-        bxlt lr
-
-3:      vld1.32 {d2-d3}, [r1,:128]
-        vld1.32 {d0-d1}, [r0,:128]
-        vcle.s32 q8, q1, #0
-        vand q9, q0, q10
-        veor q1, q1, q9
-        vand q2, q1, q8
-        vbic q3, q1, q8
-        vadd.f32 q1, q0, q2
-        vsub.f32 q0, q0, q3
-        vst1.32 {d2-d3}, [r0,:128]!
-        vst1.32 {d0-d1}, [r1,:128]!
-        bx lr
-        .endfunc
-#endif
-
-function ff_vector_fmul_scalar_neon, export=1
-VFP     len .req r2
-NOVFP   len .req r3
-VFP     vdup.32 q8, d0[0]
-NOVFP   vdup.32 q8, r2
-        bics r12, len, #15
-        beq 3f
-        vld1.32 {q0},[r1,:128]!
-        vld1.32 {q1},[r1,:128]!
-1:      vmul.f32 q0, q0, q8
-        vld1.32 {q2},[r1,:128]!
-        vmul.f32 q1, q1, q8
-        vld1.32 {q3},[r1,:128]!
-        vmul.f32 q2, q2, q8
-        vst1.32 {q0},[r0,:128]!
-        vmul.f32 q3, q3, q8
-        vst1.32 {q1},[r0,:128]!
-        subs r12, r12, #16
-        beq 2f
-        vld1.32 {q0},[r1,:128]!
-        vst1.32 {q2},[r0,:128]!
-        vld1.32 {q1},[r1,:128]!
-        vst1.32 {q3},[r0,:128]!
-        b 1b
-2:      vst1.32 {q2},[r0,:128]!
-        vst1.32 {q3},[r0,:128]!
-        ands len, len, #15
-        bxeq lr
-3:      vld1.32 {q0},[r1,:128]!
-        vmul.f32 q0, q0, q8
-        vst1.32 {q0},[r0,:128]!
-        subs len, len, #4
-        bgt 3b
-        bx lr
-        .unreq len
-        .endfunc
-
-function ff_vector_fmul_sv_scalar_2_neon, export=1
-VFP     vdup.32 d16, d0[0]
-NOVFP   vdup.32 d16, r3
-NOVFP   ldr r3, [sp]
-        vld1.32 {d0},[r1,:64]!
-        vld1.32 {d1},[r1,:64]!
-1:      subs r3, r3, #4
-        vmul.f32 d4, d0, d16
-        vmul.f32 d5, d1, d16
-        ldr r12, [r2], #4
-        vld1.32 {d2},[r12,:64]
-        ldr r12, [r2], #4
-        vld1.32 {d3},[r12,:64]
-        vmul.f32 d4, d4, d2
-        vmul.f32 d5, d5, d3
-        beq 2f
-        vld1.32 {d0},[r1,:64]!
-        vld1.32 {d1},[r1,:64]!
-        vst1.32 {d4},[r0,:64]!
-        vst1.32 {d5},[r0,:64]!
-        b 1b
-2:      vst1.32 {d4},[r0,:64]!
-        vst1.32 {d5},[r0,:64]!
-        bx lr
-        .endfunc
-
-function ff_vector_fmul_sv_scalar_4_neon, export=1
-VFP     vdup.32 q10, d0[0]
-NOVFP   vdup.32 q10, r3
-NOVFP   ldr r3, [sp]
-        push {lr}
-        bics lr, r3, #7
-        beq 3f
-        vld1.32 {q0},[r1,:128]!
-        vld1.32 {q2},[r1,:128]!
-1:      ldr r12, [r2], #4
-        vld1.32 {q1},[r12,:128]
-        ldr r12, [r2], #4
-        vld1.32 {q3},[r12,:128]
-        vmul.f32 q8, q0, q10
-        vmul.f32 q8, q8, q1
-        vmul.f32 q9, q2, q10
-        vmul.f32 q9, q9, q3
-        subs lr, lr, #8
-        beq 2f
-        vld1.32 {q0},[r1,:128]!
-        vld1.32 {q2},[r1,:128]!
-        vst1.32 {q8},[r0,:128]!
-        vst1.32 {q9},[r0,:128]!
-        b 1b
-2:      vst1.32 {q8},[r0,:128]!
-        vst1.32 {q9},[r0,:128]!
-        ands r3, r3, #7
-        popeq {pc}
-3:      vld1.32 {q0},[r1,:128]!
-        ldr r12, [r2], #4
-        vld1.32 {q1},[r12,:128]
-        vmul.f32 q0, q0, q10
-        vmul.f32 q0, q0, q1
-        vst1.32 {q0},[r0,:128]!
-        subs r3, r3, #4
-        bgt 3b
-        pop {pc}
-        .endfunc
-
-function ff_sv_fmul_scalar_2_neon, export=1
-VFP     len .req r2
-NOVFP   len .req r3
-VFP     vdup.32 q8, d0[0]
-NOVFP   vdup.32 q8, r2
-        ldr r12, [r1], #4
-        vld1.32 {d0},[r12,:64]
-        ldr r12, [r1], #4
-        vld1.32 {d1},[r12,:64]
-1:      vmul.f32 q1, q0, q8
-        subs len, len, #4
-        beq 2f
-        ldr r12, [r1], #4
-        vld1.32 {d0},[r12,:64]
-        ldr r12, [r1], #4
-        vld1.32 {d1},[r12,:64]
-        vst1.32 {q1},[r0,:128]!
-        b 1b
-2:      vst1.32 {q1},[r0,:128]!
-        bx lr
-        .unreq len
-        .endfunc
-
-function ff_sv_fmul_scalar_4_neon, export=1
-VFP     len .req r2
-NOVFP   len .req r3
-VFP     vdup.32 q8, d0[0]
-NOVFP   vdup.32 q8, r2
-1:      ldr r12, [r1], #4
-        vld1.32 {q0},[r12,:128]
-        vmul.f32 q0, q0, q8
-        vst1.32 {q0},[r0,:128]!
-        subs len, len, #4
-        bgt 1b
-        bx lr
-        .unreq len
-        .endfunc
-
-function ff_butterflies_float_neon, export=1
-1:      vld1.32 {q0},[r0,:128]
-        vld1.32 {q1},[r1,:128]
-        vsub.f32 q2, q0, q1
-        vadd.f32 q1, q0, q1
-        vst1.32 {q2},[r1,:128]!
-        vst1.32 {q1},[r0,:128]!
-        subs r2, r2, #4
-        bgt 1b
-        bx lr
-        .endfunc
-
-function ff_scalarproduct_float_neon, export=1
-        vmov.f32 q2, #0.0
-1:      vld1.32 {q0},[r0,:128]!
-        vld1.32 {q1},[r1,:128]!
-        vmla.f32 q2, q0, q1
-        subs r2, r2, #4
-        bgt 1b
-        vadd.f32 d0, d4, d5
-        vpadd.f32 d0, d0, d0
-NOVFP   vmov.32 r0, d0[0]
-        bx lr
-        .endfunc
-
-function ff_int32_to_float_fmul_scalar_neon, export=1
-VFP     vdup.32 q0, d0[0]
-VFP     len .req r2
-NOVFP   vdup.32 q0, r2
-NOVFP   len .req r3
-
-        vld1.32 {q1},[r1,:128]!
-        vcvt.f32.s32 q3, q1
-        vld1.32 {q2},[r1,:128]!
-        vcvt.f32.s32 q8, q2
-1:      subs len, len, #8
-        pld [r1, #16]
-        vmul.f32 q9, q3, q0
-        vmul.f32 q10, q8, q0
-        beq 2f
-        vld1.32 {q1},[r1,:128]!
-        vcvt.f32.s32 q3, q1
-        vld1.32 {q2},[r1,:128]!
-        vcvt.f32.s32 q8, q2
-        vst1.32 {q9}, [r0,:128]!
-        vst1.32 {q10},[r0,:128]!
-        b 1b
-2:      vst1.32 {q9}, [r0,:128]!
-        vst1.32 {q10},[r0,:128]!
-        bx lr
-        .unreq len
-        .endfunc
-
-function ff_vector_fmul_reverse_neon, export=1
-        add r2, r2, r3, lsl #2
-        sub r2, r2, #32
-        mov r12, #-32
-        vld1.32 {q0-q1}, [r1,:128]!
-        vld1.32 {q2-q3}, [r2,:128], r12
-1:      pld [r1, #32]
-        vrev64.32 q3, q3
-        vmul.f32 d16, d0, d7
-        vmul.f32 d17, d1, d6
-        pld [r2, #-32]
-        vrev64.32 q2, q2
-        vmul.f32 d18, d2, d5
-        vmul.f32 d19, d3, d4
-        subs r3, r3, #8
-        beq 2f
-        vld1.32 {q0-q1}, [r1,:128]!
-        vld1.32 {q2-q3}, [r2,:128], r12
-        vst1.32 {q8-q9}, [r0,:128]!
-        b 1b
-2:      vst1.32 {q8-q9}, [r0,:128]!
-        bx lr
-        .endfunc
-
-function ff_vector_fmul_add_neon, export=1
-        ldr r12, [sp]
-        vld1.32 {q0-q1}, [r1,:128]!
-        vld1.32 {q8-q9}, [r2,:128]!
-        vld1.32 {q2-q3}, [r3,:128]!
-        vmul.f32 q10, q0, q8
-        vmul.f32 q11, q1, q9
-1:      vadd.f32 q12, q2, q10
-        vadd.f32 q13, q3, q11
-        pld [r1, #16]
-        pld [r2, #16]
-        pld [r3, #16]
-        subs r12, r12, #8
-        beq 2f
-        vld1.32 {q0}, [r1,:128]!
-        vld1.32 {q8}, [r2,:128]!
-        vmul.f32 q10, q0, q8
-        vld1.32 {q1}, [r1,:128]!
-        vld1.32 {q9}, [r2,:128]!
-        vmul.f32 q11, q1, q9
-        vld1.32 {q2-q3}, [r3,:128]!
-        vst1.32 {q12-q13},[r0,:128]!
-        b 1b
-2:      vst1.32 {q12-q13},[r0,:128]!
-        bx lr
-        .endfunc
-
-function ff_vector_clipf_neon, export=1
-VFP     vdup.32 q1, d0[1]
-VFP     vdup.32 q0, d0[0]
-NOVFP   vdup.32 q0, r2
-NOVFP   vdup.32 q1, r3
-NOVFP   ldr r2, [sp]
-        vld1.f32 {q2},[r1,:128]!
-        vmin.f32 q10, q2, q1
-        vld1.f32 {q3},[r1,:128]!
-        vmin.f32 q11, q3, q1
-1:      vmax.f32 q8, q10, q0
-        vmax.f32 q9, q11, q0
-        subs r2, r2, #8
-        beq 2f
-        vld1.f32 {q2},[r1,:128]!
-        vmin.f32 q10, q2, q1
-        vld1.f32 {q3},[r1,:128]!
-        vmin.f32 q11, q3, q1
-        vst1.f32 {q8},[r0,:128]!
-        vst1.f32 {q9},[r0,:128]!
-        b 1b
-2:      vst1.f32 {q8},[r0,:128]!
-        vst1.f32 {q9},[r0,:128]!
-        bx lr
-        .endfunc
diff -r 48be79afc72d -r b72bb442a775 arm/float_arm_vfp.c
--- a/arm/float_arm_vfp.c	Sun Oct 04 13:12:55 2009 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2008 Siarhei Siamashka
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavcodec/dsputil.h"
-#include "dsputil_arm.h"
-
-void ff_vector_fmul_vfp(float *dst, const float *src, int len);
-void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
-                                const float *src1, int len);
-void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
-
-void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx)
-{
-    c->vector_fmul = ff_vector_fmul_vfp;
-    c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp;
-#if HAVE_ARMV6
-    c->float_to_int16 = ff_float_to_int16_vfp;
-#endif
-}