Mercurial > libavcodec.hg
view arm/dsputil_vfp.S @ 11352:6e0af2cfdcfe libavcodec
Do MC and IDCT in coding (Hilbert) order
This increases the slice size to 64 pixels, due to having to decode an
entire chroma superblock row per slice.
This can be up to 6% slower depending on clip and CPU, but is necessary
for future optimizations that gain significantly more than was lost.
author | conrad |
---|---|
date | Wed, 03 Mar 2010 23:27:40 +0000 |
parents | bdcc1c52f223 |
children | 361a5fcb4393 |
line wrap: on
line source
/*
 * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

        .syntax unified

/*
 * VFP is a floating point coprocessor used in some ARM cores. VFP11 has 1 cycle
 * throughput for almost all the instructions (except for double precision
 * arithmetic), but rather high latency. Latency is 4 cycles for loads and 8 cycles
 * for arithmetic operations. Scheduling code to avoid pipeline stalls is very
 * important for performance. One more interesting feature is that VFP has
 * independent load/store and arithmetic pipelines, so it is possible to make
 * them work simultaneously and get more than 1 operation per cycle. The load/store
 * pipeline can process 2 single precision floating point values per cycle and
 * supports bulk loads and stores for large sets of registers. Arithmetic operations
 * can be done on vectors, which makes it possible to keep the arithmetic pipeline
 * busy while the processor issues and executes other instructions. Detailed
 * optimization manuals can be found at http://www.arm.com
 *
 * All loops in this file are software pipelined: loads and multiplies for the
 * next group of elements are interleaved with the stores of the current one.
 * The instruction order is deliberate; do not reorder without re-measuring.
 */

/**
 * ARM VFP optimized implementation of 'vector_fmul_c' function.
 * Assume that len is a positive number and is a multiple of 8.
 *
 * Computes dst[i] = dst[i] * src[i] for i in [0, len).
 *
 * Register use (arguments arrive in r0-r2 in prototype order):
 *   r0  = dst, write pointer (post-incremented by the stores)
 *   r1  = src, read pointer
 *   r2  = len, decremented by 16 per loop iteration
 *   r3  = copy of dst used as read pointer
 *   r12 = FPSCR image used to switch the vector length
 *   s0-s7,  s8-s15  = dst / src values of the "even" group of 8 elements
 *   s16-s23, s24-s31 = dst / src values of the "odd" group of 8 elements
 *
 * d8-d15 (s16-s31) are saved on entry and restored on exit because the
 * loop overwrites them.
 */
@ void ff_vector_fmul_vfp(float *dst, const float *src, int len)
function ff_vector_fmul_vfp, export=1
        vpush           {d8-d15}
        mov             r3, r0                  @ r3 reads dst while r0 writes it
        fmrx            r12, fpscr
        orr             r12, r12, #(3 << 16) /* set vector size to 4 */
        fmxr            fpscr, r12

        @ Prologue: load the first 8 elements of dst and src and start the
        @ first 4-wide multiply (each vmul below covers 4 registers while the
        @ vector size is 4).
        vldmia          r3!, {s0-s3}
        vldmia          r1!, {s8-s11}
        vldmia          r3!, {s4-s7}
        vldmia          r1!, {s12-s15}
        vmul.f32        s8, s0, s8
1:
        @ After the subtraction: ge = a second group of 8 is available in this
        @ iteration, gt = at least one more group follows after that.
        subs            r2, r2, #16
        vmul.f32        s12, s4, s12
        vldmiage        r3!, {s16-s19}
        vldmiage        r1!, {s24-s27}
        vldmiage        r3!, {s20-s23}
        vldmiage        r1!, {s28-s31}
        vmulge.f32      s24, s16, s24
        vstmia          r0!, {s8-s11}           @ store the even group
        vstmia          r0!, {s12-s15}
        vmulge.f32      s28, s20, s28
        vldmiagt        r3!, {s0-s3}            @ preload the next even group
        vldmiagt        r1!, {s8-s11}
        vldmiagt        r3!, {s4-s7}
        vldmiagt        r1!, {s12-s15}
        @ Conditional on ge, not gt: when r2 == 0 this multiply still runs on
        @ values that were not reloaded, but the loop exits and its result is
        @ never stored.
        vmulge.f32      s8, s0, s8
        vstmiage        r0!, {s24-s27}          @ store the odd group
        vstmiage        r0!, {s28-s31}
        bgt             1b

        bic             r12, r12, #(7 << 16) /* set vector size back to 1 */
        fmxr            fpscr, r12
        vpop            {d8-d15}
        bx              lr
        .endfunc

/**
 * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
 * Assume that len is a positive number and is a multiple of 8.
 *
 * Computes dst[i] = src0[i] * src1[len - 1 - i] for i in [0, len).
 *
 * Register use (arguments arrive in r0-r3 in prototype order):
 *   r0 = dst, write pointer
 *   r1 = src0, read forwards with post-increment loads
 *   r2 = src1, first advanced to src1 + len and then read backwards with
 *        decrement-before loads
 *   r3 = len, decremented by 16 per loop iteration
 *
 * FPSCR is not modified here; every element gets its own vmul, and the
 * reversal is done by pairing registers in opposite order (s8 with s3,
 * s9 with s2, ...).
 */
@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
@                                 const float *src1, int len)
function ff_vector_fmul_reverse_vfp, export=1
        vpush           {d8-d15}                @ s16-s31 are overwritten below
        add             r2, r2, r3, lsl #2      @ r2 = src1 + len (4 bytes per float)

        @ Prologue: load the last 8 floats of src1 and the first 8 of src0,
        @ then multiply the first 4 pairs.
        vldmdb          r2!, {s0-s3}
        vldmia          r1!, {s8-s11}
        vldmdb          r2!, {s4-s7}
        vldmia          r1!, {s12-s15}
        vmul.f32        s8, s3, s8
        vmul.f32        s9, s2, s9
        vmul.f32        s10, s1, s10
        vmul.f32        s11, s0, s11
1:
        @ After the subtraction: ge = a second group of 8 is processed in this
        @ iteration, gt = at least one more group follows after that.
        subs            r3, r3, #16
        vldmdbge        r2!, {s16-s19}
        vmul.f32        s12, s7, s12
        vldmiage        r1!, {s24-s27}
        vmul.f32        s13, s6, s13
        vldmdbge        r2!, {s20-s23}
        vmul.f32        s14, s5, s14
        vldmiage        r1!, {s28-s31}
        vmul.f32        s15, s4, s15
        vmulge.f32      s24, s19, s24
        vldmdbgt        r2!, {s0-s3}
        vmulge.f32      s25, s18, s25
        vstmia          r0!, {s8-s13}           @ first 6 results of the group
        vmulge.f32      s26, s17, s26
        vldmiagt        r1!, {s8-s11}           @ s8-s11 are free again after the store
        vmulge.f32      s27, s16, s27
        vmulge.f32      s28, s23, s28
        vldmdbgt        r2!, {s4-s7}
        vmulge.f32      s29, s22, s29
        vstmia          r0!, {s14-s15}          @ remaining 2 results of the group
        vmulge.f32      s30, s21, s30
        vmulge.f32      s31, s20, s31
        @ The next three multiplies and the one after the following store are
        @ conditional on ge; when r3 == 0 they run on values that were not
        @ reloaded and their results are never stored.
        vmulge.f32      s8, s3, s8
        vldmiagt        r1!, {s12-s15}
        vmulge.f32      s9, s2, s9
        vmulge.f32      s10, s1, s10
        vstmiage        r0!, {s24-s27}
        vmulge.f32      s11, s0, s11
        vstmiage        r0!, {s28-s31}
        bgt             1b

        vpop            {d8-d15}
        bx              lr
        .endfunc

/* Built only when HAVE_ARMV6 is nonzero; presumably because the ssat and
 * pkhbt instructions used below need ARMv6 -- confirm against config.h. */
#if HAVE_ARMV6
/**
 * ARM VFP optimized float to int16 conversion.
 * Assume that len is a positive number and is a multiple of 8, destination
 * buffer is at least 4 bytes aligned (8 bytes alignment is better for
 * performance), little endian byte sex.
 *
 * Each float is converted to a 32-bit integer, saturated to the signed
 * 16-bit range and stored; 8 elements (16 bytes) are written per iteration.
 *
 * Register use (arguments arrive in r0-r2 in prototype order):
 *   r0 = dst, write pointer
 *   r1 = src, read pointer
 *   r2 = len, decremented by 8 per loop iteration
 *   r3-r8, ip, lr = the 8 converted integers; lr is saved on entry and its
 *                   saved value is popped straight into pc on return
 *   s16-s23 = loaded floats (d8-d11 are saved and restored around the loop)
 *   s0-s7   = converted 32-bit integers
 *
 * NOTE(review): vcvt.s32.f32 (as opposed to vcvtr.s32.f32) is documented in
 * the ARM architecture manual as rounding toward zero regardless of the FPSCR
 * rounding mode; confirm this matches the C reference this routine replaces.
 */
@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
function ff_float_to_int16_vfp, export=1
        push            {r4-r8,lr}
        vpush           {d8-d11}

        @ Prologue: load and convert the first 8 floats.
        vldmia          r1!, {s16-s23}
        vcvt.s32.f32    s0, s16
        vcvt.s32.f32    s1, s17
        vcvt.s32.f32    s2, s18
        vcvt.s32.f32    s3, s19
        vcvt.s32.f32    s4, s20
        vcvt.s32.f32    s5, s21
        vcvt.s32.f32    s6, s22
        vcvt.s32.f32    s7, s23
1:
        subs            r2, r2, #8              @ gt = more input after this group
        @ Move the 8 integers to core registers, which frees s0-s7 for the
        @ conversions of the next group.
        vmov            r3, r4, s0, s1
        vmov            r5, r6, s2, s3
        vmov            r7, r8, s4, s5
        vmov            ip, lr, s6, s7
        vldmiagt        r1!, {s16-s23}
        ssat            r4, #16, r4
        ssat            r3, #16, r3
        ssat            r6, #16, r6
        ssat            r5, #16, r5
        @ Pack pairs: the even element goes to the low halfword and the odd
        @ element to the high halfword of each output word.
        pkhbt           r3, r3, r4, lsl #16
        pkhbt           r4, r5, r6, lsl #16
        vcvtgt.s32.f32  s0, s16
        vcvtgt.s32.f32  s1, s17
        vcvtgt.s32.f32  s2, s18
        vcvtgt.s32.f32  s3, s19
        vcvtgt.s32.f32  s4, s20
        vcvtgt.s32.f32  s5, s21
        vcvtgt.s32.f32  s6, s22
        vcvtgt.s32.f32  s7, s23
        ssat            r8, #16, r8
        ssat            r7, #16, r7
        ssat            lr, #16, lr
        ssat            ip, #16, ip
        pkhbt           r5, r7, r8, lsl #16
        pkhbt           r6, ip, lr, lsl #16
        stmia           r0!, {r3-r6}            @ 4 words = 8 int16 results
        bgt             1b

        vpop            {d8-d11}
        pop             {r4-r8,pc}
        .endfunc
#endif