Mercurial > libavcodec.hg
view arm/mpegvideo_armv5te_s.S @ 11864:7204cb7dd601 libavcodec
Quant changes only once per MB so move the corresponding scale factor assignment
out of the block decoding loop. Indeo4 doesn't use any scale table but the quant
level itself as scale. Therefore access scale table only if its pointer != NULL.
author | maxim |
---|---|
date | Thu, 10 Jun 2010 17:31:12 +0000 |
parents | 361a5fcb4393 |
children |
line wrap: on
line source
/*
 * Optimization of some functions from mpegvideo.c for armv5te
 * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

/*
 * Building blocks for the dequantization below.  Each macro expands to a
 * fixed four-instruction sequence operating on one signed 16-bit
 * coefficient c:
 *
 *      c  > 0  ->  c * scale + bias
 *      c  < 0  ->  c * scale - bias
 *      c == 0  ->  left as zero
 *
 * Only the bottom halfword of \scale takes part in the multiplication.
 */

/*
 * Coefficient in the TOP halfword of \coef; result goes to \out.
 * Relies on ip == 0.  \scratch is overwritten; when \scratch and \out are
 * the same register a zero coefficient yields \out == 0.
 */
.macro  dequant_hi      out, coef, scale, bias, scratch
        rsbs            \scratch, ip, \coef, asr #16    /* scratch = c, flags = sign of c */
        addgt           \scratch, \bias, #0             /* c > 0: scratch = +bias */
        rsblt           \scratch, \bias, #0             /* c < 0: scratch = -bias */
        smlatbne        \out, \coef, \scale, \scratch   /* c != 0: out = c * scale + scratch */
.endm

/*
 * Coefficient in the BOTTOM halfword of \coef; result goes to \out.
 * Relies on ip == 0.  For a zero coefficient \out is not written at all,
 * so callers pass \out == \coef and store only its low 16 bits.
 */
.macro  dequant_lo      out, coef, scale, bias, scratch
        rsbs            \scratch, ip, \coef, asl #16    /* flags = sign of c */
        addgt           \scratch, \bias, #0             /* c > 0: scratch = +bias */
        rsblt           \scratch, \bias, #0             /* c < 0: scratch = -bias */
        smlabbne        \out, \coef, \scale, \scratch   /* c != 0: out = c * scale + scratch */
.endm

/*
 * Coefficient already sign-extended in \val (as loaded by ldrsh); the
 * result replaces \val in place.
 */
.macro  dequant_one     val, scale, bias, scratch
        mov             \scratch, \bias                 /* default to +bias */
        cmp             \val, #0
        rsblt           \scratch, \bias, #0             /* c < 0: use -bias instead */
        smlabbne        \val, \val, \scale, \scratch    /* c != 0: val = c * scale + scratch */
.endm

/*
 * Special optimized version of dct_unquantize_h263_helper_c.
 *
 * Register arguments as used by the code:
 *      r0 = pointer to the 16-bit coefficients, rewritten in place
 *      r1 = multiplier (bottom halfword only)
 *      r2 = offset added to positive / subtracted from negative coefficients
 *      r3 = number of coefficients requested
 *
 * The block has to be at least 8 bytes aligned, and more elements than
 * requested may be processed.  As long as the count argument is <= 64 the
 * function is guaranteed never to touch more than 64 elements, so this is
 * safe.
 *
 * The code is tuned for the usual distribution of nCoeffs values (mostly
 * a multiple of 8 plus one or two extra elements): the main loop handles
 * 8 elements per iteration and is followed by an optional step that
 * handles 2 more.
 *
 * Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770)
 */
function ff_dct_unquantize_h263_armv5te, export=1
        push            {r4-r9,lr}
        mov             ip, #0                  /* constant zero for the rsbs in the macros */
        subs            r3, r3, #2
        ble             2f                      /* two or fewer requested: tail only */
        ldrd            r4, [r0, #0]            /* r4/r5 = coefficients 0..3 */

1:      /* ---- main loop: eight coefficients per pass ---- */
        ldrd            r6, [r0, #8]            /* r6/r7 = coefficients 4..7 */

        dequant_hi      r9, r4, r1, r2, r9
        dequant_hi      lr, r5, r1, r2, lr
        dequant_lo      r4, r4, r1, r2, r8
        dequant_lo      r5, r5, r1, r2, r8

        strh            r4, [r0], #2
        strh            r9, [r0], #2
        strh            r5, [r0], #2
        strh            lr, [r0], #2

        dequant_hi      r9, r6, r1, r2, r9
        dequant_hi      lr, r7, r1, r2, lr
        dequant_lo      r6, r6, r1, r2, r8
        dequant_lo      r7, r7, r1, r2, r8

        strh            r6, [r0], #2
        strh            r9, [r0], #2
        strh            r7, [r0], #2
        strh            lr, [r0], #2

        subs            r3, r3, #8
        ldrgtd          r4, [r0, #0]            /* load data early to avoid load/use pipeline stall */
        bgt             1b

        adds            r3, r3, #2              /* undo the initial bias of 2 */
        pople           {r4-r9,pc}              /* nothing left over: return */

2:      /* ---- tail: two coefficients ---- */
        ldrsh           r9, [r0, #0]
        ldrsh           lr, [r0, #2]

        dequant_one     r9, r1, r2, r8
        dequant_one     lr, r1, r2, r8

        strh            r9, [r0], #2
        strh            lr, [r0], #2

        pop             {r4-r9,pc}
endfunc