Mercurial > libavcodec.hg
view arm/mpegvideo_armv5te_s.S @ 10952:ea8f891d997d libavcodec
H264 DXVA2 implementation
It allows VLD H.264 decoding using DXVA2 (the GPU-assisted decoding API on
Windows Vista and Windows 7).
It is implemented by using AVHWAccel API. It has been tested successfully
for some time in VLC using an nvidia card on Windows 7.
To compile it, you need to have the system header dxva2api.h (either from
Microsoft or from http://downloads.videolan.org/pub/videolan/testing/contrib/dxva2api.h).
The generated libavcodec.dll does not depend directly on any new lib as
the necessary objects are given by the application using FFmpeg.
author | fenrir |
---|---|
date | Wed, 20 Jan 2010 18:54:51 +0000 |
parents | 9281a8a9387a |
children | 361a5fcb4393 |
line wrap: on
line source
/*
 * Optimization of some functions from mpegvideo.c for armv5te
 * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

/*
 * Special optimized version of dct_unquantize_h263_helper_c. It
 * requires the block to be at least 8 bytes aligned, and may process
 * more elements than requested.  But it is guaranteed to never
 * process more than 64 elements provided that the count argument is
 * <= 64, so it is safe. This function is optimized for a common
 * distribution of values for nCoeffs (they are mostly a multiple of 8
 * plus one or two extra elements). So this function processes data as
 * 8 elements per loop iteration and contains an optional 2-element
 * processing step at the end.
 *
 * Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770)
 *
 * Register usage, as read from the code below:
 *   r0      pointer to the 16-bit elements; read with ldrd/ldrsh, written
 *           back in place with post-incrementing strh
 *   r1      multiplier; only its bottom halfword takes part in the
 *           smlatb/smlabb multiplications
 *   r2      offset that is added for positive elements and subtracted
 *           for negative ones
 *   r3      number of elements requested (count)
 *   ip      constant 0 used as the subtrahend of rsbs
 *   r4-r7   the eight elements of the current group, two per register
 *   r8      scratch: signed offset for bottom-halfword elements
 *   r9, lr  results for top-halfword elements
 * NOTE(review): the C prototype is not visible in this file; r1 and r2
 * presumably are the qmul and qadd values of the H.263 dequantizer --
 * confirm against the caller.
 *
 * Per element x the result is
 *   x > 0:  x * r1 + r2
 *   x < 0:  x * r1 - r2
 *   x == 0: 0 (left unchanged)
 *
 * Because the entry check is "count - 2 <= 0", any count <= 2
 * (including 1) runs the two-element tail exactly once.
 */
function ff_dct_unquantize_h263_armv5te, export=1
        push            {r4-r9,lr}              /* save work registers and the return address */
        mov             ip, #0                  /* ip = 0 for the rsbs instructions below */
        subs            r3, r3, #2              /* r3 = count - 2 */
        ble             2f                      /* count <= 2: skip the loop, do the tail only */
        ldrd            r4, [r0, #0]            /* r4:r5 = first four elements */
1:
        ldrd            r6, [r0, #8]            /* r6:r7 = next four elements of this group */

        /*
         * Top halfword of r4. rsbs computes (r4 asr 16) - 0, i.e. it
         * copies the sign-extended element into r9 and sets the flags
         * from it. r9 then becomes +r2 (gt) or -r2 (lt), and for a
         * non-zero element the product is accumulated on top of it.
         * A zero element leaves r9 == 0 from the rsbs.
         */
        rsbs            r9, ip, r4, asr #16
        addgt           r9, r2, #0
        rsblt           r9, r2, #0
        smlatbne        r9, r4, r1, r9

        /* Top halfword of r5, same scheme, result in lr. */
        rsbs            lr, ip, r5, asr #16
        addgt           lr, r2, #0
        rsblt           lr, r2, #0
        smlatbne        lr, r5, r1, lr

        /*
         * Bottom halfword of r4. Shifting left by 16 moves the element's
         * sign bit into bit 31, so the flags describe the bottom element.
         * r8 receives the signed offset and the result overwrites r4.
         * For a zero element r4 is not written; its bottom halfword is
         * already 0 and that is what strh stores.
         */
        rsbs            r8, ip, r4, asl #16
        addgt           r8, r2, #0
        rsblt           r8, r2, #0
        smlabbne        r4, r4, r1, r8

        /* Bottom halfword of r5, result overwrites r5. */
        rsbs            r8, ip, r5, asl #16
        addgt           r8, r2, #0
        rsblt           r8, r2, #0
        smlabbne        r5, r5, r1, r8

        /*
         * Store the low 16 bits of each result and advance r0 by 2 each
         * time. The order bottom(r4), top(r4), bottom(r5), top(r5) means
         * the bottom halfword of a loaded word is treated as the
         * lower-addressed element.
         */
        strh            r4, [r0], #2
        strh            r9, [r0], #2
        strh            r5, [r0], #2
        strh            lr, [r0], #2

        /* Second half of the group: the same four steps on r6 and r7. */
        rsbs            r9, ip, r6, asr #16
        addgt           r9, r2, #0
        rsblt           r9, r2, #0
        smlatbne        r9, r6, r1, r9

        rsbs            lr, ip, r7, asr #16
        addgt           lr, r2, #0
        rsblt           lr, r2, #0
        smlatbne        lr, r7, r1, lr

        rsbs            r8, ip, r6, asl #16
        addgt           r8, r2, #0
        rsblt           r8, r2, #0
        smlabbne        r6, r6, r1, r8

        rsbs            r8, ip, r7, asl #16
        addgt           r8, r2, #0
        rsblt           r8, r2, #0
        smlabbne        r7, r7, r1, r8

        strh            r6, [r0], #2
        strh            r9, [r0], #2
        strh            r7, [r0], #2
        strh            lr, [r0], #2

        subs            r3, r3, #8              /* eight elements done */
        ldrgtd          r4, [r0, #0] /* load data early to avoid load/use pipeline stall */
        bgt             1b                      /* flags are still those of the subs above */

        /*
         * Undo the initial "- 2": r3 = count - 8 * iterations. Return if
         * nothing is left, otherwise fall through to the two-element tail.
         */
        adds            r3, r3, #2
        pople           {r4-r9,pc}
2:
        /* Tail: process exactly two elements, one per register. */
        ldrsh           r9, [r0, #0]            /* sign-extended element at offset 0 */
        ldrsh           lr, [r0, #2]            /* sign-extended element at offset 2 */
        mov             r8, r2                  /* assume positive: offset = +r2 */
        cmp             r9, #0
        rsblt           r8, r2, #0              /* negative: offset = -r2 */
        smlabbne        r9, r9, r1, r8          /* non-zero: r9 = r9 * r1 + offset */
        mov             r8, r2
        cmp             lr, #0
        rsblt           r8, r2, #0
        smlabbne        lr, lr, r1, r8          /* non-zero: lr = lr * r1 + offset */
        strh            r9, [r0], #2
        strh            lr, [r0], #2
        pop             {r4-r9,pc}              /* restore registers and return */
        .endfunc