Mercurial > libavcodec.hg
view alpha/motion_est_mvi_asm.S @ 10852:86d7ab878805 libavcodec
Get rid of #include "svq3.c"
Functions called more than once per MB are moved into the header; scan8 is moved
as well, since it must be known at compile time.
The code after this patch duplicates h264data.h, this has been done to minimize
the changes in this step and allow more fine grained benchmarking.
Speed-wise, this is 1% faster on my Pentium dual-core with Diego's cursed cathedral
sample.
author | michael |
---|---|
date | Tue, 12 Jan 2010 05:30:31 +0000 |
parents | 6f1b210e58d1 |
children | 58c2da0a371b |
line wrap: on
line source
/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "regdef.h"

/* Some nicer register names.  These extend the t0..t9 temporaries with
   four more scratch names; td maps to AT, which is why ".set noat" is
   issued below.  */
#define ta t10
#define tb t11
#define tc t12
#define td AT
/* Danger: these overlap with the argument list and the return value.
   Only te is referenced by the routine below; tf (a4) is the register the
   routine uses as its row counter and th (v0) is the accumulator/return
   value, so neither may be used as scratch there.  */
#define te a5
#define tf a4
#define tg a3
#define th v0

        .set noat
        .set noreorder
        .arch pca56
        .text

/*****************************************************************************
 * int pix_abs16x16_mvi_asm(uint8_t *pix1, uint8_t *pix2, int line_size)
 *
 * Sums the absolute differences between two 16-pixel-wide blocks of bytes
 * and returns the total in v0.
 *
 * NOTE(review): the prototype above does not match the registers the code
 * actually reads.  The code uses
 *      a1: pix1 (reference block), read with aligned ldq
 *      a2: pix2, tested for 8-byte alignment on entry
 *      a3: line_size, added to both pointers after every row
 *      a4: h, the number of rows still to process
 * and never reads the incoming value of a0 (it is overwritten as scratch in
 * the aligned loop).  Presumably the real C declaration has a leading
 * context pointer and a trailing row count -- confirm against the caller.
 *
 * Assumptions visible from the code (not checked at run time):
 *   - pix1 is 8-byte aligned, since it is always loaded with ldq.
 *   - line_size is a multiple of 8: the alignment of pix2 is tested only
 *     once, and the unaligned path takes the byte offset from a2 after a2
 *     has already been advanced by two rows.
 *   - h is a positive multiple of 2 on the unaligned path and of 4 on the
 *     aligned path; the loops exit only when the counter reaches exactly 0.
 *
 * Registers written besides v0: t0-t12, AT, a0-a2, a4, a5.
 *
 * This code is written with a pca56 in mind. For ev6, one should
 * really take the increased latency of 3 cycles for MVI instructions
 * into account.
 *
 * It is important to keep the loading and first use of a register as
 * far apart as possible, because if a register is accessed before it
 * has been fetched from memory, the CPU will stall.
 */
        .align 4
        .globl pix_abs16x16_mvi_asm
        .ent pix_abs16x16_mvi_asm
pix_abs16x16_mvi_asm:
        .frame sp, 0, ra, 0
        .prologue 0

#if CONFIG_GPROF
        /* Profiling hook: call _mcount on entry when built for gprof. */
        lda     AT, _mcount
        jsr     AT, (AT), _mcount
#endif

        and     a2, 7, t0       /* t0 = pix2 & 7: byte offset within a quadword */
        clr     v0              /* running sum of absolute differences */
        beq     t0, $aligned    /* pix2 8-byte aligned: take the plain-ldq loop */

        /* Unaligned pix2: two rows per iteration.  Each 16-byte row of pix2
           is fetched as three enclosing aligned quadwords (ldq_u) and the
           two wanted quadwords are assembled with extql/extqh/or.  */
        .align 4
$unaligned:
        /* Registers:

           line 0:
           t0:  left_u -> left lo -> left
           t1:  mid
           t2:  right_u -> right hi -> right
           t3:  ref left
           t4:  ref right
           line 1:
           t5:  left_u -> left lo -> left
           t6:  mid
           t7:  right_u -> right hi -> right
           t8:  ref left
           t9:  ref right
           temp:
           ta:  left hi
           tb:  right lo
           tc:  error left
           td:  error right  */

        /* load line 0 */
        ldq_u   t0,  0(a2)      # left_u
        ldq_u   t1,  8(a2)      # mid
        ldq_u   t2, 16(a2)      # right_u
        ldq     t3,  0(a1)      # ref left
        ldq     t4,  8(a1)      # ref right
        addq    a1, a3, a1      # pix1
        addq    a2, a3, a2      # pix2
        /* load line 1 */
        ldq_u   t5,  0(a2)      # left_u
        ldq_u   t6,  8(a2)      # mid
        ldq_u   t7, 16(a2)      # right_u
        ldq     t8,  0(a1)      # ref left
        ldq     t9,  8(a1)      # ref right
        addq    a1, a3, a1      # pix1
        addq    a2, a3, a2      # pix2
        /* calc line 0 */
        /* a2 has already moved on by two rows here; only its low three bits
           are used by extql/extqh, which is why line_size is assumed to be
           a multiple of 8.  */
        extql   t0, a2, t0      # left lo
        extqh   t1, a2, ta      # left hi
        extql   t1, a2, tb      # right lo
        or      t0, ta, t0      # left
        extqh   t2, a2, t2      # right hi
        perr    t3, t0, tc      # error left
        or      t2, tb, t2      # right
        perr    t4, t2, td      # error right
        addq    v0, tc, v0      # add error left
        addq    v0, td, v0      # add error right
        /* calc line 1 */
        extql   t5, a2, t5      # left lo
        extqh   t6, a2, ta      # left hi
        extql   t6, a2, tb      # right lo
        or      t5, ta, t5      # left
        extqh   t7, a2, t7      # right hi
        perr    t8, t5, tc      # error left
        or      t7, tb, t7      # right
        perr    t9, t7, td      # error right
        addq    v0, tc, v0      # add error left
        addq    v0, td, v0      # add error right
        /* loop */
        subq    a4,  2, a4      # h -= 2
        bne     a4, $unaligned
        ret

        /* Aligned pix2: four rows per iteration.  All eight pix2 quadwords
           and all eight reference quadwords are loaded before the first
           perr, so t0-t9, ta-td, te and a0 are all live at once.  */
        .align 4
$aligned:
        /* load line 0 */
        ldq     t0,  0(a2)      # left
        ldq     t1,  8(a2)      # right
        addq    a2, a3, a2      # pix2
        ldq     t2,  0(a1)      # ref left
        ldq     t3,  8(a1)      # ref right
        addq    a1, a3, a1      # pix1
        /* load line 1 */
        ldq     t4,  0(a2)      # left
        ldq     t5,  8(a2)      # right
        addq    a2, a3, a2      # pix2
        ldq     t6,  0(a1)      # ref left
        ldq     t7,  8(a1)      # ref right
        addq    a1, a3, a1      # pix1
        /* load line 2 */
        ldq     t8,  0(a2)      # left
        ldq     t9,  8(a2)      # right
        addq    a2, a3, a2      # pix2
        ldq     ta,  0(a1)      # ref left
        ldq     tb,  8(a1)      # ref right
        addq    a1, a3, a1      # pix1
        /* load line 3 */
        ldq     tc,  0(a2)      # left
        ldq     td,  8(a2)      # right
        addq    a2, a3, a2      # pix2
        ldq     te,  0(a1)      # ref left
        ldq     a0,  8(a1)      # ref right (a0 is used as scratch here)
        /* calc line 0 */
        /* From here on t0/t1 are reused to hold the per-row left/right
           errors; each addq consumes the error produced one perr earlier.  */
        perr    t0, t2, t0      # error left
        addq    a1, a3, a1      # pix1 (advance past line 3, deferred from the load group)
        perr    t1, t3, t1      # error right
        addq    v0, t0, v0      # add error left
        /* calc line 1 */
        perr    t4, t6, t0      # error left
        addq    v0, t1, v0      # add error right
        perr    t5, t7, t1      # error right
        addq    v0, t0, v0      # add error left
        /* calc line 2 */
        perr    t8, ta, t0      # error left
        addq    v0, t1, v0      # add error right
        perr    t9, tb, t1      # error right
        addq    v0, t0, v0      # add error left
        /* calc line 3 */
        perr    tc, te, t0      # error left
        addq    v0, t1, v0      # add error right
        perr    td, a0, t1      # error right
        addq    v0, t0, v0      # add error left
        addq    v0, t1, v0      # add error right
        /* loop */
        subq    a4,  4, a4      # h -= 4
        bne     a4, $aligned
        ret
        .end pix_abs16x16_mvi_asm