Mercurial > libavcodec.hg
view arm/vp56dsp_neon.S @ 12255:7db147ea02c4 libavcodec
VP8: Inline traversing vp8_small_mvtree
Much faster read_mv_component, slightly faster overall
author | conrad |
---|---|
date | Fri, 23 Jul 2010 21:46:25 +0000 |
parents | 1c6d78234e67 |
children |
line wrap: on
line source
/*
 * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

@ ----------------------------------------------------------------------
@ vp6_edge_filter
@
@ Filters 12 pixel positions across an edge.  Positions 0-7 are handled
@ with q-register operations, positions 8-11 with the interleaved
@ d-register operations (the uncommented instruction after each
@ commented one is the same step for positions 8-11).
@
@ In:    r2  = t (filter threshold, broadcast to every 16-bit lane)
@        q8  = p[-2*s]   (d16 = positions 0-7, d17 = positions 8-11)
@        q9  = p[-s]     (d18, d19)
@        q10 = p[0]      (d20, d21)
@        q11 = p[s]      (d22, d23)
@ Out:   d18 = saturated p[-s] + v, positions 0-7
@        d19 = same, positions 8-11 in the low 32 bits only
@        d20 = saturated p[0]  - v, positions 0-7
@        d21 = same, positions 8-11 in the low 32 bits only
@        The high 32 bits of d19/d21 do not hold filter results.
@ Clobb: q0-q3, q8-q10, q12-q15 (q11 and r2 are only read)
@
@ Per position:
@        v = (3*(p[0]-p[-s]) + p[-2*s]-p[s] + 4) >> 3
@        V = |v|,  s = v >> 15 (sign mask)
@        if (unsigned)(V-t-1) >= (unsigned)(t-1): v is kept
@        else:                                    v = ((2*t - V) + s) ^ s
@ ----------------------------------------------------------------------
.macro  vp6_edge_filter
        vdup.16         q3,  r2                 @ t
        vmov.i16        q13, #1
        vsubl.u8        q0,  d20, d18           @ p[   0] - p[-s]
        vsubl.u8        q1,  d16, d22           @ p[-2*s] - p[ s]
        vsubl.u8        q14, d21, d19
        vsubl.u8        q15, d17, d23
        vadd.i16        q2,  q0,  q0            @ 2*(p[0]-p[-s])
        vadd.i16        d29, d28, d28
        vadd.i16        q0,  q0,  q1            @   p[0]-p[-s] + p[-2*s]-p[s]
        vadd.i16        d28, d28, d30
        vadd.i16        q0,  q0,  q2            @ 3*(p[0]-p[-s]) + p[-2*s]-p[s]
        vadd.i16        d28, d28, d29
        vrshr.s16       q0,  q0,  #3            @ v
        vrshr.s16       d28, d28, #3
        vsub.i16        q8,  q3,  q13           @ t-1 (p[-2*s] is no longer needed)
        vabs.s16        q1,  q0                 @ V
        vshr.s16        q2,  q0,  #15           @ s
        vabs.s16        d30, d28
        vshr.s16        d29, d28, #15
        vsub.i16        q12, q1,  q3            @ V-t
        vsub.i16        d31, d30, d6
        vsub.i16        q12, q12, q13           @ V-t-1
        vsub.i16        d31, d31, d26
        vcge.u16        q12, q12, q8            @ V-t-1 >= t-1
        vcge.u16        d31, d31, d16
        vadd.i16        q13, q3,  q3            @ 2*t
        vadd.i16        d16, d6,  d6
        vsub.i16        q13, q13, q1            @ 2*t - V
        vsub.i16        d16, d16, d30
        vadd.i16        q13, q13, q2            @ += s
        vadd.i16        d16, d16, d29
        veor            q13, q13, q2            @ ^= s
        veor            d16, d16, d29
        vbif            q0,  q13, q12           @ keep v where the mask is set, else take the adjusted value
        vbif            d28, d16, d31
        vmovl.u8        q1,  d20                @ widen p[0]
        vmovl.u8        q15, d21
        vaddw.u8        q2,  q0,  d18           @ p[-s] + v
        vaddw.u8        q3,  q14, d19
        vsub.i16        q1,  q1,  q0            @ p[0] - v
        vsub.i16        d30, d30, d28
        vqmovun.s16     d18, q2                 @ saturate back to unsigned 8 bit
        vqmovun.s16     d19, q3
        vqmovun.s16     d20, q1
        vqmovun.s16     d21, q15
.endm

@ ----------------------------------------------------------------------
@ ff_vp6_edge_filter_ver_neon
@
@ Applies vp6_edge_filter to 12 horizontally adjacent pixels, filtering
@ across the rows above and below r0.
@
@ In:    r0 = address of the p[0] row
@        r1 = s, distance in bytes between rows
@        r2 = t (consumed by vp6_edge_filter)
@ Out:   12 bytes rewritten in each of the rows p[-s] and p[0]
@ Clobb: r0, r1, q0-q3, q8-q15
@ Note:  16 bytes are read from each of the four rows, 12 are written.
@ ----------------------------------------------------------------------
function ff_vp6_edge_filter_ver_neon, export=1
        sub             r0,  r0,  r1,  lsl #1   @ r0 -> row p[-2*s]
        vld1.8          {q8},     [r0], r1      @ p[-2*s]
        vld1.8          {q9},     [r0], r1      @ p[-s]
        vld1.8          {q10},    [r0], r1      @ p[0]
        vld1.8          {q11},    [r0]          @ p[s]
        vp6_edge_filter
        sub             r0,  r0,  r1,  lsl #1   @ r0 -> row p[-s]
        sub             r1,  r1,  #8            @ compensates for the 8-byte post-increment below
        vst1.8          {d18},    [r0]!         @ p[-s], bytes 0-7
        vst1.32         {d19[0]}, [r0], r1      @ p[-s], bytes 8-11; r0 -> row p[0]
        vst1.8          {d20},    [r0]!         @ p[0], bytes 0-7
        vst1.32         {d21[0]}, [r0]          @ p[0], bytes 8-11
        bx              lr
endfunc

@ ----------------------------------------------------------------------
@ ff_vp6_edge_filter_hor_neon
@
@ Applies vp6_edge_filter to 12 vertically adjacent rows, filtering
@ across the columns left and right of r0.
@
@ In:    r0 = address of p[0] in the first row
@        r1 = distance in bytes between rows
@        r2 = t (consumed by vp6_edge_filter)
@ Out:   bytes at offsets -1 and 0 rewritten in each of the 12 rows
@ Clobb: r0, r3, q0-q3, q8-q15
@
@ Row n is loaded as one 32-bit lane (columns -2..+1).  Rows 0-3 land in
@ lane 0 of d16/d18/d20/d22, rows 4-7 in lane 1, rows 8-11 in lane 0 of
@ d17/d19/d21/d23.  The vtrn sequence then transposes each 4x4 byte
@ group so that q8..q11 hold columns -2, -1, 0, +1 with byte index equal
@ to row index, which is the layout vp6_edge_filter expects.
@ ----------------------------------------------------------------------
function ff_vp6_edge_filter_hor_neon, export=1
        sub             r3,  r0,  #1            @ store pointer: column -1
        sub             r0,  r0,  #2            @ load pointer:  column -2
        vld1.32         {d16[0]}, [r0], r1      @ row 0
        vld1.32         {d18[0]}, [r0], r1      @ row 1
        vld1.32         {d20[0]}, [r0], r1      @ row 2
        vld1.32         {d22[0]}, [r0], r1      @ row 3
        vld1.32         {d16[1]}, [r0], r1      @ row 4
        vld1.32         {d18[1]}, [r0], r1      @ row 5
        vld1.32         {d20[1]}, [r0], r1      @ row 6
        vld1.32         {d22[1]}, [r0], r1      @ row 7
        vld1.32         {d17[0]}, [r0], r1      @ row 8
        vld1.32         {d19[0]}, [r0], r1      @ row 9
        vld1.32         {d21[0]}, [r0], r1      @ row 10
        vld1.32         {d23[0]}, [r0], r1      @ row 11
        vtrn.8          q8,  q9                 @ 4x4 byte transpose, step 1
        vtrn.8          q10, q11
        vtrn.16         q8,  q10                @ 4x4 byte transpose, step 2
        vtrn.16         q9,  q11
        vp6_edge_filter
        vtrn.8          q9,  q10                @ pair (p[-1], p[0]) per row in 16-bit lanes
        vst1.16         {d18[0]}, [r3], r1      @ row 0
        vst1.16         {d20[0]}, [r3], r1      @ row 1
        vst1.16         {d18[1]}, [r3], r1      @ row 2
        vst1.16         {d20[1]}, [r3], r1      @ row 3
        vst1.16         {d18[2]}, [r3], r1      @ row 4
        vst1.16         {d20[2]}, [r3], r1      @ row 5
        vst1.16         {d18[3]}, [r3], r1      @ row 6
        vst1.16         {d20[3]}, [r3], r1      @ row 7
        vst1.16         {d19[0]}, [r3], r1      @ row 8
        vst1.16         {d21[0]}, [r3], r1      @ row 9
        vst1.16         {d19[1]}, [r3], r1      @ row 10
        vst1.16         {d21[1]}, [r3], r1      @ row 11
        bx              lr
endfunc