view arm/vp56dsp_neon.S @ 12255:7db147ea02c4 libavcodec

VP8: Inline traversing vp8_small_mvtree. Much faster read_mv_component, slightly faster overall.
author conrad
date Fri, 23 Jul 2010 21:46:25 +0000
parents 1c6d78234e67
children
line wrap: on
line source

/*
 * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

/*
 * vp6_edge_filter: filter 12 pixels across one edge, entirely in registers.
 *
 * Inputs:
 *   r2               t, replicated into every 16-bit lane of q3
 *   q8  (d16, d17)   p[-2*s]
 *   q9  (d18, d19)   p[-s]
 *   q10 (d20, d21)   p[0]
 *   q11 (d22, d23)   p[s]
 * Pixels 0-7 live in the even d register of each pair and are processed
 * with q-register (8 x 16-bit) operations.  Pixels 8-11 live in the low
 * four bytes of the odd d register and are processed with d-register
 * (4 x 16-bit) operations that are interleaved with the first group.
 *
 * Per pixel:
 *   v = (3*(p[0] - p[-s]) + p[-2*s] - p[s] + 4) >> 3
 *   V = abs(v), s = v >> 15                     (0 or all ones)
 *   if not ((unsigned)(V - t - 1) >= (unsigned)(t - 1))
 *       v = ((2*t - V) + s) ^ s                 (2*t - V with the sign of v)
 *   p[-s] = saturate_u8(p[-s] + v)
 *   p[0]  = saturate_u8(p[0]  - v)
 *
 * Outputs:
 *   d18, low 32 bits of d19   new p[-s] for pixels 0-7 and 8-11
 *   d20, low 32 bits of d21   new p[0]  for pixels 0-7 and 8-11
 * The upper 32 bits of d19 and d21 do not hold filtered pixels.
 *
 * Writes q0-q3, q8-q10 and q12-q15; q11 and the core registers are only read.
 */
.macro  vp6_edge_filter
        vdup.16         q3,  r2                 @ t
        vmov.i16        q13, #1                 @ constant 1 in every lane
        vsubl.u8        q0,  d20, d18           @ p[   0] - p[-s]
        vsubl.u8        q1,  d16, d22           @ p[-2*s] - p[ s]
        vsubl.u8        q14, d21, d19           @ p[0] - p[-s] for the odd halves; d28 = pixels 8-11
        vsubl.u8        q15, d17, d23           @ p[-2*s] - p[s] for the odd halves; d30 = pixels 8-11
        vadd.i16        q2,  q0,  q0            @ 2*(p[0]-p[-s])
        vadd.i16        d29, d28, d28           @ same, pixels 8-11
        vadd.i16        q0,  q0,  q1            @    p[0]-p[-s]  + p[-2*s]-p[s]
        vadd.i16        d28, d28, d30           @ same, pixels 8-11
        vadd.i16        q0,  q0,  q2            @ 3*(p[0]-p[-s]) + p[-2*s]-p[s]
        vadd.i16        d28, d28, d29           @ same, pixels 8-11
        vrshr.s16       q0,  q0,  #3            @ v (rounding shift adds 4 before >> 3)
        vrshr.s16       d28, d28, #3            @ v, pixels 8-11
        vsub.i16        q8,  q3,  q13           @ t-1 (q8 is scratch from here on)
        vabs.s16        q1,  q0                 @ V
        vshr.s16        q2,  q0,  #15           @ s
        vabs.s16        d30, d28                @ V, pixels 8-11
        vshr.s16        d29, d28, #15           @ s, pixels 8-11
        vsub.i16        q12, q1,  q3            @ V-t
        vsub.i16        d31, d30, d6            @ V-t, pixels 8-11
        vsub.i16        q12, q12, q13           @ V-t-1
        vsub.i16        d31, d31, d26           @ V-t-1, pixels 8-11 (d26 = low half of the constant 1)
        vcge.u16        q12, q12, q8            @ V-t-1 >= t-1
        vcge.u16        d31, d31, d16           @ same unsigned compare, pixels 8-11 (d16 = t-1)
        vadd.i16        q13, q3,  q3            @ 2*t
        vadd.i16        d16, d6,  d6            @ 2*t, pixels 8-11 (t-1 in d16 is no longer needed)
        vsub.i16        q13, q13, q1            @ 2*t - V
        vsub.i16        d16, d16, d30           @ 2*t - V, pixels 8-11
        vadd.i16        q13, q13, q2            @ += s
        vadd.i16        d16, d16, d29           @ += s, pixels 8-11
        veor            q13, q13, q2            @ ^= s
        veor            d16, d16, d29           @ ^= s, pixels 8-11
        vbif            q0,  q13, q12           @ keep v where the compare held, else take the adjusted value
        vbif            d28, d16, d31           @ same selection, pixels 8-11
        vmovl.u8        q1,  d20                @ p[0] widened to 16 bit
        vmovl.u8        q15, d21                @ p[0] widened, odd half (d30 = pixels 8-11)
        vaddw.u8        q2,  q0,  d18           @ p[-s] + v
        vaddw.u8        q3,  q14, d19           @ p[-s] + v, only d6 (pixels 8-11) is meaningful
        vsub.i16        q1,  q1,  q0            @ p[0] - v
        vsub.i16        d30, d30, d28           @ p[0] - v, pixels 8-11
        vqmovun.s16     d18, q2                 @ saturate to unsigned 8 bit: new p[-s], pixels 0-7
        vqmovun.s16     d19, q3                 @ new p[-s], pixels 8-11 in the low 4 bytes
        vqmovun.s16     d20, q1                 @ new p[0], pixels 0-7
        vqmovun.s16     d21, q15                @ new p[0], pixels 8-11 in the low 4 bytes
.endm

/*
 * ff_vp6_edge_filter_ver_neon
 *
 * Filters 12 horizontally adjacent pixels across the edge between the row
 * at r0 - r1 and the row at r0.
 *
 *   r0 = address of p[0]
 *   r1 = s, byte distance between consecutive rows
 *   r2 = t, consumed by vp6_edge_filter
 *
 * Sixteen bytes are read from each of the four rows p[-2*s] .. p[s], but
 * only twelve bytes are written back to each of p[-s] and p[0].
 * Uses r0, r1, r3 and r12 as address scratch.
 * NOTE(review): presumably called from C as (uint8_t *yuv, int stride, int t)
 * - confirm against the prototype on the C side.
 */
function ff_vp6_edge_filter_ver_neon, export=1
        sub             r3,  r0,  r1            @ r3  = &p[-s]
        add             r12, r0,  r1            @ r12 = &p[ s]
        sub             r1,  r3,  r1            @ r1  = &p[-2*s], the stride is not needed again
        vld1.8          {d16-d17}, [r1]         @ q8  = p[-2*s]
        vld1.8          {d18-d19}, [r3]         @ q9  = p[-s]
        vld1.8          {d20-d21}, [r0]         @ q10 = p[0]
        vld1.8          {d22-d23}, [r12]        @ q11 = p[s]
        vp6_edge_filter
        vst1.8          {d18},    [r3]!         @ p[-s], pixels 0-7; r3 advances by 8
        vst1.32         {d19[0]}, [r3]          @ p[-s], pixels 8-11
        vst1.8          {d20},    [r0]!         @ p[0],  pixels 0-7; r0 advances by 8
        vst1.32         {d21[0]}, [r0]          @ p[0],  pixels 8-11
        bx              lr
endfunc

/*
 * ff_vp6_edge_filter_hor_neon
 *
 * Filters 12 vertically adjacent pixels across the edge between the column
 * at r0 - 1 and the column at r0.
 *
 *   r0 = address of p[0] in the first row
 *   r1 = byte distance between consecutive rows
 *   r2 = t, consumed by vp6_edge_filter
 *
 * Four bytes (p[-2] .. p[1]) are read from each of 12 rows, transposed so
 * that every tap sits in its own register, filtered, and the two modified
 * bytes (p[-1], p[0]) are written back to each row.
 * Uses r0 and r3 as address scratch.
 * NOTE(review): presumably called from C as (uint8_t *yuv, int stride, int t)
 * - confirm against the prototype on the C side.
 */
function ff_vp6_edge_filter_hor_neon, export=1
        sub             r3,  r0,  #2            @ r3 = read cursor, p[-2] of row 0
        sub             r0,  r0,  #1            @ r0 = write cursor, p[-1] of row 0
        @ Rows 0-3 fill 32-bit lane 0 of d16/d18/d20/d22, rows 4-7 fill lane 1,
        @ rows 8-11 fill lane 0 of d17/d19/d21/d23.
        vld1.32         {d16[0]}, [r3], r1
        vld1.32         {d18[0]}, [r3], r1
        vld1.32         {d20[0]}, [r3], r1
        vld1.32         {d22[0]}, [r3], r1
        vld1.32         {d16[1]}, [r3], r1
        vld1.32         {d18[1]}, [r3], r1
        vld1.32         {d20[1]}, [r3], r1
        vld1.32         {d22[1]}, [r3], r1
        vld1.32         {d17[0]}, [r3], r1
        vld1.32         {d19[0]}, [r3], r1
        vld1.32         {d21[0]}, [r3], r1
        vld1.32         {d23[0]}, [r3], r1
        @ 4x4 byte transpose inside every 32-bit lane: afterwards
        @ q8 = p[-2], q9 = p[-1], q10 = p[0], q11 = p[1].
        vtrn.16         q8,  q10
        vtrn.16         q9,  q11
        vtrn.8          q8,  q9
        vtrn.8          q10, q11
        vp6_edge_filter
        vtrn.8          q9,  q10                @ pair p[-1] with p[0]: even rows in q9, odd rows in q10
        vst1.16         {d18[0]}, [r0], r1      @ row 0
        vst1.16         {d20[0]}, [r0], r1      @ row 1
        vst1.16         {d18[1]}, [r0], r1      @ row 2
        vst1.16         {d20[1]}, [r0], r1      @ row 3
        vst1.16         {d18[2]}, [r0], r1      @ row 4
        vst1.16         {d20[2]}, [r0], r1      @ row 5
        vst1.16         {d18[3]}, [r0], r1      @ row 6
        vst1.16         {d20[3]}, [r0], r1      @ row 7
        vst1.16         {d19[0]}, [r0], r1      @ row 8
        vst1.16         {d21[0]}, [r0], r1      @ row 9
        vst1.16         {d19[1]}, [r0], r1      @ row 10
        vst1.16         {d21[1]}, [r0], r1      @ row 11
        bx              lr
endfunc