view arm/vp56dsp_neon.S @ 12492:58a960d6e34c libavcodec

Rename h264_idct_sse2.asm to h264_idct.asm; move inline IDCT asm from h264dsp_mmx.c to h264_idct.asm (as yasm code). Because the loops are now coded in asm instead of C, this is (depending on the function) up to 50% faster for cases where gcc didn't do a great job at looping. Since h264_idct_add8() is now faster than the manual loop setup in h264.c, in-asm idct calling can now be enabled for chroma as well (see r16207). For MMX, this is 5% faster. For SSE2 (which isn't done for chroma if h264.c does the looping), this makes it up to 50% faster. Speed gain overall is ~0.5-1.0%.
author rbultje
date Tue, 14 Sep 2010 13:36:26 +0000
parents 1c6d78234e67
children
line wrap: on
line source

/*
 * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

@ vp6_edge_filter: apply the VP6 edge filter to 12 pixels at once.
@ Lanes 0-7 are processed in q registers; lanes 8-11 are processed in
@ parallel in d registers (the instruction following each q-register
@ step is the same step for lanes 8-11).
@
@ In:   q8  = p[-2*s], q9 = p[-s], q10 = p[0], q11 = p[s]   (u8 lanes;
@             only lanes 0-11 contribute to the result)
@       r2  = t, the filter threshold
@ Out:  d18 and the low 4 bytes of d19 = unsigned-saturated p[-s] + v
@       d20 and the low 4 bytes of d21 = unsigned-saturated p[0]  - v
@       The high 4 bytes of d19 and d21 hold junk and must not be stored.
@ Clobbers: q0-q3, q8, q12-q15.  q11 is left unchanged.
.macro  vp6_edge_filter
        vdup.16         q3,  r2                 @ t in every 16-bit lane
        vmov.i16        q13, #1                 @ constant 1 in every lane
        vsubl.u8        q0,  d20, d18           @ p[   0] - p[-s]
        vsubl.u8        q1,  d16, d22           @ p[-2*s] - p[ s]
        vsubl.u8        q14, d21, d19           @ p[0] - p[-s], d28 = lanes 8-11
        vsubl.u8        q15, d17, d23           @ p[-2*s] - p[s], d30 = lanes 8-11
        vadd.i16        q2,  q0,  q0            @ 2*(p[0]-p[-s])
        vadd.i16        d29, d28, d28           @ same for lanes 8-11 (d29 reused as temp)
        vadd.i16        q0,  q0,  q1            @    p[0]-p[-s]  + p[-2*s]-p[s]
        vadd.i16        d28, d28, d30
        vadd.i16        q0,  q0,  q2            @ 3*(p[0]-p[-s]) + p[-2*s]-p[s]
        vadd.i16        d28, d28, d29
        vrshr.s16       q0,  q0,  #3            @ v = rounding shift right by 3
        vrshr.s16       d28, d28, #3
        vsub.i16        q8,  q3,  q13           @ t-1 (q8 input is no longer needed)
        vabs.s16        q1,  q0                 @ V = abs(v)
        vshr.s16        q2,  q0,  #15           @ s = sign mask of v (0 or -1)
        vabs.s16        d30, d28                @ V for lanes 8-11
        vshr.s16        d29, d28, #15           @ s for lanes 8-11
        vsub.i16        q12, q1,  q3            @ V-t
        vsub.i16        d31, d30, d6            @ d6 = low half of q3 = t
        vsub.i16        q12, q12, q13           @ V-t-1
        vsub.i16        d31, d31, d26           @ d26 = low half of q13 = 1
        vcge.u16        q12, q12, q8            @ mask = V-t-1 >= t-1 (unsigned compare)
        vcge.u16        d31, d31, d16           @ d16 = low half of q8 = t-1
        vadd.i16        q13, q3,  q3            @ 2*t
        vadd.i16        d16, d6,  d6            @ d16 reused as temp from here on
        vsub.i16        q13, q13, q1            @ 2*t - V
        vsub.i16        d16, d16, d30
        vadd.i16        q13, q13, q2            @ += s
        vadd.i16        d16, d16, d29
        veor            q13, q13, q2            @ ^= s  (reapplies the sign of v)
        veor            d16, d16, d29
        vbif            q0,  q13, q12           @ keep v where mask is set, else adjusted value
        vbif            d28, d16, d31
        vmovl.u8        q1,  d20                @ widen p[0], lanes 0-7
        vmovl.u8        q15, d21                @ widen p[0], d30 = lanes 8-11
        vaddw.u8        q2,  q0,  d18           @ p[-s] + v, lanes 0-7
        vaddw.u8        q3,  q14, d19           @ p[-s] + v, only the low half (d6) is valid
        vsub.i16        q1,  q1,  q0            @ p[0] - v, lanes 0-7
        vsub.i16        d30, d30, d28           @ p[0] - v, lanes 8-11
        vqmovun.s16     d18, q2                 @ saturate to u8: new p[-s], lanes 0-7
        vqmovun.s16     d19, q3                 @ new p[-s], low 4 bytes valid
        vqmovun.s16     d20, q1                 @ new p[0], lanes 0-7
        vqmovun.s16     d21, q15                @ new p[0], low 4 bytes valid
.endm

@ ff_vp6_edge_filter_ver_neon: filter 12 horizontally adjacent pixels
@ across an edge between two rows (taps are one stride apart).
@
@ In:   r0 = address of the first pixel of row p[0]
@       r1 = s, the distance between rows in bytes
@       r2 = t, threshold consumed by vp6_edge_filter
@ Out:  12 bytes rewritten in row p[-s] and 12 bytes in row p[0]
@ Clobbers: r0, r1, and the NEON registers listed for vp6_edge_filter
@           plus q9-q11
@ NOTE(review): presumably called from C as (uint8_t *yuv, int stride,
@ int t) - confirm against the C declaration.
@ NOTE(review): each row load reads 16 bytes although only 12 pixels are
@ filtered, so 4 bytes past the filtered span are read on every row.
function ff_vp6_edge_filter_ver_neon, export=1
        sub             r0,  r0,  r1,  lsl #1   @ step back two rows to p[-2*s]
        vld1.8          {q8},     [r0], r1      @ p[-2*s]
        vld1.8          {q9},     [r0], r1      @ p[-s]
        vld1.8          {q10},    [r0], r1      @ p[0]
        vld1.8          {q11},    [r0]          @ p[s]
        vp6_edge_filter
        sub             r0,  r0,  r1,  lsl #1   @ r0 was at row p[s], rewind to row p[-s]
        sub             r1,  r1,  #8            @ stride minus the 8 bytes advanced by the first store
        vst1.8          {d18},    [r0]!         @ new p[-s], pixels 0-7
        vst1.32         {d19[0]}, [r0], r1      @ new p[-s], pixels 8-11, then move to row p[0]
        vst1.8          {d20},    [r0]!         @ new p[0], pixels 0-7
        vst1.32         {d21[0]}, [r0]          @ new p[0], pixels 8-11
        bx              lr
endfunc

@ ff_vp6_edge_filter_hor_neon: filter 12 vertically adjacent pixels
@ across an edge between two columns (taps are one byte apart).
@
@ In:   r0 = address of pixel p[0] in the first of 12 rows
@       r1 = distance between rows in bytes
@       r2 = t, threshold consumed by vp6_edge_filter
@ Out:  bytes p[-1] and p[0] rewritten in each of the 12 rows
@ Clobbers: r0, r3, and the NEON registers listed for vp6_edge_filter
@           plus q9-q11
@ NOTE(review): presumably called from C as (uint8_t *yuv, int stride,
@ int t) - confirm against the C declaration.
@
@ Each row contributes 4 bytes (p[-2] p[-1] p[0] p[1]).  Rows are spread
@ over q8-q11 so that the transpose below leaves one tap per register
@ with rows in lane order.  The upper 32 bits of d17/d19/d21/d23 are
@ never loaded; the macro does not use those lanes for its valid output.
function ff_vp6_edge_filter_hor_neon, export=1
        sub             r3,  r0,  #1            @ r3 = store pointer, starts at p[-1]
        sub             r0,  r0,  #2            @ r0 = load pointer, starts at p[-2]
        vld1.32         {d16[0]}, [r0], r1      @ row 0
        vld1.32         {d18[0]}, [r0], r1      @ row 1
        vld1.32         {d20[0]}, [r0], r1      @ row 2
        vld1.32         {d22[0]}, [r0], r1      @ row 3
        vld1.32         {d16[1]}, [r0], r1      @ row 4
        vld1.32         {d18[1]}, [r0], r1      @ row 5
        vld1.32         {d20[1]}, [r0], r1      @ row 6
        vld1.32         {d22[1]}, [r0], r1      @ row 7
        vld1.32         {d17[0]}, [r0], r1      @ row 8
        vld1.32         {d19[0]}, [r0], r1      @ row 9
        vld1.32         {d21[0]}, [r0], r1      @ row 10
        vld1.32         {d23[0]}, [r0], r1      @ row 11
        vtrn.8          q8,  q9                 @ transpose 4x4 byte groups:
        vtrn.8          q10, q11                @ afterwards q8 = p[-2], q9 = p[-1],
        vtrn.16         q8,  q10                @ q10 = p[0], q11 = p[1], one row
        vtrn.16         q9,  q11                @ per byte lane
        vp6_edge_filter
        vtrn.8          q9,  q10                @ pair new p[-1],p[0]: even rows in q9, odd rows in q10
        vst1.16         {d18[0]}, [r3], r1      @ row 0
        vst1.16         {d20[0]}, [r3], r1      @ row 1
        vst1.16         {d18[1]}, [r3], r1      @ row 2
        vst1.16         {d20[1]}, [r3], r1      @ row 3
        vst1.16         {d18[2]}, [r3], r1      @ row 4
        vst1.16         {d20[2]}, [r3], r1      @ row 5
        vst1.16         {d18[3]}, [r3], r1      @ row 6
        vst1.16         {d20[3]}, [r3], r1      @ row 7
        vst1.16         {d19[0]}, [r3], r1      @ row 8
        vst1.16         {d21[0]}, [r3], r1      @ row 9
        vst1.16         {d19[1]}, [r3], r1      @ row 10
        vst1.16         {d21[1]}, [r3], r1      @ row 11
        bx              lr
endfunc