changeset 9693:feaf99ca99a6 libavcodec

ARM: actually add VP3 loop filter
author conrad
date Sat, 23 May 2009 18:47:26 +0000
parents 9d103a3236e0
children 39f6d1f21ef8
files arm/vp3dsp_neon.S
diffstat 1 files changed, 94 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/arm/vp3dsp_neon.S	Sat May 23 18:47:26 2009 +0000
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2009 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+.macro vp3_loop_filter
+    vsubl.u8        q3,  d18, d17
+    vsubl.u8        q2,  d16, d19
+    vadd.i16        q1,  q3,  q3
+    vadd.i16        q2,  q2,  q3
+    vadd.i16        q0,  q1,  q2
+    vrshr.s16       q0,  q0,  #3
+    vmovl.u8        q9,  d18
+    vdup.u16        q15, r2
+
+    vabs.s16        q1,  q0
+    vshr.s16        q0,  q0,  #15
+    vqsub.u16       q2,  q15, q1
+    vqsub.u16       q3,  q2,  q1
+    vsub.i16        q1,  q2,  q3
+    veor            q1,  q1,  q0
+    vsub.i16        q0,  q1,  q0
+
+    vaddw.u8        q2,  q0,  d17
+    vsub.i16        q3,  q9,  q0
+    vqmovun.s16     d0,  q2
+    vqmovun.s16     d1,  q3
+.endm
+
+function ff_vp3_v_loop_filter_neon, export=1
+    sub             ip,  r0,  r1
+    sub             r0,  r0,  r1,  lsl #1
+    vld1.64         {d16}, [r0,:64], r1
+    vld1.64         {d17}, [r0,:64], r1
+    vld1.64         {d18}, [r0,:64], r1
+    vld1.64         {d19}, [r0,:64], r1
+    ldrb            r2,    [r2, #129*4]
+
+    vp3_loop_filter
+
+    vst1.64         {d0},  [ip,:64], r1
+    vst1.64         {d1},  [ip,:64], r1
+    bx              lr
+.endfunc
+
+function ff_vp3_h_loop_filter_neon, export=1
+    sub             ip,  r0,  #1
+    sub             r0,  r0,  #2
+    vld1.32         {d16[]},  [r0], r1
+    vld1.32         {d17[]},  [r0], r1
+    vld1.32         {d18[]},  [r0], r1
+    vld1.32         {d19[]},  [r0], r1
+    vld1.32         {d16[1]}, [r0], r1
+    vld1.32         {d17[1]}, [r0], r1
+    vld1.32         {d18[1]}, [r0], r1
+    vld1.32         {d19[1]}, [r0], r1
+    ldrb            r2,  [r2, #129*4]
+
+    vtrn.8          d16, d17
+    vtrn.8          d18, d19
+    vtrn.16         d16, d18
+    vtrn.16         d17, d19
+
+    vp3_loop_filter
+
+    vtrn.8          d0,  d1
+
+    vst1.16         {d0[0]}, [ip], r1
+    vst1.16         {d1[0]}, [ip], r1
+    vst1.16         {d0[1]}, [ip], r1
+    vst1.16         {d1[1]}, [ip], r1
+    vst1.16         {d0[2]}, [ip], r1
+    vst1.16         {d1[2]}, [ip], r1
+    vst1.16         {d0[3]}, [ip], r1
+    vst1.16         {d1[3]}, [ip], r1
+    bx              lr
+.endfunc