changeset 11797:77243f47f39e libavcodec

ARM: NEON optimised dct_unquantize_h263_{intra,inter}
author mru
date Sat, 29 May 2010 15:29:40 +0000
parents f917cd286e41
children 46a7546ad744
files arm/Makefile arm/mpegvideo_arm.c arm/mpegvideo_neon.S
diffstat 3 files changed, 123 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/arm/Makefile	Sat May 29 07:28:24 2010 +0000
+++ b/arm/Makefile	Sat May 29 15:29:40 2010 +0000
@@ -49,5 +49,6 @@
 OBJS-$(HAVE_NEON)                      += arm/dsputil_init_neon.o       \
                                           arm/dsputil_neon.o            \
                                           arm/int_neon.o                \
+                                          arm/mpegvideo_neon.o          \
                                           arm/simple_idct_neon.o        \
                                           $(NEON-OBJS-yes)
--- a/arm/mpegvideo_arm.c	Sat May 29 07:28:24 2010 +0000
+++ b/arm/mpegvideo_arm.c	Sat May 29 15:29:40 2010 +0000
@@ -23,6 +23,11 @@
 #include "libavcodec/mpegvideo.h"
 #include "mpegvideo_arm.h"
 
+void ff_dct_unquantize_h263_inter_neon(MpegEncContext *s, DCTELEM *block,
+                                       int n, int qscale);
+void ff_dct_unquantize_h263_intra_neon(MpegEncContext *s, DCTELEM *block,
+                                       int n, int qscale);
+
 void MPV_common_init_arm(MpegEncContext *s)
 {
     /* IWMMXT support is a superset of armv5te, so
@@ -35,4 +40,9 @@
 #if HAVE_IWMMXT
     MPV_common_init_iwmmxt(s);
 #endif
+
+    if (HAVE_NEON) {
+        s->dct_unquantize_h263_intra = ff_dct_unquantize_h263_intra_neon;
+        s->dct_unquantize_h263_inter = ff_dct_unquantize_h263_inter_neon;
+    }
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/arm/mpegvideo_neon.S	Sat May 29 15:29:40 2010 +0000
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+#define Y_DC_SCALE               0xa54   /* byte offset from the context pointer (r0 = s): DC scale word the intra code loads for n < 4 */
+#define C_DC_SCALE               0xa58   /* Y_DC_SCALE + 4; never used by name, the intra code reaches it by adding 4 to r0 when n >= 4 */
+#define AC_PRED                  0xa80   /* word tested by the intra code; nonzero selects last index 63 */
+#define BLOCK_LAST_INDEX         0x2278  /* table of 32-bit entries, indexed by n */
+#define INTER_SCANTAB_RASTER_END 0x2478  /* byte table, indexed by the entry read from BLOCK_LAST_INDEX */
+#define H263_AIC                 0x2728  /* word tested by the intra code; nonzero selects qadd = 0 and no DC scaling */
+
+function ff_dct_unquantize_h263_inter_neon, export=1   @ in: r0 = s, r1 = block, r2 = n, r3 = qscale (per the C prototype)
+        add             r0,  r0,  #0x2200       @ rebase s; the two offsets below are then added as small immediates
+        add             r12, r0,  #BLOCK_LAST_INDEX-0x2200
+        ldr             r12, [r12, r2, lsl #2]  @ r12 = 32-bit entry n of the table at s + BLOCK_LAST_INDEX
+        add             r0,  r0,  #INTER_SCANTAB_RASTER_END-0x2200
+        ldrb            r12, [r0, r12]          @ r12 = byte from the table at s + INTER_SCANTAB_RASTER_END: last index to process
+        sub             r2,  r3,  #1
+        lsl             r0,  r3,  #1            @ r0 = qmul = 2 * qscale
+        orr             r2,  r2,  #1            @ r2 = qadd = (qscale - 1) | 1
+        add             r3,  r12, #1            @ r3 = coefficient count = last index + 1
+endfunc                                         @ no return instruction: execution falls through into ff_dct_unquantize_h263_neon, which must stay directly below
+
+function ff_dct_unquantize_h263_neon, export=1         @ register interface: r0 = qmul, r1 = block, r2 = qadd, r3 = coefficient count
+        vdup.16         q15, r0                 @ qmul in every 16-bit lane
+        vdup.16         q14, r2                 @ qadd in every 16-bit lane
+        vneg.s16        q13, q14                @ -qadd
+        cmp             r3,  #4
+        mov             r0,  r1                 @ r0 = read pointer, r1 = write pointer
+        ble             2f                      @ count of 4 or less: tail only
+1:                                              @ main loop: 16 coefficients per pass, rounding up (zero inputs stay zero)
+        vld1.16         {q0},     [r0,:128]!    @ :128 asserts a 16-byte aligned block
+        vclt.s16        q3,  q0,  #0            @ mask of negative coefficients
+        vld1.16         {q8},     [r0,:128]!
+        vceq.s16        q1,  q0,  #0            @ mask of zero coefficients
+        vmul.s16        q2,  q0,  q15           @ coefficient * qmul
+        vclt.s16        q11, q8,  #0
+        vmul.s16        q10, q8,  q15
+        vbsl            q3,  q13, q14           @ -qadd where negative, +qadd elsewhere
+        vbsl            q11, q13, q14
+        vadd.s16        q2,  q2,  q3            @ coefficient * qmul +/- qadd
+        vceq.s16        q9,  q8,  #0
+        vadd.s16        q10, q10, q11
+        vbif            q0,  q2,  q1            @ take the result where the input was nonzero, keep zeros
+        vbif            q8,  q10, q9
+        subs            r3,  r3,  #16
+        vst1.16         {q0},     [r1,:128]!
+        vst1.16         {q8},     [r1,:128]!
+        bxle            lr                      @ nothing left
+        cmp             r3,  #4                 @ tail handles only 4: with 5 or more left run another full pass (was 8, which skipped up to 4 coefficients)
+        bgt             1b
+2:                                              @ tail: one group of 4 coefficients
+        vld1.16         {d0},     [r0,:64]
+        vclt.s16        d3,  d0,  #0
+        vceq.s16        d1,  d0,  #0
+        vmul.s16        d2,  d0,  d30           @ d30 = low half of q15 (qmul)
+        vbsl            d3,  d26, d28           @ d26 = low half of q13 (-qadd), d28 = low half of q14 (qadd)
+        vadd.s16        d2,  d2,  d3
+        vbif            d0,  d2,  d1
+        vst1.16         {d0},     [r1,:64]
+        bx              lr
+endfunc
+
+function ff_dct_unquantize_h263_intra_neon, export=1   @ in: r0 = s, r1 = block, r2 = n, r3 = qscale (per the C prototype)
+        push            {r4-r6,lr}
+        ldr             r6,  [r0, #AC_PRED]
+        add             r5,  r0,  #0x2700       @ base for the H263_AIC load at label 1
+        cmp             r6,  #0
+        movne           r12, #63                @ word at AC_PRED nonzero: last index = 63, the whole block
+        bne             1f
+        add             lr,  r0,  #0x2200       @ otherwise the same two-step table lookup as the inter entry point
+        add             r12, lr,  #BLOCK_LAST_INDEX-0x2200
+        add             lr,  lr,  #INTER_SCANTAB_RASTER_END-0x2200
+        ldr             r12, [r12, r2, lsl #2]
+        ldrb            r12, [lr, r12]          @ r12 = last index to process
+1:      ldr             r5,  [r5, #H263_AIC-0x2700]
+        ldrsh           r4,  [r1]               @ r4 = block[0], sign-extended
+        cmp             r5,  #0
+        mov             r5,  r1                 @ keep the block pointer; the helper may advance r1
+        movne           r2,  #0                 @ word at H263_AIC nonzero: qadd = 0 and block[0] stays unscaled
+        bne             2f
+        cmp             r2,  #4
+        addge           r0,  r0,  #4            @ n >= 4: the load below then reads the word at C_DC_SCALE
+        sub             r2,  r3,  #1
+        ldr             r6,  [r0, #Y_DC_SCALE]
+        orr             r2,  r2,  #1            @ r2 = qadd = (qscale - 1) | 1
+        smulbb          r4,  r4,  r6            @ r4 = block[0] * DC scale (product of the low halfwords)
+2:      lsl             r0,  r3,  #1            @ r0 = qmul = 2 * qscale
+        add             r3,  r12, #1            @ r3 = coefficient count = last index + 1
+        bl              ff_dct_unquantize_h263_neon     @ processes the block from element 0, including the DC slot
+        vmov.16         d0[0], r4
+        vst1.16         {d0[0]},  [r5]          @ overwrite block[0] with the DC value computed above
+        pop             {r4-r6,pc}
+endfunc