changeset 8462:0ca0e3c98ed5 libavcodec

ARM: add new h264 idct functions
author mru
date Thu, 25 Dec 2008 23:13:43 +0000
parents 11307ea31e57
children 2ba4e13aa21a
files arm/dsputil_neon.c arm/h264idct_neon.S
diffstat 2 files changed, 105 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/arm/dsputil_neon.c	Thu Dec 25 18:27:49 2008 +0000
+++ b/arm/dsputil_neon.c	Thu Dec 25 23:13:43 2008 +0000
@@ -94,6 +94,15 @@
 
 void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
 void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
+void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,  /* implemented in arm/h264idct_neon.S; walks 16 blocks */
+                             DCTELEM *block, int stride,
+                             const uint8_t nnzc[6*8]);
+void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset,  /* implemented in arm/h264idct_neon.S; walks 16 blocks */
+                                  DCTELEM *block, int stride,
+                                  const uint8_t nnzc[6*8]);
+void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,  /* implemented in arm/h264idct_neon.S; reads dest[0] and dest[1], walks 8 blocks */
+                            DCTELEM *block, int stride,
+                            const uint8_t nnzc[6*8]);
 
 void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
 {
@@ -166,4 +175,7 @@
 
     c->h264_idct_add = ff_h264_idct_add_neon;
     c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
+    c->h264_idct_add16      = ff_h264_idct_add16_neon;
+    c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
+    c->h264_idct_add8       = ff_h264_idct_add8_neon;
 }
--- a/arm/h264idct_neon.S	Thu Dec 25 18:27:49 2008 +0000
+++ b/arm/h264idct_neon.S	Thu Dec 25 23:13:43 2008 +0000
@@ -20,6 +20,7 @@
 
 #include "asm.S"
 
+        preserve8
         .fpu neon
 
         .text
@@ -94,3 +95,95 @@
         vst1.32         {d1[1]},  [r0,:32], r2
         bx              lr
         .endfunc
+
+function ff_h264_idct_add16_neon, export=1      /* args: r0 = dst, r1 = block_offset, r2 = block, r3 = stride, [sp] = nnzc */
+        push            {r4-r8,lr}              /* six words pushed, so the stack argument is now at sp+24 */
+        mov             r4,  r0                 /* r4 = dst base */
+        mov             r5,  r1                 /* r5 = cursor into block_offset */
+        mov             r1,  r2                 /* r1 = current block, passed through to the callee */
+        mov             r2,  r3                 /* r2 = stride, passed through to the callee */
+        ldr             r6,  [sp, #24]          /* r6 = nnzc */
+        movw            r7,  #:lower16:scan8    /* r7 = absolute address of scan8, low half */
+        movt            r7,  #:upper16:scan8    /* ... and high half */
+        mov             ip,  #16                /* ip = number of blocks left */
+1:      ldrb            r8,  [r7], #1           /* r8 = next scan8 entry, cursor advances one byte */
+        ldr             r0,  [r5], #4           /* r0 = next block_offset entry, cursor advances one word */
+        ldrb            r8,  [r6, r8]           /* r8 = nnzc[scan8 entry] */
+        subs            r8,  r8,  #1            /* flags describe nnzc - 1 and are reused by blt and movne below */
+        blt             2f                      /* nnzc byte was 0: nothing to add for this block */
+        ldrsh           lr,  [r1]               /* lr = first 16-bit coefficient of the block; flags untouched */
+        add             r0,  r0,  r4            /* r0 = dst + offset; flags untouched */
+        movne           lr,  #0                 /* nnzc != 1: clear lr so the eq case below is taken */
+        cmp             lr,  #0
+        adrne           lr,  ff_h264_idct_dc_add_neon   /* nnzc == 1 and first coefficient nonzero */
+        adreq           lr,  ff_h264_idct_add_neon      /* every other nonzero-nnzc case */
+        blx             lr                      /* r1, r2 and ip stay live across this call, so the callee must not clobber them */
+2:      subs            ip,  ip,  #1            /* one block done; these flags feed the bne below */
+        add             r1,  r1,  #32           /* advance block pointer by 32 bytes; does not touch flags */
+        bne             1b
+        pop             {r4-r8,pc}              /* restore registers and return */
+        .endfunc
+
+function ff_h264_idct_add16intra_neon, export=1 /* args: r0 = dst, r1 = block_offset, r2 = block, r3 = stride, [sp] = nnzc */
+        push            {r4-r8,lr}              /* six words pushed, so the stack argument is now at sp+24 */
+        mov             r4,  r0                 /* r4 = dst base */
+        mov             r5,  r1                 /* r5 = cursor into block_offset */
+        mov             r1,  r2                 /* r1 = current block, passed through to the callee */
+        mov             r2,  r3                 /* r2 = stride, passed through to the callee */
+        ldr             r6,  [sp, #24]          /* r6 = nnzc */
+        movw            r7,  #:lower16:scan8    /* r7 = absolute address of scan8, low half */
+        movt            r7,  #:upper16:scan8    /* ... and high half */
+        mov             ip,  #16                /* ip = number of blocks left */
+1:      ldrb            r8,  [r7], #1           /* r8 = next scan8 entry, cursor advances one byte */
+        ldr             r0,  [r5], #4           /* r0 = next block_offset entry, cursor advances one word */
+        ldrb            r8,  [r6, r8]           /* r8 = nnzc[scan8 entry] */
+        add             r0,  r0,  r4            /* r0 = dst + offset */
+        cmp             r8,  #0                 /* ne: nnzc byte is nonzero */
+        ldrsh           r8,  [r1]               /* r8 = first 16-bit coefficient of the block; flags untouched */
+        adrne           lr,  ff_h264_idct_add_neon      /* nnzc != 0 */
+        adreq           lr,  ff_h264_idct_dc_add_neon   /* nnzc == 0: candidate only, decided by the next compare */
+        cmpeq           r8,  #0                 /* nnzc == 0: ne now means the first coefficient is nonzero */
+        blxne           lr                      /* skipped only when nnzc == 0 and the first coefficient is 0 */
+        subs            ip,  ip,  #1            /* one block done; these flags feed the bne below */
+        add             r1,  r1,  #32           /* advance block pointer by 32 bytes; does not touch flags */
+        bne             1b
+        pop             {r4-r8,pc}              /* restore registers and return */
+        .endfunc
+
+function ff_h264_idct_add8_neon, export=1       /* args: r0 = dest (two pointers), r1 = block_offset, r2 = block, r3 = stride, [sp] = nnzc */
+        push            {r4-r10,lr}             /* eight words pushed, so the stack argument is now at sp+32 */
+        ldm             r0,  {r4,r9}            /* r4 = dest[0], r9 = dest[1] */
+        add             r5,  r1,  #16*4         /* r5 = &block_offset[16] */
+        add             r1,  r2,  #16*32        /* r1 = block number 16, at 32 bytes per block */
+        mov             r2,  r3                 /* r2 = stride, passed through to the callee */
+        ldr             r6,  [sp, #32]          /* r6 = nnzc */
+        movw            r7,  #:lower16:scan8+16 /* r7 = address of scan8 entry 16, low half */
+        movt            r7,  #:upper16:scan8+16 /* ... and high half */
+        mov             ip,  #8                 /* ip = number of blocks left, counts 8 down to 1 */
+1:      ldrb            r8,  [r7], #1           /* r8 = next scan8 entry, cursor advances one byte */
+        ldr             r0,  [r5], #4           /* r0 = next block_offset entry, cursor advances one word */
+        ldrb            r8,  [r6, r8]           /* r8 = nnzc[scan8 entry] */
+        cmp             ip,  #4                 /* ip is 8..5 for the first four blocks and 4..1 for the last four */
+        addhi           r0,  r0,  r4            /* first four blocks go to dest[0] */
+        addls           r0,  r0,  r9            /* last four blocks go to dest[1] */
+        cmp             r8,  #0                 /* ne: nnzc byte is nonzero */
+        ldrsh           r8,  [r1]               /* r8 = first 16-bit coefficient of the block; flags untouched */
+        adrne           lr,  ff_h264_idct_add_neon      /* nnzc != 0 */
+        adreq           lr,  ff_h264_idct_dc_add_neon   /* nnzc == 0: candidate only, decided by the next compare */
+        cmpeq           r8,  #0                 /* nnzc == 0: ne now means the first coefficient is nonzero */
+        blxne           lr                      /* r1, r2 and ip stay live across this call, so the callee must not clobber them */
+        subs            ip,  ip,  #1            /* one block done; these flags feed the bne below */
+        add             r1,  r1,  #32           /* advance block pointer by 32 bytes; does not touch flags */
+        bne             1b
+        pop             {r4-r10,pc}             /* restore registers and return */
+        .endfunc
+
+        .section .rodata
+scan8:  .byte           4+1*8, 5+1*8, 4+2*8, 5+2*8      /* entries 0-15: read by the two add16 loops */
+        .byte           6+1*8, 7+1*8, 6+2*8, 7+2*8      /* each value is used as a byte index into nnzc */
+        .byte           4+3*8, 5+3*8, 4+4*8, 5+4*8      /* written as x + y*8; presumably column + row*8 of the 6*8 nnzc array */
+        .byte           6+3*8, 7+3*8, 6+4*8, 7+4*8
+        .byte           1+1*8, 2+1*8                    /* entries 16-19: first half of the add8 loop (starts at scan8+16) */
+        .byte           1+2*8, 2+2*8
+        .byte           1+4*8, 2+4*8                    /* entries 20-23: second half of the add8 loop */
+        .byte           1+5*8, 2+5*8