annotate arm/h264idct_neon.S @ 8362:156137b60026 libavcodec

Split filter_mb_dir() out of filter_mb(). 1% overall decoding speedup for cathedral-beta2-400extra-crop-avc.mp4; no speed change for Aladin.mpg. Benchmarks done on Pentium dual.
author michael
date Wed, 17 Dec 2008 02:35:14 +0000
parents 9281a8a9387a
children 0ca0e3c98ed5
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
8339
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
1 /*
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
3 *
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
4 * This file is part of FFmpeg.
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
5 *
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
6 * FFmpeg is free software; you can redistribute it and/or
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
7 * modify it under the terms of the GNU Lesser General Public
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
8 * License as published by the Free Software Foundation; either
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
9 * version 2.1 of the License, or (at your option) any later version.
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
10 *
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
11 * FFmpeg is distributed in the hope that it will be useful,
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
14 * Lesser General Public License for more details.
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
15 *
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
16 * You should have received a copy of the GNU Lesser General Public
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
17 * License along with FFmpeg; if not, write to the Free Software
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
19 */
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
20
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
21 #include "asm.S"
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
22
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
23 .fpu neon
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
24
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
25 .text
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
26
a561ec6d1bf6 ARM: NEON optimised h264_idct_add
mru
parents:
diff changeset
/*
 * ff_h264_idct_add_neon
 *
 * Inputs as used by the code below (the C prototype is not visible in
 * this file; presumably dst, block, stride -- TODO confirm with caller):
 *   r0  pointer to four rows of 4 bytes; every access is hinted as
 *       32-bit aligned
 *   r1  pointer to sixteen 16-bit coefficients, hinted 128-bit aligned
 *   r2  byte distance between consecutive rows at r0
 *
 * Runs two 4-point butterfly passes over the 4x4 coefficient block with
 * a transpose in between, shifts the result right by 6, adds it to the
 * 4x4 bytes at r0 with unsigned saturation and writes the bytes back.
 * A bias of 32 is added to coefficient 0 before the first pass; it
 * reaches every output with weight 1, so the plain shift by 6 rounds.
 *
 * Clobbers r3, q0-q3, q8, q9.  On return r0 has advanced by 4 * r2.
 * The coefficient buffer at r1 is only read.
 */
function ff_h264_idct_add_neon, export=1
        @ d16 = { 32, 0, 0, 0 }, the rounding bias (32 = 1 << 5)
        mov             r3,  #32
        vmov.i16        d16, #0
        vmov.16         d16[0],   r3
        vld1.64         {d0-d3},  [r1,:128]     @ d0..d3 = vectors v0..v3
        vadd.i16        d0,  d0,  d16           @ bias goes to coefficient 0 only

        @ First pass.  After the swap d1 = v2 and d2 = v1, so q1 = { v1, v3 }.
        vswp            d1,  d2
        vadd.i16        d4,  d0,  d1            @ v0 + v2
        vshr.s16        q8,  q1,  #1            @ d16 = v1 >> 1, d17 = v3 >> 1
        vsub.i16        d5,  d0,  d1            @ v0 - v2
        vadd.i16        d6,  d2,  d17           @ v1 + (v3 >> 1)
        vsub.i16        d7,  d16, d3            @ (v1 >> 1) - v3
        vadd.i16        q0,  q2,  q3            @ d0 = d4 + d6, d1 = d5 + d7
        vsub.i16        q1,  q2,  q3            @ d2 = d4 - d6, d3 = d5 - d7

        @ 4x4 transpose of the vectors taken in the order d0, d1, d3, d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2
        vtrn.32         d0,  d3
        vtrn.32         d1,  d2

        @ Second pass on t0 = d0, t1 = d1, t2 = d3, t3 = d2, interleaved
        @ with the loads of the four destination rows.  The rows are put
        @ into d18/d19 in the lane order the results end up in below.
        vadd.i16        d4,  d0,  d3            @ t0 + t2
        vld1.32         {d18[0]}, [r0,:32], r2  @ destination row 0
        vswp            d1,  d3                 @ d1 = t2, d3 = t1
        vshr.s16        q8,  q1,  #1            @ d16 = t3 >> 1, d17 = t1 >> 1
        vld1.32         {d19[1]}, [r0,:32], r2  @ destination row 1
        vsub.i16        d5,  d0,  d1            @ t0 - t2
        vld1.32         {d18[1]}, [r0,:32], r2  @ destination row 2
        vadd.i16        d6,  d16, d3            @ (t3 >> 1) + t1
        vld1.32         {d19[0]}, [r0,:32], r2  @ destination row 3
        vsub.i16        d7,  d2,  d17           @ t3 - (t1 >> 1)
        sub             r0,  r0,  r2, lsl #2    @ rewind r0 to row 0
        vadd.i16        q0,  q2,  q3            @ d0 pairs with row 0, d1 with row 2
        vsub.i16        q1,  q2,  q3            @ d2 pairs with row 3, d3 with row 1

        @ Scale down; the bias added at the top makes this round
        vshr.s16        q0,  q0,  #6
        vshr.s16        q1,  q1,  #6

        @ Widen the destination bytes to 16 bits and add the residual
        vaddw.u8        q0,  q0,  d18
        vaddw.u8        q1,  q1,  d19

        @ Narrow back to bytes with unsigned saturation
        vqmovun.s16     d0,  q0                 @ d0 = { row 0, row 2 }
        vqmovun.s16     d1,  q1                 @ d1 = { row 3, row 1 }

        @ Store rows 0..3 in memory order, picking the matching lanes
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2

        bx              lr
        .endfunc
8340
834a77844ba3 ARM: NEON optimised h264_idct_dc_add
mru
parents: 8339
diff changeset
78
834a77844ba3 ARM: NEON optimised h264_idct_dc_add
mru
parents: 8339
diff changeset
/*
 * ff_h264_idct_dc_add_neon
 *
 * Inputs as used by the code below (the C prototype is not visible in
 * this file; presumably dst, block, stride -- TODO confirm with caller):
 *   r0  pointer to four rows of 4 bytes; every access is hinted as
 *       32-bit aligned
 *   r1  pointer to one 16-bit value, hinted 16-bit aligned
 *   r2  byte distance between consecutive rows at r0
 *
 * Loads the single 16-bit value at r1, applies a rounding right shift
 * by 6 and adds the result to each of the 4x4 bytes at r0 with unsigned
 * saturation, writing the bytes back in place.
 *
 * Clobbers q0-q2.  On return r0 has advanced by 4 * r2.
 * The value at r1 is only read.
 */
function ff_h264_idct_dc_add_neon, export=1
        vld1.16         {d2[],d3[]}, [r1,:16]   @ replicate the value into all 8 lanes of q1
        vrshr.s16       q1,  q1,  #6            @ shift right by 6 with rounding

        @ Rows 0 and 1 go through q2 so that q1 stays intact for rows 2 and 3
        vld1.32         {d0[0]},  [r0,:32], r2  @ row 0
        vld1.32         {d0[1]},  [r0,:32], r2  @ row 1
        vaddw.u8        q2,  q1,  d0            @ widen bytes, add the value

        vld1.32         {d1[0]},  [r0,:32], r2  @ row 2
        vld1.32         {d1[1]},  [r0,:32], r2  @ row 3
        vaddw.u8        q1,  q1,  d1            @ widen bytes, add the value

        @ Narrow back to bytes with unsigned saturation
        vqmovun.s16     d0,  q2                 @ d0 = { row 0, row 1 }
        vqmovun.s16     d1,  q1                 @ d1 = { row 2, row 3 }

        sub             r0,  r0,  r2, lsl #2    @ rewind r0 to row 0
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        bx              lr
        .endfunc