comparison arm/mdct_neon.S @ 10153:7a63015e4627 libavcodec

ARM: NEON optimised FFT and MDCT. Vorbis and AC3 ~3x faster. Parts by Naotoshi Nojiri, naonoj gmail
author mru
date Thu, 10 Sep 2009 08:50:03 +0000
parents
children 75bab19c59a2
comparison
equal deleted inserted replaced
10152:ed85bbd5dccb 10153:7a63015e4627
1 /*
2 * ARM NEON optimised MDCT
3 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22 #include "asm.S"
23
24 .fpu neon
25 .text
26
@ ff_imdct_half_neon
@
@ Three stages: (1) multiply input samples by the tcos/tsin tables and
@ scatter the results into the output buffer at offsets taken from revtab,
@ (2) call ff_fft_calc_neon on the output buffer, (3) multiply the FFT
@ output by tcos/tsin again, in place, working outwards from the middle.
@
@ In:   r0 = context; the words at offsets 4, 8, 12 and 24 are read as
@            nbits, tcos, tsin and revtab, and r0 + 16 is passed on to
@            ff_fft_calc_neon
@       r1 = output buffer
@       r2 = input buffer
@ Regs: r4-r8 are saved and restored here.  The code in this function uses
@       r0-r3, r12, lr, d0-d7 and d16-d25 as scratch and does not touch
@       d8-d15; ff_fft_calc_neon may clobber more.
@ Notes:
@   - n = 1 << nbits.  Both loops count down by 2 and leave on "equal to
@     zero", so n/4 and n/8 must be nonzero multiples of 2 (nbits >= 4).
@   - The :64 and :128 address qualifiers are alignment requirements on the
@     pointers they are attached to.
@   - Both loops are software pipelined: the first loads sit in front of the
@     loop and the loads for the next pass are issued before the stores of
@     the current one.
27 function ff_imdct_half_neon, export=1
28 push {r4-r8,lr}
29
30 mov r12, #1
31 ldr lr, [r0, #4] @ nbits
32 ldr r4, [r0, #8] @ tcos
33 ldr r5, [r0, #12] @ tsin
34 ldr r3, [r0, #24] @ revtab
35 lsl r12, r12, lr @ n = 1 << nbits
36 lsr lr, r12, #2 @ n4 = n >> 2
37 add r7, r2, r12, lsl #1 @ r7 = input + 2*n bytes
38 mov r12, #-16 @ r12 = post-index step that walks r7 backwards
39 sub r7, r7, #16 @ r7 -> last 16 bytes below input + 2*n
40
@ Stage 1 prologue: r2 reads the input forwards, r7 reads it backwards.
@ Of each 16-byte load only every other float is used (the x lanes are
@ discarded by the vuzp).
41 vld1.32 {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
42 vld1.32 {d0-d1}, [r2,:128]! @ d0 =m0,x d1 =m1,x
43 vld1.32 {d2}, [r4,:64]! @ d2=c0,c1
44 vld1.32 {d3}, [r5,:64]! @ d3=s0,s1
45 vuzp.32 d17, d16 @ d16 = n0,n1 (d17 = the two unused x lanes)
46 vuzp.32 d0, d1 @ d0 = m0,m1 (d1 = the two unused x lanes)
47 vmul.f32 d6, d16, d2 @ d6 = n*c
48 vmul.f32 d7, d0, d2 @ d7 = m*c
@ Stage 1 loop: two (n, m) pairs per pass, lr = pairs still to do.
49 1:
50 subs lr, lr, #2 @ flags are consumed by the beq further down
51 ldr r6, [r3], #4 @ r6 = next 32-bit word of revtab (two halfwords)
52 vmul.f32 d4, d0, d3 @ d4 = m*s
53 vmul.f32 d5, d16, d3 @ d5 = n*s
54 vsub.f32 d4, d6, d4 @ d4 = n*c - m*s
55 vadd.f32 d5, d5, d7 @ d5 = n*s + m*c
@ The revtab halfwords are added to r1 as they are, with no scaling, so
@ they act as byte offsets into the output buffer.
@ NOTE(review): presumably the table is pre-scaled by whoever builds it;
@ verify against the revtab initialisation.
56 uxtah r8, r1, r6, ror #16 @ r8 = r1 + high halfword of r6
57 uxtah r6, r1, r6 @ r6 = r1 + low halfword of r6
58 beq 1f @ count exhausted: skip the loads for a next pass
@ Load and start multiplying the next two pairs, then store the current
@ results.
59 vld1.32 {d16-d17},[r7,:128],r12
60 vld1.32 {d0-d1}, [r2,:128]!
61 vuzp.32 d17, d16
62 vld1.32 {d2}, [r4,:64]!
63 vuzp.32 d0, d1
64 vmul.f32 d6, d16, d2
65 vld1.32 {d3}, [r5,:64]!
66 vmul.f32 d7, d0, d2
67 vst2.32 {d4[0],d5[0]}, [r6,:64] @ lane 0 of d4 and d5 -> 8 bytes at r6
68 vst2.32 {d4[1],d5[1]}, [r8,:64] @ lane 1 of d4 and d5 -> 8 bytes at r8
69 b 1b
@ Last pass: only the stores are left.
70 1:
71 vst2.32 {d4[0],d5[0]}, [r6,:64]
72 vst2.32 {d4[1],d5[1]}, [r8,:64]
73
@ Stage 2: ff_fft_calc_neon(context + 16, output).  The context and output
@ pointers are parked in r4 and r6, which are callee-saved under the AAPCS
@ and therefore expected to survive the call.
74 mov r4, r0 @ r4 = context
75 mov r6, r1 @ r6 = output buffer
76 add r0, r0, #16 @ r0 = context + 16
77 bl ff_fft_calc_neon
78
@ Stage 3 setup.  tcos is loaded last because the load overwrites r4, the
@ base register of all three loads.
79 mov r12, #1
80 ldr lr, [r4, #4] @ nbits
81 ldr r5, [r4, #12] @ tsin
82 ldr r4, [r4, #8] @ tcos
83 lsl r12, r12, lr @ n = 1 << nbits
84 lsr lr, r12, #3 @ n8 = n >> 3
85
@ Forward pointers (r4, r5, r6) start at the middle and move up; backward
@ pointers (r1, r2, r3) start just below the middle and move down.
86 add r4, r4, lr, lsl #2 @ r4 = tcos + n8*4 bytes (forward cos pointer)
87 add r5, r5, lr, lsl #2 @ r5 = tsin + n8*4 bytes (forward sin pointer)
88 add r6, r6, lr, lsl #3 @ r6 = output + n8*8 bytes (forward data pointer)
89 sub r1, r4, #8 @ r1 = backward cos pointer
90 sub r2, r5, #8 @ r2 = backward sin pointer
91 sub r3, r6, #16 @ r3 = backward data pointer
92
93 mov r7, #-16 @ step for the backward 16-byte accesses
94 mov r12, #-8 @ step for the backward 8-byte table loads
95 mov r8, r6 @ r8 = forward store pointer, trails r6
96 mov r0, r3 @ r0 = backward store pointer, trails r3
97
98 vld1.32 {d0-d1}, [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
99 vld1.32 {d20-d21},[r6,:128]! @ d20=i2,r2 d21=i3,r3
100 vld1.32 {d18}, [r2,:64], r12 @ d18=s1,s0
101 vuzp.32 d20, d21 @ d20 = i2,i3 d21 = r2,r3
102 vuzp.32 d0, d1 @ d0 = i1,i0 d1 = r1,r0
@ Stage 3 loop: two 8-byte pairs from below the middle (d0, d1) and two
@ from above it (d20, d21) per pass, lr = pairs per side still to do.
103 1:
104 subs lr, lr, #2 @ flags are consumed by the beq further down
105 vmul.f32 d7, d0, d18 @ d7 = i1*s1, i0*s0
106 vld1.32 {d19}, [r5,:64]! @ d19=s2,s3
107 vmul.f32 d4, d1, d18 @ d4 = r1*s1, r0*s0
108 vld1.32 {d16}, [r1,:64], r12 @ d16=c1,c0
109 vmul.f32 d5, d21, d19 @ d5 = r2*s2, r3*s3
110 vld1.32 {d17}, [r4,:64]! @ d17=c2,c3
111 vmul.f32 d6, d20, d19 @ d6 = i2*s2, i3*s3
112 vmul.f32 d22, d1, d16 @ d22 = r1*c1, r0*c0
113 vmul.f32 d23, d21, d17 @ d23 = r2*c2, r3*c3
114 vmul.f32 d24, d0, d16 @ d24 = i1*c1, i0*c0
115 vmul.f32 d25, d20, d17 @ d25 = i2*c2, i3*c3
116 vadd.f32 d7, d7, d22 @ d7 = i*s + r*c, pairs below the middle
117 vadd.f32 d6, d6, d23 @ d6 = i*s + r*c, pairs above the middle
118 vsub.f32 d4, d4, d24 @ d4 = r*s - i*c, pairs below the middle
119 vsub.f32 d5, d5, d25 @ d5 = r*s - i*c, pairs above the middle
120 beq 1f @ count exhausted: skip the loads for a next pass
121 vld1.32 {d0-d1}, [r3,:128], r7
122 vld1.32 {d20-d21},[r6,:128]!
123 vld1.32 {d18}, [r2,:64], r12
124 vuzp.32 d20, d21
125 vuzp.32 d0, d1
@ Shuffle so that every stored pair is made of a difference (d4 or d5) in
@ its first float and, in its second float, the sum (d6 or d7) computed
@ for the mirrored position on the other side of the middle.
126 vrev64.32 q3, q3 @ swap the two lanes inside d6 and inside d7
127 vtrn.32 d4, d6 @ d4 = d4[0],d6[0] d6 = d4[1],d6[1]
128 vtrn.32 d5, d7 @ d5 = d5[0],d7[0] d7 = d5[1],d7[1]
129 vswp d5, d6 @ d4-d5 = pairs for below, d6-d7 = pairs for above
130 vst1.32 {d4-d5}, [r0,:128], r7 @ store below the middle, moving down
131 vst1.32 {d6-d7}, [r8,:128]! @ store above the middle, moving up
132 b 1b
@ Last pass: same shuffle and stores, without pointer writeback.
133 1:
134 vrev64.32 q3, q3
135 vtrn.32 d4, d6
136 vtrn.32 d5, d7
137 vswp d5, d6
138 vst1.32 {d4-d5}, [r0,:128]
139 vst1.32 {d6-d7}, [r8,:128]
140
141 pop {r4-r8,pc}
142 .endfunc
143
@ ff_imdct_calc_neon
@
@ Calls ff_imdct_half_neon with its output placed n bytes into the output
@ buffer, then fills the first n bytes and the last n bytes of the 4*n byte
@ buffer by mirroring the data on either side of the middle:
@   - first n bytes: floats just below the middle, order reversed and sign
@     flipped
@   - last n bytes:  floats just above the middle, order reversed
@
@ In:   r0 = context; the word at offset 4 is used as a shift count,
@            n = 1 << that word (ff_imdct_half_neon reads the same word as
@            nbits)
@       r1 = output buffer
@       r2 = input buffer, handed unchanged to ff_imdct_half_neon
@ Regs: r4-r6 are saved and restored here; r0-r3, d0-d5 and d30 are used as
@       scratch after the call.
144 function ff_imdct_calc_neon, export=1
145 push {r4-r6,lr}
146
147 ldr r3, [r0, #4] @ r3 = word at context + 4
148 mov r4, #1
149 mov r5, r1 @ r5 = start of the output buffer
150 lsl r4, r4, r3 @ r4 = n = 1 << r3
151 add r1, r1, r4 @ r1 = output + n bytes
152
@ r4 and r5 are still valid after the call: ff_imdct_half_neon pushes and
@ pops r4-r8.
153 bl ff_imdct_half_neon
154
155 add r0, r5, r4, lsl #2 @ r0 = output + 4*n bytes
156 add r1, r5, r4, lsl #1 @ r1 = output + 2*n bytes (the middle), read upwards
157 sub r0, r0, #8 @ r0 -> last 8 bytes of the buffer, written downwards
158 sub r2, r1, #16 @ r2 -> 16 bytes just below the middle, read downwards
159 mov r3, #-16 @ step for the backward 16-byte loads
160 mov r6, #-8 @ step for the backward 8-byte stores
161 vmov.i32 d30, #1<<31 @ d30 = sign bit set in each 32-bit lane
@ Each pass handles 16 bytes of each of the two source streams; r5 walks up
@ from the start of the buffer as the third write pointer.
162 1:
163 vld1.32 {d0-d1}, [r2,:128], r3 @ q0 = 4 floats from below the middle
164 pld [r0, #-16] @ preload hint
165 vrev64.32 q0, q0 @ swap the two floats inside d0 and inside d1
166 vld1.32 {d2-d3}, [r1,:128]! @ q1 = 4 floats from above the middle
167 veor d4, d1, d30 @ d4 = d1 with the sign bits flipped
168 pld [r2, #-16] @ preload hint
169 vrev64.32 q1, q1 @ swap the two floats inside d2 and inside d3
170 veor d5, d0, d30 @ d5 = d0 with the sign bits flipped
@ d2 goes to the higher address and d3 below it, which together with the
@ vrev64 reverses all four floats of q1.
171 vst1.32 {d2}, [r0,:64], r6
172 vst1.32 {d3}, [r0,:64], r6
173 vst1.32 {d4-d5}, [r5,:128]! @ q0 reversed and negated -> front of the buffer
174 subs r4, r4, #16 @ 16 bytes per stream per pass
175 bgt 1b @ until n bytes of each stream are done
176
177 pop {r4-r6,pc}
178 .endfunc