Mercurial > libavcodec.hg
annotate arm/mdct_neon.S @ 12483:0159a19bfff7 libavcodec
aacdec: Rework channel mapping compatibility hacks.
For a PCE based configuration map the channels solely based on tags.
For an indexed configuration map the channels solely based on position.
This works with all known exotic samples including al17, elem_id0, bad_concat,
and lfe_is_sce.
author | alexc |
---|---|
date | Fri, 10 Sep 2010 18:01:48 +0000 |
parents | c80c7a717156 |
children |
rev | line source |
---|---|
/*
 * ARM NEON optimised MDCT
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

/* preserve8 is a macro supplied by asm.S; presumably it marks this object
 * as keeping the stack 8-byte aligned (TODO confirm against asm.S). */
        preserve8

        .text

/* Every reference to the FFT entry point below goes through X(), which is
 * supplied by asm.S; presumably it applies the platform symbol prefix
 * (TODO confirm against asm.S). */
#define ff_fft_calc_neon X(ff_fft_calc_neon)

/*
 * ff_imdct_half_neon
 *
 * Presumed C prototype (TODO confirm against the C header):
 *   void ff_imdct_half_neon(FFTContext *s, FFTSample *output,
 *                           const FFTSample *input);
 *
 * In:
 *   r0 = context pointer; words are read at offset 8 (revtab),
 *        offset 20 (mdct_bits) and offset 24 (tcos)
 *   r1 = output buffer, also used as the in-place FFT work buffer
 *   r2 = input buffer, accessed with 128-bit alignment hints
 *
 * With n = 1 << mdct_bits the routine runs three stages:
 *   1. Pre-rotation.  r2 walks the input upwards and r7 walks it downwards
 *      from input + 2*n bytes.  Each step takes the even floats of the
 *      ascending group and the odd floats of the descending group, rotates
 *      them by a cos/sin pair from tcos and stores each complex result at
 *      output + 8*index.  Both 16-bit indices come from one revtab word.
 *   2. ff_fft_calc_neon(r0, r1) on the scattered data.
 *   3. Post-rotation.  Starting at the middle of the buffer
 *      (output + 8*n8) and the middle of tcos, one pointer set walks down
 *      and one walks up, rotating and storing results in place.
 *
 * Both loops leave only when their counter hits exactly zero after
 * "subs ..., #2", so n4 and n8 must be positive even numbers
 * (mdct_bits >= 4).
 *
 * r4 and r6 are expected to survive the call to ff_fft_calc_neon.
 *
 * Clobbers: r0-r3, r12, flags, d0-d7, d16-d25, plus whatever
 *           ff_fft_calc_neon clobbers.  r4-r8 are saved and restored.
 */
function ff_imdct_half_neon, export=1
        push            {r4-r8,lr}

        mov             r12, #1
        ldr             lr,  [r0, #20]          @ mdct_bits
        ldr             r4,  [r0, #24]          @ tcos
        ldr             r3,  [r0, #8]           @ revtab
        lsl             r12, r12, lr            @ n  = 1 << nbits
        lsr             lr,  r12, #2            @ n4 = n >> 2
        add             r7,  r2,  r12,  lsl #1  @ r7 = input + 2*n bytes
        mov             r12, #-16               @ post-decrement step for r7
        sub             r7,  r7,  #16           @ last 16-byte group of input

        @ Prologue of the software-pipelined pre-rotation loop: the first
        @ loads and the two cos products are issued before entering it.
        vld2.32         {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
        vld2.32         {d0-d1},  [r2,:128]!    @ d0 =m0,x d1 =m1,x
        vrev64.32       d17, d17
        vld2.32         {d2,d3},  [r4,:128]!    @ d2=c0,c1 d3=s0,s1
        vmul.f32        d6,  d17, d2
        vmul.f32        d7,  d0,  d2
1:
        subs            lr,  lr,  #2            @ two complex outputs per pass
        ldr             r6,  [r3], #4           @ one word = two revtab entries
        vmul.f32        d4,  d0,  d3
        vmul.f32        d5,  d17, d3
        vsub.f32        d4,  d6,  d4
        vadd.f32        d5,  d5,  d7
        uxth            r8,  r6,  ror #16       @ upper 16-bit index
        uxth            r6,  r6                 @ lower 16-bit index
        add             r8,  r1,  r8,  lsl #3   @ 8 bytes per complex value
        add             r6,  r1,  r6,  lsl #3
        beq             1f                      @ flags still from the subs
        vld2.32         {d16-d17},[r7,:128],r12
        vld2.32         {d0-d1},  [r2,:128]!
        vrev64.32       d17, d17
        vld2.32         {d2,d3},  [r4,:128]!    @ d2=c0,c1 d3=s0,s1
        vmul.f32        d6,  d17, d2
        vmul.f32        d7,  d0,  d2
        vst2.32         {d4[0],d5[0]}, [r6,:64]
        vst2.32         {d4[1],d5[1]}, [r8,:64]
        b               1b
1:
        @ Final pass: store the last pair without loading any further data.
        vst2.32         {d4[0],d5[0]}, [r6,:64]
        vst2.32         {d4[1],d5[1]}, [r8,:64]

        mov             r4,  r0                 @ keep context across the call
        mov             r6,  r1                 @ keep output across the call
        bl              ff_fft_calc_neon

        mov             r12, #1
        ldr             lr,  [r4, #20]          @ mdct_bits
        ldr             r4,  [r4, #24]          @ tcos
        lsl             r12, r12, lr            @ n  = 1 << nbits
        lsr             lr,  r12, #3            @ n8 = n >> 3

        add             r4,  r4,  lr,  lsl #3   @ middle of tcos, walks up
        add             r6,  r6,  lr,  lsl #3   @ middle of output, walks up
        sub             r1,  r4,  #16           @ tcos read pointer, walks down
        sub             r3,  r6,  #16           @ data read pointer, walks down

        mov             r7,  #-16               @ step for descending pointers
        mov             r8,  r6                 @ store pointer, walks up
        mov             r0,  r3                 @ store pointer, walks down

        vld2.32         {d0-d1},  [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
        vld2.32         {d20-d21},[r6,:128]!    @ d20=i2,r2 d21=i3,r3
        vld2.32         {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
1:
        subs            lr,  lr,  #2
        vmul.f32        d7,  d0,  d18
        vld2.32         {d17,d19},[r4,:128]!    @ d17=c2,c3 d19=s2,s3
        vmul.f32        d4,  d1,  d18
        vmul.f32        d5,  d21, d19
        vmul.f32        d6,  d20, d19
        vmul.f32        d22, d1,  d16
        vmul.f32        d23, d21, d17
        vmul.f32        d24, d0,  d16
        vmul.f32        d25, d20, d17
        vadd.f32        d7,  d7,  d22
        vadd.f32        d6,  d6,  d23
        vsub.f32        d4,  d4,  d24
        vsub.f32        d5,  d5,  d25
        beq             1f                      @ flags still from the subs
        vld2.32         {d0-d1},  [r3,:128], r7
        vld2.32         {d20-d21},[r6,:128]!
        vld2.32         {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128], r7
        vst2.32         {d5,d7},  [r8,:128]!
        b               1b
1:
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128]
        vst2.32         {d5,d7},  [r8,:128]

        pop             {r4-r8,pc}
endfunc

/*
 * ff_imdct_calc_neon
 *
 * Presumed C prototype (TODO confirm against the C header):
 *   void ff_imdct_calc_neon(FFTContext *s, FFTSample *output,
 *                           const FFTSample *input);
 *
 * In:
 *   r0 = context pointer; the word at offset 20 is used as a shift count
 *        (the same field the other routines in this file label mdct_bits)
 *   r1 = output buffer, accessed with 128-bit alignment hints
 *   r2 = input buffer, passed through untouched to ff_imdct_half_neon
 *
 * With n = 1 << [r0, #20]:
 *   1. ff_imdct_half_neon is called with its output at output + n bytes.
 *   2. A mirroring loop then fills the rest of the buffer.  For every
 *      16-byte group it
 *        - reads downwards from just below output + 2*n, reverses the four
 *          floats, flips their sign bits and stores them upwards from
 *          output;
 *        - reads upwards from output + 2*n, reverses the four floats and
 *          stores them downwards, ending at output + 4*n.
 *      The loop subtracts 16 from n per pass and repeats while the result
 *      is greater than zero.
 *
 * r4 and r5 are expected to survive the call to ff_imdct_half_neon.
 *
 * Clobbers: r0-r3, flags, d0-d5, d30, plus whatever ff_imdct_half_neon
 *           clobbers.  r4-r6 are saved and restored.
 */
function ff_imdct_calc_neon, export=1
        push            {r4-r6,lr}

        ldr             r3,  [r0, #20]
        mov             r4,  #1
        mov             r5,  r1                 @ keep the output base
        lsl             r4,  r4,  r3            @ n = 1 << [r0, #20]
        add             r1,  r1,  r4            @ half transform at output + n

        bl              ff_imdct_half_neon

        add             r0,  r5,  r4,  lsl #2   @ output + 4*n
        add             r1,  r5,  r4,  lsl #1   @ output + 2*n, read upwards
        sub             r0,  r0,  #8            @ last 8 bytes, stored downwards
        sub             r2,  r1,  #16           @ group below r1, read downwards
        mov             r3,  #-16
        mov             r6,  #-8
        vmov.i32        d30, #1<<31             @ float sign-bit mask
1:
        vld1.32         {d0-d1},  [r2,:128], r3
        pld             [r0, #-16]
        vrev64.32       q0,  q0
        vld1.32         {d2-d3},  [r1,:128]!
        veor            d4,  d1,  d30           @ halves swapped and negated
        pld             [r2, #-16]
        vrev64.32       q1,  q1
        veor            d5,  d0,  d30
        vst1.32         {d2},     [r0,:64], r6  @ two descending stores
        vst1.32         {d3},     [r0,:64], r6  @ complete the reversal
        vst1.32         {d4-d5},  [r5,:128]!
        subs            r4,  r4,  #16
        bgt             1b

        pop             {r4-r6,pc}
endfunc

/*
 * ff_mdct_calc_neon
 *
 * Presumed C prototype (TODO confirm against the C header):
 *   void ff_mdct_calc_neon(FFTContext *s, FFTSample *output,
 *                          const FFTSample *input);
 *
 * In:
 *   r0 = context pointer; words are read at offset 8 (revtab),
 *        offset 20 (mdct_bits) and offset 24 (tcos)
 *   r1 = output buffer, also used as the in-place FFT work buffer
 *   r2 = input buffer, accessed with 128-bit alignment hints
 *
 * With n = 1 << mdct_bits the routine runs three stages:
 *   1. Pre-rotation.  Four pointers walk the input: r7 and r2 upwards, r9
 *      and r8 downwards (in4u, in3u, in4d, in3d in the comments below).
 *      The folded sums and differences are rotated by cos/sin pairs read
 *      from both ends of the first 2*n bytes of tcos (r4 up, r5 down).
 *      Each pass scatters four complex values to output + 8*index.  The
 *      16-bit indices come from two revtab words: one read with r3
 *      (walking up) and one read at r3 + lr/2 (walking down, because lr
 *      shrinks by 16 per pass).
 *   2. ff_fft_calc_neon(r0, r1) on the scattered data.
 *   3. Post-rotation from the middle of the buffer outwards, in place.
 *
 * The pre-rotation loop leaves only when lr hits exactly zero after
 * "subs lr, lr, #16", and the post-rotation loop only when n8 hits zero
 * after "subs lr, lr, #2"; n must therefore be a positive multiple of 16.
 *
 * r4 and r6 are expected to survive the call to ff_fft_calc_neon.
 *
 * Clobbers: r0-r3, r12, flags, d0-d7, d16-d25, d30, d31, plus whatever
 *           ff_fft_calc_neon clobbers.  r4-r10 are saved and restored.
 */
function ff_mdct_calc_neon, export=1
        push            {r4-r10,lr}

        mov             r12, #1
        ldr             lr,  [r0, #20]          @ mdct_bits
        ldr             r4,  [r0, #24]          @ tcos
        ldr             r3,  [r0, #8]           @ revtab
        lsl             lr,  r12, lr            @ n  = 1 << nbits
        add             r7,  r2,  lr            @ in4u
        sub             r9,  r7,  #16           @ in4d
        add             r2,  r7,  lr,  lsl #1   @ in3u
        add             r8,  r9,  lr,  lsl #1   @ in3d
        add             r5,  r4,  lr,  lsl #1
        sub             r5,  r5,  #16           @ tcos + 2*n - 16, walks down
        sub             r3,  r3,  #4            @ bias for the pre-indexed ldr
        mov             r12, #-16

        vld2.32         {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
        vld2.32         {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
        vld2.32         {d0, d2}, [r7,:128]!    @ in4u0,in4u1 in2d1,in2d0
        vrev64.32       q9,  q9                 @ in4d0,in4d1 in3d0,in3d1
        vld2.32         {d1, d3}, [r2,:128]!    @ in3u0,in3u1 in1d1,in1d0
        vsub.f32        d0,  d18, d0            @ in4d-in4u      I
        vld2.32         {d20,d21},[r4,:128]!    @ c0,c1 s0,s1
        vrev64.32       q1,  q1                 @ in2d0,in2d1 in1d0,in1d1
        vld2.32         {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
        vadd.f32        d1,  d1,  d19           @ in3u+in3d     -R
        vsub.f32        d16, d16, d2            @ in0u-in2d      R
        vadd.f32        d17, d17, d3            @ in2u+in1d     -I
1:
        vmul.f32        d7,  d0,  d21           @  I*s
        ldr             r10, [r3, lr, lsr #1]
        vmul.f32        d6,  d1,  d20           @ -R*c
        ldr             r6,  [r3, #4]!
        vmul.f32        d4,  d1,  d21           @ -R*s
        vmul.f32        d5,  d0,  d20           @  I*c
        vmul.f32        d24, d16, d30           @  R*c
        vmul.f32        d25, d17, d31           @ -I*s
        vmul.f32        d22, d16, d31           @  R*s
        vmul.f32        d23, d17, d30           @  I*c
        subs            lr,  lr,  #16
        vsub.f32        d6,  d6,  d7            @ -R*c-I*s
        vadd.f32        d7,  d4,  d5            @ -R*s+I*c
        vsub.f32        d24, d25, d24           @ I*s-R*c
        vadd.f32        d25, d22, d23           @ R*s-I*c
        beq             1f                      @ flags still from the subs
        mov             r12, #-16               @ r12 was reused as an address
        vld2.32         {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
        vld2.32         {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
        vneg.f32        d7,  d7                 @  R*s-I*c
        vld2.32         {d0, d2}, [r7,:128]!    @ in4u0,in4u1 in2d1,in2d0
        vrev64.32       q9,  q9                 @ in4d0,in4d1 in3d0,in3d1
        vld2.32         {d1, d3}, [r2,:128]!    @ in3u0,in3u1 in1d1,in1d0
        vsub.f32        d0,  d18, d0            @ in4d-in4u      I
        vld2.32         {d20,d21},[r4,:128]!    @ c0,c1 s0,s1
        vrev64.32       q1,  q1                 @ in2d0,in2d1 in1d0,in1d1
        vld2.32         {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
        vadd.f32        d1,  d1,  d19           @ in3u+in3d     -R
        vsub.f32        d16, d16, d2            @ in0u-in2d      R
        vadd.f32        d17, d17, d3            @ in2u+in1d     -I
        uxth            r12, r6,  ror #16
        uxth            r6,  r6
        add             r12, r1,  r12, lsl #3
        add             r6,  r1,  r6,  lsl #3
        vst2.32         {d6[0],d7[0]}, [r6,:64]
        vst2.32         {d6[1],d7[1]}, [r12,:64]
        uxth            r6,  r10, ror #16
        uxth            r10, r10
        add             r6 , r1,  r6,  lsl #3
        add             r10, r1,  r10, lsl #3
        vst2.32         {d24[0],d25[0]},[r10,:64]
        vst2.32         {d24[1],d25[1]},[r6,:64]
        b               1b
1:
        @ Final pass: store the last four values without loading more data.
        vneg.f32        d7,  d7                 @  R*s-I*c
        uxth            r12, r6,  ror #16
        uxth            r6,  r6
        add             r12, r1,  r12, lsl #3
        add             r6,  r1,  r6,  lsl #3
        vst2.32         {d6[0],d7[0]}, [r6,:64]
        vst2.32         {d6[1],d7[1]}, [r12,:64]
        uxth            r6,  r10, ror #16
        uxth            r10, r10
        add             r6 , r1,  r6,  lsl #3
        add             r10, r1,  r10, lsl #3
        vst2.32         {d24[0],d25[0]},[r10,:64]
        vst2.32         {d24[1],d25[1]},[r6,:64]

        mov             r4,  r0                 @ keep context across the call
        mov             r6,  r1                 @ keep output across the call
        bl              ff_fft_calc_neon

        mov             r12, #1
        ldr             lr,  [r4, #20]          @ mdct_bits
        ldr             r4,  [r4, #24]          @ tcos
        lsl             r12, r12, lr            @ n  = 1 << nbits
        lsr             lr,  r12, #3            @ n8 = n >> 3

        add             r4,  r4,  lr,  lsl #3   @ middle of tcos, walks up
        add             r6,  r6,  lr,  lsl #3   @ middle of output, walks up
        sub             r1,  r4,  #16           @ tcos read pointer, walks down
        sub             r3,  r6,  #16           @ data read pointer, walks down

        mov             r7,  #-16               @ step for descending pointers
        mov             r8,  r6                 @ store pointer, walks up
        mov             r0,  r3                 @ store pointer, walks down

        vld2.32         {d0-d1},  [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0
        vld2.32         {d20-d21},[r6,:128]!    @ d20=r2,i2 d21=r3,i3
        vld2.32         {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
1:
        subs            lr,  lr,  #2
        vmul.f32        d7,  d0,  d18           @ r1*s1,r0*s0
        vld2.32         {d17,d19},[r4,:128]!    @ c2,c3 s2,s3
        vmul.f32        d4,  d1,  d18           @ i1*s1,i0*s0
        vmul.f32        d5,  d21, d19           @ i2*s2,i3*s3
        vmul.f32        d6,  d20, d19           @ r2*s2,r3*s3
        vmul.f32        d24, d0,  d16           @ r1*c1,r0*c0
        vmul.f32        d25, d20, d17           @ r2*c2,r3*c3
        vmul.f32        d22, d21, d17           @ i2*c2,i3*c3
        vmul.f32        d23, d1,  d16           @ i1*c1,i0*c0
        vadd.f32        d4,  d4,  d24           @ i1*s1+r1*c1,i0*s0+r0*c0
        vadd.f32        d5,  d5,  d25           @ i2*s2+r2*c2,i3*s3+r3*c3
        vsub.f32        d6,  d22, d6            @ i2*c2-r2*s2,i3*c3-r3*s3
        vsub.f32        d7,  d23, d7            @ i1*c1-r1*s1,i0*c0-r0*s0
        vneg.f32        q2,  q2
        beq             1f                      @ flags still from the subs
        vld2.32         {d0-d1},  [r3,:128], r7
        vld2.32         {d20-d21},[r6,:128]!
        vld2.32         {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128], r7
        vst2.32         {d5,d7},  [r8,:128]!
        b               1b
1:
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128]
        vst2.32         {d5,d7},  [r8,:128]

        pop             {r4-r10,pc}
endfunc