annotate arm/fft_neon.S @ 11032:01bd040f8607 libavcodec

Unroll main loop so the edge==0 case is separate. This allows many things to be simplified away. The h264 decoder is overall 1% faster with an mbaff sample and 0.1% slower with the cathedral sample, probably because the slow loop filter code must be loaded into the code cache for the first MB of each row but isn't used for the following MBs.
author michael
date Thu, 28 Jan 2010 01:24:25 +0000
parents f12b7ea2df2a
children 361a5fcb4393
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
1 /*
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
2 * ARM NEON optimised FFT
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
3 *
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
4 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
5 * Copyright (c) 2009 Naotoshi Nojiri
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
6 *
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
7 * This file is part of FFmpeg.
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
8 *
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
9 * FFmpeg is free software; you can redistribute it and/or
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
10 * modify it under the terms of the GNU Lesser General Public
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
11 * License as published by the Free Software Foundation; either
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
12 * version 2.1 of the License, or (at your option) any later version.
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
13 *
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
14 * FFmpeg is distributed in the hope that it will be useful,
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
17 * Lesser General Public License for more details.
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
18 *
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
19 * You should have received a copy of the GNU Lesser General Public
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
20 * License along with FFmpeg; if not, write to the Free Software
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
22 */
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
23
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
24 #include "asm.S"
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
25
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
26 #define M_SQRT1_2 0.70710678118654752440
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
27
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
28 .text
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
29
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
30 function fft4_neon
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
31 vld1.32 {d0-d3}, [r0,:128]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
32
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
33 vext.32 q8, q1, q1, #1 @ q8 = i2,r3,i3,r2 (d16 = i2,r3; d17 = i3,r2)
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
34 vsub.f32 d6, d0, d1 @ r0-r1,i0-i1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
35 vsub.f32 d7, d16, d17 @ i2-i3,r3-r2
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
36 vadd.f32 d4, d0, d1 @ r0+r1,i0+i1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
37 vadd.f32 d5, d2, d3 @ r2+r3,i2+i3
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
38 vadd.f32 d1, d6, d7
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
39 vsub.f32 d3, d6, d7
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
40 vadd.f32 d0, d4, d5
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
41 vsub.f32 d2, d4, d5
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
42
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
43 vst1.32 {d0-d3}, [r0,:128]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
44
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
45 bx lr
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
46 .endfunc
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
47
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
48 function fft8_neon
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
49 mov r1, r0
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
50 vld1.32 {d0-d3}, [r1,:128]!
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
51 vld1.32 {d16-d19}, [r1,:128]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
52
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
53 movw r2, #0x04f3 @ sqrt(1/2)
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
54 movt r2, #0x3f35
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
55 eor r3, r2, #1<<31
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
56 vdup.32 d31, r2
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
57
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
58 vext.32 q11, q1, q1, #1 @ i2,r3,i3,r2
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
59 vadd.f32 d4, d16, d17 @ r4+r5,i4+i5
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
60 vmov d28, r3, r2
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
61 vadd.f32 d5, d18, d19 @ r6+r7,i6+i7
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
62 vsub.f32 d17, d16, d17 @ r4-r5,i4-i5
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
63 vsub.f32 d19, d18, d19 @ r6-r7,i6-i7
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
64 vrev64.32 d29, d28
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
65 vadd.f32 d20, d0, d1 @ r0+r1,i0+i1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
66 vadd.f32 d21, d2, d3 @ r2+r3,i2+i3
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
67 vmul.f32 d26, d17, d28 @ -a2r*w,a2i*w
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
68 vext.32 q3, q2, q2, #1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
69 vmul.f32 d27, d19, d29 @ a3r*w,-a3i*w
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
70 vsub.f32 d23, d22, d23 @ i2-i3,r3-r2
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
71 vsub.f32 d22, d0, d1 @ r0-r1,i0-i1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
72 vmul.f32 d24, d17, d31 @ a2r*w,a2i*w
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
73 vmul.f32 d25, d19, d31 @ a3r*w,a3i*w
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
74 vadd.f32 d0, d20, d21
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
75 vsub.f32 d2, d20, d21
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
76 vadd.f32 d1, d22, d23
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
77 vrev64.32 q13, q13
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
78 vsub.f32 d3, d22, d23
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
79 vsub.f32 d6, d6, d7
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
80 vadd.f32 d24, d24, d26 @ a2r+a2i,a2i-a2r t1,t2
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
81 vadd.f32 d25, d25, d27 @ a3r-a3i,a3i+a3r t5,t6
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
82 vadd.f32 d7, d4, d5
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
83 vsub.f32 d18, d2, d6
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
84 vext.32 q13, q12, q12, #1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
85 vadd.f32 d2, d2, d6
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
86 vsub.f32 d16, d0, d7
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
87 vadd.f32 d5, d25, d24
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
88 vsub.f32 d4, d26, d27
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
89 vadd.f32 d0, d0, d7
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
90 vsub.f32 d17, d1, d5
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
91 vsub.f32 d19, d3, d4
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
92 vadd.f32 d3, d3, d4
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
93 vadd.f32 d1, d1, d5
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
94
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
95 vst1.32 {d16-d19}, [r1,:128]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
96 vst1.32 {d0-d3}, [r0,:128]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
97
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
98 bx lr
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
99 .endfunc
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
100
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
101 function fft16_neon
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
102 movrel r1, mppm
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
103 vld1.32 {d16-d19}, [r0,:128]! @ q8{r0,i0,r1,i1} q9{r2,i2,r3,i3}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
104 pld [r0, #32]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
105 vld1.32 {d2-d3}, [r1,:128]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
106 vext.32 q13, q9, q9, #1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
107 vld1.32 {d22-d25}, [r0,:128]! @ q11{r4,i4,r5,i5} q12{r6,i6,r7,i7}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
108 vadd.f32 d4, d16, d17
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
109 vsub.f32 d5, d16, d17
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
110 vadd.f32 d18, d18, d19
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
111 vsub.f32 d19, d26, d27
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
112
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
113 vadd.f32 d20, d22, d23
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
114 vsub.f32 d22, d22, d23
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
115 vsub.f32 d23, d24, d25
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
116 vadd.f32 q8, q2, q9 @ {r0,i0,r1,i1}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
117 vadd.f32 d21, d24, d25
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
118 vmul.f32 d24, d22, d2
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
119 vsub.f32 q9, q2, q9 @ {r2,i2,r3,i3}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
120 vmul.f32 d25, d23, d3
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
121 vuzp.32 d16, d17 @ {r0,r1,i0,i1}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
122 vmul.f32 q1, q11, d2[1]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
123 vuzp.32 d18, d19 @ {r2,r3,i2,i3}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
124 vrev64.32 q12, q12
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
125 vadd.f32 q11, q12, q1 @ {t1a,t2a,t5,t6}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
126 vld1.32 {d24-d27}, [r0,:128]! @ q12{r8,i8,r9,i9} q13{r10,i10,r11,i11}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
127 vzip.32 q10, q11
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
128 vld1.32 {d28-d31}, [r0,:128] @ q14{r12,i12,r13,i13} q15{r14,i14,r15,i15}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
129 vadd.f32 d0, d22, d20
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
130 vadd.f32 d1, d21, d23
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
131 vsub.f32 d2, d21, d23
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
132 vsub.f32 d3, d22, d20
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
133 sub r0, r0, #96
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
134 vext.32 q13, q13, q13, #1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
135 vsub.f32 q10, q8, q0 @ {r4,r5,i4,i5}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
136 vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
137 vext.32 q15, q15, q15, #1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
138 vsub.f32 q11, q9, q1 @ {r6,r7,i6,i7}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
139 vswp d25, d26 @ q12{r8,i8,i10,r11} q13{r9,i9,i11,r10}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
140 vadd.f32 q9, q9, q1 @ {r2,r3,i2,i3}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
141 vswp d29, d30 @ q14{r12,i12,i14,r15} q15{r13,i13,i15,r14}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
142 vadd.f32 q0, q12, q13 @ {t1,t2,t5,t6}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
143 vadd.f32 q1, q14, q15 @ {t1a,t2a,t5a,t6a}
10346
f12b7ea2df2a ARM: apply extern symbol prefix where needed
mru
parents: 10172
diff changeset
144 movrel r2, X(ff_cos_16)
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
145 vsub.f32 q13, q12, q13 @ {t3,t4,t7,t8}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
146 vrev64.32 d1, d1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
147 vsub.f32 q15, q14, q15 @ {t3a,t4a,t7a,t8a}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
148 vrev64.32 d3, d3
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
149 movrel r3, pmmp
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
150 vswp d1, d26 @ q0{t1,t2,t3,t4} q13{t6,t5,t7,t8}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
151 vswp d3, d30 @ q1{t1a,t2a,t3a,t4a} q15{t6a,t5a,t7a,t8a}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
152 vadd.f32 q12, q0, q13 @ {r8,i8,r9,i9}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
153 vadd.f32 q14, q1, q15 @ {r12,i12,r13,i13}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
154 vld1.32 {d4-d5}, [r2,:64]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
155 vsub.f32 q13, q0, q13 @ {r10,i10,r11,i11}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
156 vsub.f32 q15, q1, q15 @ {r14,i14,r15,i15}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
157 vswp d25, d28 @ q12{r8,i8,r12,i12} q14{r9,i9,r13,i13}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
158 vld1.32 {d6-d7}, [r3,:128]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
159 vrev64.32 q1, q14
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
160 vmul.f32 q14, q14, d4[1]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
161 vmul.f32 q1, q1, q3
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
162 vmla.f32 q14, q1, d5[1] @ {t1a,t2a,t5a,t6a}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
163 vswp d27, d30 @ q13{r10,i10,r14,i14} q15{r11,i11,r15,i15}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
164 vzip.32 q12, q14
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
165 vadd.f32 d0, d28, d24
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
166 vadd.f32 d1, d25, d29
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
167 vsub.f32 d2, d25, d29
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
168 vsub.f32 d3, d28, d24
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
169 vsub.f32 q12, q8, q0 @ {r8,r9,i8,i9}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
170 vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
171 vsub.f32 q14, q10, q1 @ {r12,r13,i12,i13}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
172 mov r1, #32
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
173 vadd.f32 q10, q10, q1 @ {r4,r5,i4,i5}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
174 vrev64.32 q0, q13
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
175 vmul.f32 q13, q13, d5[0]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
176 vrev64.32 q1, q15
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
177 vmul.f32 q15, q15, d5[1]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
178 vst2.32 {d16-d17},[r0,:128], r1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
179 vmul.f32 q0, q0, q3
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
180 vst2.32 {d20-d21},[r0,:128], r1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
181 vmul.f32 q1, q1, q3
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
182 vmla.f32 q13, q0, d5[0] @ {t1,t2,t5,t6}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
183 vmla.f32 q15, q1, d4[1] @ {t1a,t2a,t5a,t6a}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
184 vst2.32 {d24-d25},[r0,:128], r1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
185 vst2.32 {d28-d29},[r0,:128]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
186 vzip.32 q13, q15
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
187 sub r0, r0, #80
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
188 vadd.f32 d0, d30, d26
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
189 vadd.f32 d1, d27, d31
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
190 vsub.f32 d2, d27, d31
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
191 vsub.f32 d3, d30, d26
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
192 vsub.f32 q13, q9, q0 @ {r10,r11,i10,i11}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
193 vadd.f32 q9, q9, q0 @ {r2,r3,i2,i3}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
194 vsub.f32 q15, q11, q1 @ {r14,r15,i14,i15}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
195 vadd.f32 q11, q11, q1 @ {r6,r7,i6,i7}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
196 vst2.32 {d18-d19},[r0,:128], r1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
197 vst2.32 {d22-d23},[r0,:128], r1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
198 vst2.32 {d26-d27},[r0,:128], r1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
199 vst2.32 {d30-d31},[r0,:128]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
200 bx lr
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
201 .endfunc
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
202
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
203 function fft_pass_neon
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
204 push {r4-r6,lr}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
205 mov r6, r2 @ n
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
206 lsl r5, r2, #3 @ 2 * n * sizeof FFTSample
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
207 lsl r4, r2, #4 @ 2 * n * sizeof FFTComplex
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
208 lsl r2, r2, #5 @ 4 * n * sizeof FFTComplex
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
209 add r3, r2, r4
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
210 add r4, r4, r0 @ &z[o1]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
211 add r2, r2, r0 @ &z[o2]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
212 add r3, r3, r0 @ &z[o3]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
213 vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
214 movrel r12, pmmp
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
215 vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
216 add r5, r5, r1 @ wim
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
217 vld1.32 {d6-d7}, [r12,:128] @ pmmp
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
218 vswp d21, d22
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
219 vld1.32 {d4}, [r1,:64]! @ {wre[0],wre[1]}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
220 sub r5, r5, #4 @ wim--
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
221 vrev64.32 q1, q11
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
222 vmul.f32 q11, q11, d4[1]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
223 vmul.f32 q1, q1, q3
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
224 vld1.32 {d5[0]}, [r5,:32] @ d5[0] = wim[-1]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
225 vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
226 vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
227 sub r6, r6, #1 @ n--
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
228 vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
229 vzip.32 q10, q11
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
230 vadd.f32 d0, d22, d20
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
231 vadd.f32 d1, d21, d23
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
232 vsub.f32 d2, d21, d23
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
233 vsub.f32 d3, d22, d20
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
234 vsub.f32 q10, q8, q0
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
235 vadd.f32 q8, q8, q0
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
236 vsub.f32 q11, q9, q1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
237 vadd.f32 q9, q9, q1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
238 vst2.32 {d20-d21},[r2,:128]! @ {z[o2],z[o2+1]}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
239 vst2.32 {d16-d17},[r0,:128]! @ {z[0],z[1]}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
240 vst2.32 {d22-d23},[r3,:128]! @ {z[o3],z[o3+1]}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
241 vst2.32 {d18-d19},[r4,:128]! @ {z[o1],z[o1+1]}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
242 sub r5, r5, #8 @ wim -= 2
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
243 1:
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
244 vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
245 vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
246 vswp d21, d22
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
247 vld1.32 {d4}, [r1]! @ {wre[0],wre[1]}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
248 vrev64.32 q0, q10
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
249 vmul.f32 q10, q10, d4[0]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
250 vrev64.32 q1, q11
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
251 vmul.f32 q11, q11, d4[1]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
252 vld1.32 {d5}, [r5] @ {wim[-1],wim[0]}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
253 vmul.f32 q0, q0, q3
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
254 sub r5, r5, #8 @ wim -= 2
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
255 vmul.f32 q1, q1, q3
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
256 vmla.f32 q10, q0, d5[1] @ {t1,t2,t5,t6}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
257 vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
258 vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
259 subs r6, r6, #1 @ n--
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
260 vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
261 vzip.32 q10, q11
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
262 vadd.f32 d0, d22, d20
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
263 vadd.f32 d1, d21, d23
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
264 vsub.f32 d2, d21, d23
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
265 vsub.f32 d3, d22, d20
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
266 vsub.f32 q10, q8, q0
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
267 vadd.f32 q8, q8, q0
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
268 vsub.f32 q11, q9, q1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
269 vadd.f32 q9, q9, q1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
270 vst2.32 {d20-d21}, [r2,:128]! @ {z[o2],z[o2+1]}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
271 vst2.32 {d16-d17}, [r0,:128]! @ {z[0],z[1]}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
272 vst2.32 {d22-d23}, [r3,:128]! @ {z[o3],z[o3+1]}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
273 vst2.32 {d18-d19}, [r4,:128]! @ {z[o1],z[o1+1]}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
274 bne 1b
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
275
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
276 pop {r4-r6,pc}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
277 .endfunc
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
278
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
279 .macro def_fft n, n2, n4
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
280 .align 6
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
281 function fft\n\()_neon
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
282 push {r4, lr}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
283 mov r4, r0
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
284 bl fft\n2\()_neon
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
285 add r0, r4, #\n4*2*8
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
286 bl fft\n4\()_neon
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
287 add r0, r4, #\n4*3*8
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
288 bl fft\n4\()_neon
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
289 mov r0, r4
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
290 pop {r4, lr}
10346
f12b7ea2df2a ARM: apply extern symbol prefix where needed
mru
parents: 10172
diff changeset
291 movrel r1, X(ff_cos_\n)
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
292 mov r2, #\n4/2
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
293 b fft_pass_neon
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
294 .endfunc
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
295 .endm
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
296
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
297 def_fft 32, 16, 8 @ each line emits fft<N>_neon, which runs one fft<N/2>_neon and two fft<N/4>_neon calls, then branches to fft_pass_neon
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
298 def_fft 64, 32, 16
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
299 def_fft 128, 64, 32
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
300 def_fft 256, 128, 64
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
301 def_fft 512, 256, 128
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
302 def_fft 1024, 512, 256
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
303 def_fft 2048, 1024, 512
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
304 def_fft 4096, 2048, 1024
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
305 def_fft 8192, 4096, 2048
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
306 def_fft 16384, 8192, 4096
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
307 def_fft 32768, 16384, 8192
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
308 def_fft 65536, 32768, 16384 @ largest size; fft65536_neon is the last fft_tab_neon entry
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
309
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
310 function ff_fft_calc_neon, export=1 @ in: r0 = context pointer, r1 = data pointer; dispatches to the size-specific fft<N>_neon routine
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
311 ldr r2, [r0] @ r2 = first word of the context (ff_fft_permute_neon labels this field nbits)
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
312 sub r2, r2, #2 @ table index: entry 0 is fft4_neon, i.e. nbits == 2
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
313 movrel r3, fft_tab_neon @ r3 = address of the dispatch table
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
314 ldr r3, [r3, r2, lsl #2] @ r3 = fft_tab_neon[nbits - 2] (4-byte entries); no range check, the table only covers nbits 2..16
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
315 mov r0, r1 @ the selected routine receives the data pointer in r0
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
316 bx r3 @ tail call: lr is untouched, so the routine returns straight to our caller
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
317 .endfunc
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
318
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
319 function ff_fft_permute_neon, export=1 @ in: r0 = context pointer, r1 = data pointer; scatters data into tmp_buf via revtab, then copies tmp_buf back over data
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
320 push {r4,lr} @ r4 and lr serve as scratch address registers in the scatter loop
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
321 mov r12, #1 @ seed for 1 << nbits below
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
322 ldr r2, [r0] @ nbits
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
323 ldr r3, [r0, #20] @ tmp_buf
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
324 ldr r0, [r0, #8] @ revtab
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
325 lsl r12, r12, r2 @ r12 = 1 << nbits = number of 8-byte elements; scatter loop counter
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
326 mov r2, r12 @ r2 = same count, kept for the rewind and the copy-back loop
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
327 1: @ scatter loop: two elements per iteration
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
328 vld1.32 {d0-d1}, [r1,:128]! @ load two 8-byte elements from data, advance r1 by 16 (128-bit alignment hint)
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
329 ldr r4, [r0], #4 @ one 32-bit load fetches two 16-bit revtab entries, advance r0 by 4
10172
eda985c53dba ARM: 10l: fix large FFTs
mru
parents: 10153
diff changeset
330 uxth lr, r4 @ lr = low halfword, zero-extended: destination index for d0
eda985c53dba ARM: 10l: fix large FFTs
mru
parents: 10153
diff changeset
331 uxth r4, r4, ror #16 @ r4 = high halfword, zero-extended: destination index for d1
eda985c53dba ARM: 10l: fix large FFTs
mru
parents: 10153
diff changeset
332 add lr, r3, lr, lsl #3 @ lr = tmp_buf + index * 8
eda985c53dba ARM: 10l: fix large FFTs
mru
parents: 10153
diff changeset
333 add r4, r3, r4, lsl #3 @ r4 = tmp_buf + index * 8
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
334 vst1.32 {d0}, [lr,:64] @ store first element at its permuted slot
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
335 vst1.32 {d1}, [r4,:64] @ store second element at its permuted slot
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
336 subs r12, r12, #2 @ two elements consumed
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
337 bgt 1b @ repeat while elements remain
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
338
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
339 sub r1, r1, r2, lsl #3 @ rewind r1 by count * 8 bytes, back to the start of data
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
340 1: @ copy-back loop: four elements (32 bytes) per iteration; NOTE(review): assumes count is a multiple of 4 - confirm callers never use nbits < 2
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
341 vld1.32 {d0-d3}, [r3,:128]! @ read 32 bytes from tmp_buf, advance r3
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
342 vst1.32 {d0-d3}, [r1,:128]! @ write them to data, advance r1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
343 subs r2, r2, #4 @ four elements copied
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
344 bgt 1b @ repeat while elements remain
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
345
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
346 pop {r4,pc} @ restore r4 and return (saved lr is popped into pc)
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
347 .endfunc
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
348
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
349 .section .rodata @ read-only data: dispatch table and constants
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
350 .align 4
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
351 fft_tab_neon: @ entry i is the routine for 2^(i+2) points; ff_fft_calc_neon indexes it with nbits - 2
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
352 .word fft4_neon @ index 0, nbits == 2
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
353 .word fft8_neon
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
354 .word fft16_neon
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
355 .word fft32_neon
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
356 .word fft64_neon
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
357 .word fft128_neon
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
358 .word fft256_neon
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
359 .word fft512_neon
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
360 .word fft1024_neon
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
361 .word fft2048_neon
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
362 .word fft4096_neon
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
363 .word fft8192_neon
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
364 .word fft16384_neon
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
365 .word fft32768_neon
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
366 .word fft65536_neon @ index 14, nbits == 16 (last entry)
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
367 .size fft_tab_neon, . - fft_tab_neon @ symbol size = bytes emitted since the label
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
368
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
369 .align 4
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
370 pmmp: .float +1.0, -1.0, -1.0, +1.0 @ four floats with signs +,-,-,+ (the label spells the pattern: p = plus, m = minus)
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
371 mppm: .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 @ four floats with signs -,+,+,-; M_SQRT1_2 is defined outside this excerpt, presumably 1/sqrt(2) - confirm