annotate arm/mdct_neon.S @ 12506:747e5f278c4b libavcodec

The debug text output of macroblocks can indicate MB_TYPE_INTERLACED, but it used to do it only for h264 codec. Allow it for other codecs, as mpeg2 and mpeg4 also set this flag.
author iive
date Tue, 21 Sep 2010 22:44:27 +0000
parents c80c7a717156
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
1 /*
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
2 * ARM NEON optimised MDCT
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
3 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
4 *
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
5 * This file is part of FFmpeg.
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
6 *
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
7 * FFmpeg is free software; you can redistribute it and/or
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
8 * modify it under the terms of the GNU Lesser General Public
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
9 * License as published by the Free Software Foundation; either
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
10 * version 2.1 of the License, or (at your option) any later version.
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
11 *
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
12 * FFmpeg is distributed in the hope that it will be useful,
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
15 * Lesser General Public License for more details.
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
16 *
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
17 * You should have received a copy of the GNU Lesser General Public
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
18 * License along with FFmpeg; if not, write to the Free Software
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
20 */
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
21
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
22 #include "asm.S"
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
23
11241
cbf3161706f4 ARM: add missing preserve8 directives
mru
parents: 10349
diff changeset
24 preserve8
cbf3161706f4 ARM: add missing preserve8 directives
mru
parents: 10349
diff changeset
25
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
26 .text
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
27
10346
f12b7ea2df2a ARM: apply extern symbol prefix where needed
mru
parents: 10206
diff changeset
28 #define ff_fft_calc_neon X(ff_fft_calc_neon)
f12b7ea2df2a ARM: apply extern symbol prefix where needed
mru
parents: 10206
diff changeset
29
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
@ ff_imdct_half_neon: NEON half-length inverse MDCT.
@ In:  r0 = context (fields read here: revtab at +8, mdct_bits at +20, tcos at +24)
@      r1 = output buffer, r2 = input buffer
@ Three phases: (1) multiply input pairs, read from both ends of the input,
@ by the interleaved cos/sin table and scatter the results into the output
@ through revtab; (2) call ff_fft_calc_neon; (3) multiply the FFT output by
@ the cos/sin table again, in place, walking in both directions from byte
@ offset 8*n8 of the output.
@ NOTE(review): argument roles are inferred from how r0-r2 are used below;
@ confirm against the C prototype.
30 function ff_imdct_half_neon, export=1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
31 push {r4-r8,lr} @ save the registers used below and the return address
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
32
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
33 mov r12, #1
12047
c80c7a717156 Remove vestiges of radix-2 FFT
mru
parents: 11443
diff changeset
34 ldr lr, [r0, #20] @ mdct_bits
c80c7a717156 Remove vestiges of radix-2 FFT
mru
parents: 11443
diff changeset
35 ldr r4, [r0, #24] @ tcos
10199
38ab367d4231 Merge FFTContext and MDCTContext
mru
parents: 10172
diff changeset
36 ldr r3, [r0, #8] @ revtab
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
37 lsl r12, r12, lr @ n = 1 << nbits
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
38 lsr lr, r12, #2 @ n4 = n >> 2
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
39 add r7, r2, r12, lsl #1 @ r7 = input + 2*n bytes
10205
89a852950c34 ARM: interleave cos/sin tables for improved NEON MDCT
mru
parents: 10199
diff changeset
40 mov r12, #-16 @ post-index step for the descending loads from r7
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
41 sub r7, r7, #16 @ r7 -> the 16 bytes just below input + 2*n
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
42
10160
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
43 vld2.32 {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
44 vld2.32 {d0-d1}, [r2,:128]! @ d0 =m0,x d1 =m1,x
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
45 vrev64.32 d17, d17 @ swap the two 32-bit lanes of d17
10205
89a852950c34 ARM: interleave cos/sin tables for improved NEON MDCT
mru
parents: 10199
diff changeset
46 vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s1
10160
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
47 vmul.f32 d6, d17, d2
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
48 vmul.f32 d7, d0, d2
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
49 1: @ first loop: one revtab word (two 16-bit indices) and two 8-byte stores per pass
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
50 subs lr, lr, #2 @ sets the flags tested by the beq below; nothing in between alters them
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
51 ldr r6, [r3], #4 @ r6 = next revtab word, two packed 16-bit indices
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
52 vmul.f32 d4, d0, d3
10160
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
53 vmul.f32 d5, d17, d3
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
54 vsub.f32 d4, d6, d4
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
55 vadd.f32 d5, d5, d7
10172
eda985c53dba ARM: 10l: fix large FFTs
mru
parents: 10162
diff changeset
56 uxth r8, r6, ror #16 @ r8 = upper 16-bit index
eda985c53dba ARM: 10l: fix large FFTs
mru
parents: 10162
diff changeset
57 uxth r6, r6 @ r6 = lower 16-bit index
eda985c53dba ARM: 10l: fix large FFTs
mru
parents: 10162
diff changeset
58 add r8, r1, r8, lsl #3 @ r8 = output + 8*index
eda985c53dba ARM: 10l: fix large FFTs
mru
parents: 10162
diff changeset
59 add r6, r1, r6, lsl #3 @ r6 = output + 8*index
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
60 beq 1f @ counter reached zero: skip the preload for a next pass
10160
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
61 vld2.32 {d16-d17},[r7,:128],r12
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
62 vld2.32 {d0-d1}, [r2,:128]!
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
63 vrev64.32 d17, d17
10205
89a852950c34 ARM: interleave cos/sin tables for improved NEON MDCT
mru
parents: 10199
diff changeset
64 vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s1
10160
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
65 vmul.f32 d6, d17, d2
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
66 vmul.f32 d7, d0, d2
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
67 vst2.32 {d4[0],d5[0]}, [r6,:64] @ lane 0 pair -> slot of the lower index
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
68 vst2.32 {d4[1],d5[1]}, [r8,:64] @ lane 1 pair -> slot of the upper index
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
69 b 1b
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
70 1: @ tail: store the results of the final pass
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
71 vst2.32 {d4[0],d5[0]}, [r6,:64]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
72 vst2.32 {d4[1],d5[1]}, [r8,:64]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
73
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
74 mov r4, r0 @ keep the context pointer for use after the call
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
75 mov r6, r1 @ keep the output pointer for use after the call
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
76 bl ff_fft_calc_neon @ r0/r1 still hold their entry values; NOTE(review): relies on the callee preserving r4 and r6
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
77
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
78 mov r12, #1
12047
c80c7a717156 Remove vestiges of radix-2 FFT
mru
parents: 11443
diff changeset
79 ldr lr, [r4, #20] @ mdct_bits
c80c7a717156 Remove vestiges of radix-2 FFT
mru
parents: 11443
diff changeset
80 ldr r4, [r4, #24] @ tcos
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
81 lsl r12, r12, lr @ n = 1 << nbits
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
82 lsr lr, r12, #3 @ n8 = n >> 3
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
83
10205
89a852950c34 ARM: interleave cos/sin tables for improved NEON MDCT
mru
parents: 10199
diff changeset
84 add r4, r4, lr, lsl #3 @ r4 = tcos + 8*n8 bytes (ascending table pointer)
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
85 add r6, r6, lr, lsl #3 @ r6 = output + 8*n8 bytes (ascending read pointer)
10205
89a852950c34 ARM: interleave cos/sin tables for improved NEON MDCT
mru
parents: 10199
diff changeset
86 sub r1, r4, #16 @ r1 = descending table pointer
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
87 sub r3, r6, #16 @ r3 = descending read pointer
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
88
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
89 mov r7, #-16 @ post-index step for the descending accesses
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
90 mov r8, r6 @ r8 = ascending write pointer, same start as r6 (in place)
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
91 mov r0, r3 @ r0 = descending write pointer, same start as r3 (in place)
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
92
10160
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
93 vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
94 vld2.32 {d20-d21},[r6,:128]! @ d20=i2,r2 d21=i3,r3
10205
89a852950c34 ARM: interleave cos/sin tables for improved NEON MDCT
mru
parents: 10199
diff changeset
95 vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
96 1: @ second loop: 16 bytes handled in each direction per pass
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
97 subs lr, lr, #2 @ sets the flags tested by the beq below
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
98 vmul.f32 d7, d0, d18
10205
89a852950c34 ARM: interleave cos/sin tables for improved NEON MDCT
mru
parents: 10199
diff changeset
99 vld2.32 {d17,d19},[r4,:128]! @ d17=c2,c3 d19=s2,s3
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
100 vmul.f32 d4, d1, d18
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
101 vmul.f32 d5, d21, d19
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
102 vmul.f32 d6, d20, d19
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
103 vmul.f32 d22, d1, d16
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
104 vmul.f32 d23, d21, d17
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
105 vmul.f32 d24, d0, d16
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
106 vmul.f32 d25, d20, d17
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
107 vadd.f32 d7, d7, d22
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
108 vadd.f32 d6, d6, d23
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
109 vsub.f32 d4, d4, d24
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
110 vsub.f32 d5, d5, d25
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
111 beq 1f @ counter reached zero: skip the preload for a next pass
10160
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
112 vld2.32 {d0-d1}, [r3,:128], r7
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
113 vld2.32 {d20-d21},[r6,:128]!
10205
89a852950c34 ARM: interleave cos/sin tables for improved NEON MDCT
mru
parents: 10199
diff changeset
114 vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
115 vrev64.32 q3, q3 @ swap the 32-bit lanes within d6 and within d7
10160
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
116 vst2.32 {d4,d6}, [r0,:128], r7 @ write the lower block, then step down
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
117 vst2.32 {d5,d7}, [r8,:128]! @ write the upper block, then step up
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
118 b 1b
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
119 1: @ tail: store the results of the final pass
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
120 vrev64.32 q3, q3
10160
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
121 vst2.32 {d4,d6}, [r0,:128]
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
122 vst2.32 {d5,d7}, [r8,:128]
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
123
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
124 pop {r4-r8,pc} @ restore registers and return
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 11241
diff changeset
125 endfunc
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
126
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
@ ff_imdct_calc_neon: full inverse MDCT built on ff_imdct_half_neon.
@ In:  r0 = context (word at +20 is read as the shift count for n),
@      r1 = output buffer, r2 = input buffer
@ Calls ff_imdct_half_neon with its output pointer set to output + n bytes
@ (n = 1 << [r0, #20]), then fills the rest of the buffer: data read
@ downwards from output + 2*n is reversed, sign-flipped and written upwards
@ from output; data read upwards from output + 2*n is reversed and written
@ downwards from output + 4*n.
@ NOTE(review): argument roles are inferred from register use; confirm
@ against the C prototype.
127 function ff_imdct_calc_neon, export=1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
128 push {r4-r6,lr} @ save the registers used below and the return address
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
129
12047
c80c7a717156 Remove vestiges of radix-2 FFT
mru
parents: 11443
diff changeset
130 ldr r3, [r0, #20] @ same offset that ff_imdct_half_neon labels mdct_bits
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
131 mov r4, #1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
132 mov r5, r1 @ r5 = output, needed again after the call
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
133 lsl r4, r4, r3 @ r4 = n = 1 << r3
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
134 add r1, r1, r4 @ the half transform is given output + n bytes
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
135
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
136 bl ff_imdct_half_neon @ r0 and r2 are passed through unchanged; the callee saves/restores r4-r8
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
137
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
138 add r0, r5, r4, lsl #2 @ r0 = output + 4*n bytes
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
139 add r1, r5, r4, lsl #1 @ r1 = output + 2*n bytes (ascending read pointer)
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
140 sub r0, r0, #8 @ r0 = descending write pointer, 8 bytes per store
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
141 sub r2, r1, #16 @ r2 = descending read pointer, 16 bytes per load
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
142 mov r3, #-16 @ post-index step for r2
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
143 mov r6, #-8 @ post-index step for r0
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
144 vmov.i32 d30, #1<<31 @ sign-bit mask in both 32-bit lanes
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
145 1: @ each pass moves 16 bytes from each of the two read pointers
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
146 vld1.32 {d0-d1}, [r2,:128], r3 @ lower block, read downwards
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
147 pld [r0, #-16] @ prefetch hint for the descending stores
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
148 vrev64.32 q0, q0 @ swap the 32-bit lanes within d0 and within d1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
149 vld1.32 {d2-d3}, [r1,:128]! @ upper block, read upwards
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
150 veor d4, d1, d30 @ flip the sign bits; d1 goes first so the four values end up reversed
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
151 pld [r2, #-16] @ prefetch hint for the next descending load
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
152 vrev64.32 q1, q1 @ swap the 32-bit lanes within d2 and within d3
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
153 veor d5, d0, d30 @ flip the sign bits
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
154 vst1.32 {d2}, [r0,:64], r6 @ d2 then d3 at descending addresses: reversed copy, sign unchanged
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
155 vst1.32 {d3}, [r0,:64], r6
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
156 vst1.32 {d4-d5}, [r5,:128]! @ reversed, sign-flipped copy written upwards from output
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
157 subs r4, r4, #16 @ r4 starts at n, so the loop runs n/16 times
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
158 bgt 1b
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
159
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
160 pop {r4-r6,pc} @ restore registers and return
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 11241
diff changeset
161 endfunc
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
@ ff_mdct_calc_neon: NEON forward MDCT.
@ In:  r0 = context (fields read here: revtab at +8, mdct_bits at +20, tcos at +24)
@      r1 = output buffer, r2 = input buffer
@ Three phases: (1) combine samples from four input pointers (two ascending,
@ two descending), multiply the sums/differences by the interleaved cos/sin
@ table and scatter the results into the output through revtab; (2) call
@ ff_fft_calc_neon; (3) multiply the FFT output by the cos/sin table again,
@ in place, walking in both directions from byte offset 8*n8 of the output.
@ NOTE(review): argument roles are inferred from how r0-r2 are used below;
@ confirm against the C prototype.
163 function ff_mdct_calc_neon, export=1
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
164 push {r4-r10,lr} @ save the registers used below and the return address
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
165
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
166 mov r12, #1
12047
c80c7a717156 Remove vestiges of radix-2 FFT
mru
parents: 11443
diff changeset
167 ldr lr, [r0, #20] @ mdct_bits
c80c7a717156 Remove vestiges of radix-2 FFT
mru
parents: 11443
diff changeset
168 ldr r4, [r0, #24] @ tcos
10199
38ab367d4231 Merge FFTContext and MDCTContext
mru
parents: 10172
diff changeset
169 ldr r3, [r0, #8] @ revtab
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
170 lsl lr, r12, lr @ n = 1 << nbits
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
171 add r7, r2, lr @ in4u
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
172 sub r9, r7, #16 @ in4d
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
173 add r2, r7, lr, lsl #1 @ in3u
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
174 add r8, r9, lr, lsl #1 @ in3d
10206
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
175 add r5, r4, lr, lsl #1 @ r5 = tcos + 2*n bytes
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
176 sub r5, r5, #16 @ r5 = descending table pointer
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
177 sub r3, r3, #4 @ bias revtab for the pre-indexed load in the loop
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
178 mov r12, #-16 @ post-index step for the descending loads
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
179
10206
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
180 vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
181 vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
182 vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
183 vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1
10206
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
184 vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
185 vsub.f32 d0, d18, d0 @ in4d-in4u I
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
186 vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
187 vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
188 vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
189 vadd.f32 d1, d1, d19 @ in3u+in3d -R
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
190 vsub.f32 d16, d16, d2 @ in0u-in2d R
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
191 vadd.f32 d17, d17, d3 @ in2u+in1d -I
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
192 1: @ first loop: two revtab words and four 8-byte stores per pass
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
193 vmul.f32 d7, d0, d21 @ I*s
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
194 ldr r10, [r3, lr, lsr #1] @ r10 = revtab word lr/2 bytes above r3 (used for the d24/d25 stores)
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
195 vmul.f32 d6, d1, d20 @ -R*c
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
196 ldr r6, [r3, #4]! @ r6 = next revtab word, pre-incrementing r3 (used for the d6/d7 stores)
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
197 vmul.f32 d4, d1, d21 @ -R*s
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
198 vmul.f32 d5, d0, d20 @ I*c
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
199 vmul.f32 d24, d16, d30 @ R*c
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
200 vmul.f32 d25, d17, d31 @ -I*s
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
201 vmul.f32 d22, d16, d31 @ R*s
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
202 vmul.f32 d23, d17, d30 @ I*c
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
203 subs lr, lr, #16 @ sets the flags tested by the beq below; nothing in between alters them
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
204 vsub.f32 d6, d6, d7 @ -R*c-I*s
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
205 vadd.f32 d7, d4, d5 @ -R*s+I*c
10206
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
206 vsub.f32 d24, d25, d24 @ I*s-R*c
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
207 vadd.f32 d25, d22, d23 @ R*s-I*c
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
208 beq 1f @ counter reached zero: skip the preload for a next pass
10206
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
209 mov r12, #-16 @ r12 is reused as an address scratch further down, so reload the step
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
210 vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
211 vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
212 vneg.f32 d7, d7 @ R*s-I*c
10206
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
213 vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
214 vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1
10206
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
215 vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
216 vsub.f32 d0, d18, d0 @ in4d-in4u I
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
217 vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
218 vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
219 vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
220 vadd.f32 d1, d1, d19 @ in3u+in3d -R
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
221 vsub.f32 d16, d16, d2 @ in0u-in2d R
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
222 vadd.f32 d17, d17, d3 @ in2u+in1d -I
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
223 uxth r12, r6, ror #16 @ r12 = upper 16-bit index of r6
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
224 uxth r6, r6 @ r6 = lower 16-bit index
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
225 add r12, r1, r12, lsl #3 @ r12 = output + 8*index
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
226 add r6, r1, r6, lsl #3 @ r6 = output + 8*index
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
227 vst2.32 {d6[0],d7[0]}, [r6,:64]
10206
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
228 vst2.32 {d6[1],d7[1]}, [r12,:64]
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
229 uxth r6, r10, ror #16 @ r6 = upper 16-bit index of r10
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
230 uxth r10, r10 @ r10 = lower 16-bit index
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
231 add r6 , r1, r6, lsl #3
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
232 add r10, r1, r10, lsl #3
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
233 vst2.32 {d24[0],d25[0]},[r10,:64]
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
234 vst2.32 {d24[1],d25[1]},[r6,:64]
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
235 b 1b
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
236 1: @ tail: store the results of the final pass (r6/r10 hold the last revtab words)
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
237 vneg.f32 d7, d7 @ R*s-I*c
10206
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
238 uxth r12, r6, ror #16
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
239 uxth r6, r6
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
240 add r12, r1, r12, lsl #3
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
241 add r6, r1, r6, lsl #3
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
242 vst2.32 {d6[0],d7[0]}, [r6,:64]
10206
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
243 vst2.32 {d6[1],d7[1]}, [r12,:64]
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
244 uxth r6, r10, ror #16
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
245 uxth r10, r10
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
246 add r6 , r1, r6, lsl #3
10172
eda985c53dba ARM: 10l: fix large FFTs
mru
parents: 10162
diff changeset
247 add r10, r1, r10, lsl #3
10206
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
248 vst2.32 {d24[0],d25[0]},[r10,:64]
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
249 vst2.32 {d24[1],d25[1]},[r6,:64]
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
250
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
251 mov r4, r0 @ keep the context pointer for use after the call
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
252 mov r6, r1 @ keep the output pointer for use after the call
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
253 bl ff_fft_calc_neon @ r0/r1 still hold their entry values; NOTE(review): relies on the callee preserving r4 and r6
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
254
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
255 mov r12, #1
12047
c80c7a717156 Remove vestiges of radix-2 FFT
mru
parents: 11443
diff changeset
256 ldr lr, [r4, #20] @ mdct_bits
c80c7a717156 Remove vestiges of radix-2 FFT
mru
parents: 11443
diff changeset
257 ldr r4, [r4, #24] @ tcos
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
258 lsl r12, r12, lr @ n = 1 << nbits
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
259 lsr lr, r12, #3 @ n8 = n >> 3
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
260
10205
89a852950c34 ARM: interleave cos/sin tables for improved NEON MDCT
mru
parents: 10199
diff changeset
261 add r4, r4, lr, lsl #3 @ r4 = tcos + 8*n8 bytes (ascending table pointer)
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
262 add r6, r6, lr, lsl #3 @ r6 = output + 8*n8 bytes (ascending read pointer)
10205
89a852950c34 ARM: interleave cos/sin tables for improved NEON MDCT
mru
parents: 10199
diff changeset
263 sub r1, r4, #16 @ r1 = descending table pointer
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
264 sub r3, r6, #16 @ r3 = descending read pointer
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
265
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
266 mov r7, #-16 @ post-index step for the descending accesses
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
267 mov r8, r6 @ r8 = ascending write pointer, same start as r6 (in place)
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
268 mov r0, r3 @ r0 = descending write pointer, same start as r3 (in place)
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
269
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
270 vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
271 vld2.32 {d20-d21},[r6,:128]! @ d20=r2,i2 d21=r3,i3
10205
89a852950c34 ARM: interleave cos/sin tables for improved NEON MDCT
mru
parents: 10199
diff changeset
272 vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
273 1: @ second loop: 16 bytes handled in each direction per pass
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
274 subs lr, lr, #2 @ sets the flags tested by the beq below
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
275 vmul.f32 d7, d0, d18 @ r1*s1,r0*s0
10205
89a852950c34 ARM: interleave cos/sin tables for improved NEON MDCT
mru
parents: 10199
diff changeset
276 vld2.32 {d17,d19},[r4,:128]! @ c2,c3 s2,s3
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
277 vmul.f32 d4, d1, d18 @ i1*s1,i0*s0
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
278 vmul.f32 d5, d21, d19 @ i2*s2,i3*s3
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
279 vmul.f32 d6, d20, d19 @ r2*s2,r3*s3
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
280 vmul.f32 d24, d0, d16 @ r1*c1,r0*c0
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
281 vmul.f32 d25, d20, d17 @ r2*c2,r3*c3
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
282 vmul.f32 d22, d21, d17 @ i2*c2,i3*c3
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
283 vmul.f32 d23, d1, d16 @ i1*c1,i0*c0
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
284 vadd.f32 d4, d4, d24 @ i1*s1+r1*c1,i0*s0+r0*c0
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
285 vadd.f32 d5, d5, d25 @ i2*s2+r2*c2,i3*s3+r3*c3
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
286 vsub.f32 d6, d22, d6 @ i2*c2-r2*s2,i3*c3-r3*s3
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
287 vsub.f32 d7, d23, d7 @ i1*c1-r1*s1,i0*c0-r0*s0
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
288 vneg.f32 q2, q2 @ negate d4 and d5
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
289 beq 1f @ counter reached zero: skip the preload for a next pass
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
290 vld2.32 {d0-d1}, [r3,:128], r7
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
291 vld2.32 {d20-d21},[r6,:128]!
10205
89a852950c34 ARM: interleave cos/sin tables for improved NEON MDCT
mru
parents: 10199
diff changeset
292 vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
293 vrev64.32 q3, q3 @ swap the 32-bit lanes within d6 and within d7
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
294 vst2.32 {d4,d6}, [r0,:128], r7 @ write the lower block, then step down
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
295 vst2.32 {d5,d7}, [r8,:128]! @ write the upper block, then step up
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
296 b 1b
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
297 1: @ tail: store the results of the final pass
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
298 vrev64.32 q3, q3
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
299 vst2.32 {d4,d6}, [r0,:128]
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
300 vst2.32 {d5,d7}, [r8,:128]
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
301
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
302 pop {r4-r10,pc} @ restore registers and return
11443
361a5fcb4393 ARM: set size of asm functions in object files
mru
parents: 11241
diff changeset
303 endfunc