annotate arm/mdct_neon.S @ 11034:fd5921186064 libavcodec

Make the fast loop filter path work with unavailable left MBs. This prevents the issue with having to switch between slow and fast code paths in each row. 0.5% faster loopfilter for cathedral
author michael
date Thu, 28 Jan 2010 02:15:25 +0000
parents be725249ea67
children cbf3161706f4
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
1 /*
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
2 * ARM NEON optimised MDCT
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
3 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
4 *
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
5 * This file is part of FFmpeg.
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
6 *
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
7 * FFmpeg is free software; you can redistribute it and/or
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
8 * modify it under the terms of the GNU Lesser General Public
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
9 * License as published by the Free Software Foundation; either
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
10 * version 2.1 of the License, or (at your option) any later version.
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
11 *
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
12 * FFmpeg is distributed in the hope that it will be useful,
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
15 * Lesser General Public License for more details.
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
16 *
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
17 * You should have received a copy of the GNU Lesser General Public
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
18 * License along with FFmpeg; if not, write to the Free Software
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
20 */
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
21
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
22 #include "asm.S"
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
23
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
24 .text
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
25
10346
f12b7ea2df2a ARM: apply extern symbol prefix where needed
mru
parents: 10206
diff changeset
26 #define ff_fft_calc_neon X(ff_fft_calc_neon)
f12b7ea2df2a ARM: apply extern symbol prefix where needed
mru
parents: 10206
diff changeset
27
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
@ ff_imdct_half_neon: multiply the input by the cos/sin table and scatter the
@ products into the output buffer at revtab-selected positions, run
@ ff_fft_calc_neon on that buffer, then multiply the result by the cos/sin
@ table again in place, working outwards from the middle of the buffer.
@ In: r0 = context (mdct_bits at #28, tcos at #32, revtab at #8),
@ r1 = output buffer, r2 = input buffer.
@ Both loops count lr down by 2 and leave only on an exact zero, so n4 and n8
@ are presumably always even and non-zero here - TODO confirm against callers.
28 function ff_imdct_half_neon, export=1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
29 push {r4-r8,lr} @ r4-r8 are used as scratch below
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
30
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
31 mov r12, #1
10199
38ab367d4231 Merge FFTContext and MDCTContext
mru
parents: 10172
diff changeset
32 ldr lr, [r0, #28] @ mdct_bits
38ab367d4231 Merge FFTContext and MDCTContext
mru
parents: 10172
diff changeset
33 ldr r4, [r0, #32] @ tcos
38ab367d4231 Merge FFTContext and MDCTContext
mru
parents: 10172
diff changeset
34 ldr r3, [r0, #8] @ revtab
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
35 lsl r12, r12, lr @ n = 1 << nbits
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
36 lsr lr, r12, #2 @ n4 = n >> 2
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
37 add r7, r2, r12, lsl #1 @ r7 = input + 2*n bytes
10205
89a852950c34 ARM: interleave cos/sin tables for improved NEON MDCT
mru
parents: 10199
diff changeset
38 mov r12, #-16 @ post-index step: r7 moves down 16 bytes per load
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
39 sub r7, r7, #16 @ r7 = last 16 bytes below input + 2*n
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
40
10160
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
41 vld2.32 {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
42 vld2.32 {d0-d1}, [r2,:128]! @ d0 =m0,x d1 =m1,x
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
43 vrev64.32 d17, d17 @ swap the two lanes of d17
10205
89a852950c34 ARM: interleave cos/sin tables for improved NEON MDCT
mru
parents: 10199
diff changeset
44 vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s1
10160
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
45 vmul.f32 d6, d17, d2
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
46 vmul.f32 d7, d0, d2
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
47 1: @ first loop: two complex outputs per pass
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
48 subs lr, lr, #2 @ Z is set on the last pass and tested by beq below
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
49 ldr r6, [r3], #4 @ next revtab word: two 16-bit indices
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
50 vmul.f32 d4, d0, d3
10160
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
51 vmul.f32 d5, d17, d3
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
52 vsub.f32 d4, d6, d4
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
53 vadd.f32 d5, d5, d7
10172
eda985c53dba ARM: 10l: fix large FFTs
mru
parents: 10162
diff changeset
54 uxth r8, r6, ror #16 @ r8 = high halfword of the revtab word
eda985c53dba ARM: 10l: fix large FFTs
mru
parents: 10162
diff changeset
55 uxth r6, r6 @ r6 = low halfword of the revtab word
eda985c53dba ARM: 10l: fix large FFTs
mru
parents: 10162
diff changeset
56 add r8, r1, r8, lsl #3 @ r8 = output + 8*index
eda985c53dba ARM: 10l: fix large FFTs
mru
parents: 10162
diff changeset
57 add r6, r1, r6, lsl #3 @ r6 = output + 8*index
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
58 beq 1f @ last pass: skip the loads for a further pass
10160
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
59 vld2.32 {d16-d17},[r7,:128],r12 @ loads for the next pass are issued before the stores of this one
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
60 vld2.32 {d0-d1}, [r2,:128]!
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
61 vrev64.32 d17, d17
10205
89a852950c34 ARM: interleave cos/sin tables for improved NEON MDCT
mru
parents: 10199
diff changeset
62 vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s1
10160
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
63 vmul.f32 d6, d17, d2
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
64 vmul.f32 d7, d0, d2
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
65 vst2.32 {d4[0],d5[0]}, [r6,:64] @ lane 0 pair goes to the low-halfword index
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
66 vst2.32 {d4[1],d5[1]}, [r8,:64] @ lane 1 pair goes to the high-halfword index
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
67 b 1b
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
68 1: @ stores for the final pass
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
69 vst2.32 {d4[0],d5[0]}, [r6,:64]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
70 vst2.32 {d4[1],d5[1]}, [r8,:64]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
71
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
72 mov r4, r0 @ keep the context pointer for use after the call
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
73 mov r6, r1 @ keep the output pointer for use after the call
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
74 bl ff_fft_calc_neon @ r0 and r1 still hold the values passed in
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
75
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
76 mov r12, #1
10199
38ab367d4231 Merge FFTContext and MDCTContext
mru
parents: 10172
diff changeset
77 ldr lr, [r4, #28] @ mdct_bits
38ab367d4231 Merge FFTContext and MDCTContext
mru
parents: 10172
diff changeset
78 ldr r4, [r4, #32] @ tcos
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
79 lsl r12, r12, lr @ n = 1 << nbits
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
80 lsr lr, r12, #3 @ n8 = n >> 3
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
81
10205
89a852950c34 ARM: interleave cos/sin tables for improved NEON MDCT
mru
parents: 10199
diff changeset
82 add r4, r4, lr, lsl #3 @ r4 = tcos + 8*n8, read upwards
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
83 add r6, r6, lr, lsl #3 @ r6 = output + 8*n8, read upwards
10205
89a852950c34 ARM: interleave cos/sin tables for improved NEON MDCT
mru
parents: 10199
diff changeset
84 sub r1, r4, #16 @ r1 reads tcos downwards from just below r4
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
85 sub r3, r6, #16 @ r3 reads output downwards from just below r6
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
86
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
87 mov r7, #-16 @ negative post-index step for the downward pointers
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
88 mov r8, r6 @ r8 = upward store pointer, follows r6
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
89 mov r0, r3 @ r0 = downward store pointer, follows r3
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
90
10160
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
91 vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
92 vld2.32 {d20-d21},[r6,:128]! @ d20=i2,r2 d21=i3,r3
10205
89a852950c34 ARM: interleave cos/sin tables for improved NEON MDCT
mru
parents: 10199
diff changeset
93 vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
94 1: @ second loop: four complex values per pass, two on each side of the middle
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
95 subs lr, lr, #2 @ Z is set on the last pass and tested by beq below
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
96 vmul.f32 d7, d0, d18
10205
89a852950c34 ARM: interleave cos/sin tables for improved NEON MDCT
mru
parents: 10199
diff changeset
97 vld2.32 {d17,d19},[r4,:128]! @ d17=c2,c3 d19=s2,s3
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
98 vmul.f32 d4, d1, d18
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
99 vmul.f32 d5, d21, d19
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
100 vmul.f32 d6, d20, d19
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
101 vmul.f32 d22, d1, d16
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
102 vmul.f32 d23, d21, d17
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
103 vmul.f32 d24, d0, d16
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
104 vmul.f32 d25, d20, d17
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
105 vadd.f32 d7, d7, d22
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
106 vadd.f32 d6, d6, d23
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
107 vsub.f32 d4, d4, d24
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
108 vsub.f32 d5, d5, d25
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
109 beq 1f @ last pass: skip the loads for a further pass
10160
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
110 vld2.32 {d0-d1}, [r3,:128], r7
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
111 vld2.32 {d20-d21},[r6,:128]!
10205
89a852950c34 ARM: interleave cos/sin tables for improved NEON MDCT
mru
parents: 10199
diff changeset
112 vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
113 vrev64.32 q3, q3 @ swap the lanes within d6 and within d7
10160
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
114 vst2.32 {d4,d6}, [r0,:128], r7 @ interleaved store below the middle, moving down
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
115 vst2.32 {d5,d7}, [r8,:128]! @ interleaved store above the middle, moving up
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
116 b 1b
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
117 1: @ stores for the final pass
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
118 vrev64.32 q3, q3
10160
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
119 vst2.32 {d4,d6}, [r0,:128]
75bab19c59a2 ARM: faster NEON IMDCT
mru
parents: 10153
diff changeset
120 vst2.32 {d5,d7}, [r8,:128]
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
121
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
122 pop {r4-r8,pc} @ restore r4-r8 and return via the saved lr
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
123 .endfunc
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
124
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
@ ff_imdct_calc_neon: call ff_imdct_half_neon with its output placed n bytes
@ into the output buffer (n = 1 << the word at context offset #28), then fill
@ the n bytes below and the n bytes above that 2*n-byte result from it:
@ the low part gets a reversed, sign-flipped copy and the high part gets a
@ reversed copy.
@ In: r0 = context, r1 = output buffer, r2 = input buffer (r0 and r2 are
@ passed through unchanged to ff_imdct_half_neon).
125 function ff_imdct_calc_neon, export=1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
126 push {r4-r6,lr}
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
127
10199
38ab367d4231 Merge FFTContext and MDCTContext
mru
parents: 10172
diff changeset
128 ldr r3, [r0, #28] @ same context field the other functions label mdct_bits
10153
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
129 mov r4, #1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
130 mov r5, r1 @ r5 = start of the output buffer
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
131 lsl r4, r4, r3 @ r4 = n = 1 << bits
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
132 add r1, r1, r4 @ the half transform writes at output + n bytes
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
133
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
134 bl ff_imdct_half_neon @ r4 and r5 survive: the callee saves and restores r4-r8
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
135
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
136 add r0, r5, r4, lsl #2 @ r0 = output + 4*n bytes
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
137 add r1, r5, r4, lsl #1 @ r1 = output + 2*n bytes, read upwards
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
138 sub r0, r0, #8 @ r0 = last 8 bytes below output + 4*n, written downwards
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
139 sub r2, r1, #16 @ r2 = 16 bytes below r1, read downwards
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
140 mov r3, #-16 @ post-index step for r2
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
141 mov r6, #-8 @ post-index step for r0
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
142 vmov.i32 d30, #1<<31 @ bit 31 set in each 32-bit lane: the float sign bit
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
143 1: @ each pass handles 16 bytes on each side
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
144 vld1.32 {d0-d1}, [r2,:128], r3 @ four values from below the middle
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
145 pld [r0, #-16]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
146 vrev64.32 q0, q0 @ swap the lanes within d0 and within d1
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
147 vld1.32 {d2-d3}, [r1,:128]! @ four values from above the middle
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
148 veor d4, d1, d30 @ flip sign bits; taking d1 first completes the 4-value reversal
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
149 pld [r2, #-16]
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
150 vrev64.32 q1, q1 @ swap the lanes within d2 and within d3
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
151 veor d5, d0, d30 @ flip sign bits of the other half
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
152 vst1.32 {d2}, [r0,:64], r6 @ d2 then d3 at descending addresses: reversed copy
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
153 vst1.32 {d3}, [r0,:64], r6
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
154 vst1.32 {d4-d5}, [r5,:128]! @ reversed, sign-flipped values go to the start of the output
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
155 subs r4, r4, #16 @ 16 of the n bytes done
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
156 bgt 1b @ repeat while bytes remain
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
157
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
158 pop {r4-r6,pc} @ restore r4-r6 and return via the saved lr
7a63015e4627 ARM: NEON optimised FFT and MDCT
mru
parents:
diff changeset
159 .endfunc
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
160
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
@ ff_mdct_calc_neon: combine input samples read from four moving pointers
@ into R/I pairs, multiply them by the cos/sin table and scatter the products
@ into the output buffer at revtab-selected positions, run ff_fft_calc_neon
@ on that buffer, then multiply the result by the cos/sin table again in
@ place, working outwards from the middle of the buffer.
@ In: r0 = context (mdct_bits at #28, tcos at #32, revtab at #8),
@ r1 = output buffer, r2 = input buffer.
@ The first loop counts lr down by 16 and the second by 2, and both leave only
@ on an exact zero, so n is presumably a multiple of 16 - TODO confirm against callers.
161 function ff_mdct_calc_neon, export=1
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
162 push {r4-r10,lr} @ r4-r10 are used as scratch below
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
163
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
164 mov r12, #1
10199
38ab367d4231 Merge FFTContext and MDCTContext
mru
parents: 10172
diff changeset
165 ldr lr, [r0, #28] @ mdct_bits
38ab367d4231 Merge FFTContext and MDCTContext
mru
parents: 10172
diff changeset
166 ldr r4, [r0, #32] @ tcos
38ab367d4231 Merge FFTContext and MDCTContext
mru
parents: 10172
diff changeset
167 ldr r3, [r0, #8] @ revtab
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
168 lsl lr, r12, lr @ n = 1 << nbits
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
169 add r7, r2, lr @ in4u
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
170 sub r9, r7, #16 @ in4d
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
171 add r2, r7, lr, lsl #1 @ in3u
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
172 add r8, r9, lr, lsl #1 @ in3d
10206
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
173 add r5, r4, lr, lsl #1 @ r5 = tcos + 2*n bytes
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
174 sub r5, r5, #16 @ r5 reads tcos downwards, r4 reads it upwards
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
175 sub r3, r3, #4 @ bias revtab so the pre-indexed load [r3, #4]! starts at its first word
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
176 mov r12, #-16 @ negative post-index step for the downward pointers
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
177
10206
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
178 vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
179 vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
180 vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
181 vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1
10206
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
182 vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
183 vsub.f32 d0, d18, d0 @ in4d-in4u I
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
184 vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
185 vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
186 vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
187 vadd.f32 d1, d1, d19 @ in3u+in3d -R
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
188 vsub.f32 d16, d16, d2 @ in0u-in2d R
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
189 vadd.f32 d17, d17, d3 @ in2u+in1d -I
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
190 1: @ first loop: four complex outputs per pass
10206
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
191 vmul.f32 d7, d0, d21 @ I*s
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
192 ldr r10, [r3, lr, lsr #1] @ revtab word at r3 + lr/2, used for the d24/d25 stores
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
193 vmul.f32 d6, d1, d20 @ -R*c
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
194 ldr r6, [r3, #4]! @ next revtab word going up, used for the d6/d7 stores
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
195 vmul.f32 d4, d1, d21 @ -R*s
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
196 vmul.f32 d5, d0, d20 @ I*c
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
197 vmul.f32 d24, d16, d30 @ R*c
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
198 vmul.f32 d25, d17, d31 @ -I*s
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
199 vmul.f32 d22, d16, d31 @ R*s
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
200 vmul.f32 d23, d17, d30 @ -I*c
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
201 subs lr, lr, #16 @ Z is set on the last pass and tested by beq below
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
202 vsub.f32 d6, d6, d7 @ -R*c-I*s
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
203 vadd.f32 d7, d4, d5 @ -R*s+I*c
10206
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
204 vsub.f32 d24, d25, d24 @ -I*s-R*c
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
205 vadd.f32 d25, d22, d23 @ R*s-I*c
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
206 beq 1f @ last pass: skip the loads for a further pass
10206
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
207 mov r12, #-16 @ reload the step: r12 is reused as a store address further down
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
208 vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
209 vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
210 vneg.f32 d7, d7 @ R*s-I*c
10206
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
211 vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
212 vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1
10206
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
213 vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
214 vsub.f32 d0, d18, d0 @ in4d-in4u I
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
215 vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
216 vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
217 vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
218 vadd.f32 d1, d1, d19 @ in3u+in3d -R
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
219 vsub.f32 d16, d16, d2 @ in0u-in2d R
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
220 vadd.f32 d17, d17, d3 @ in2u+in1d -I
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
221 uxth r12, r6, ror #16 @ r12 = high halfword of the r6 revtab word
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
222 uxth r6, r6 @ r6 = low halfword
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
223 add r12, r1, r12, lsl #3 @ r12 = output + 8*index
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
224 add r6, r1, r6, lsl #3 @ r6 = output + 8*index
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
225 vst2.32 {d6[0],d7[0]}, [r6,:64]
10206
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
226 vst2.32 {d6[1],d7[1]}, [r12,:64]
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
227 uxth r6, r10, ror #16 @ r6 = high halfword of the r10 revtab word
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
228 uxth r10, r10 @ r10 = low halfword
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
229 add r6 , r1, r6, lsl #3
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
230 add r10, r1, r10, lsl #3
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
231 vst2.32 {d24[0],d25[0]},[r10,:64]
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
232 vst2.32 {d24[1],d25[1]},[r6,:64]
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
233 b 1b
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
234 1: @ stores for the final pass
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
235 vneg.f32 d7, d7 @ R*s-I*c
10206
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
236 uxth r12, r6, ror #16
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
237 uxth r6, r6
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
238 add r12, r1, r12, lsl #3
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
239 add r6, r1, r6, lsl #3
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
240 vst2.32 {d6[0],d7[0]}, [r6,:64]
10206
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
241 vst2.32 {d6[1],d7[1]}, [r12,:64]
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
242 uxth r6, r10, ror #16
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
243 uxth r10, r10
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
244 add r6 , r1, r6, lsl #3
10172
eda985c53dba ARM: 10l: fix large FFTs
mru
parents: 10162
diff changeset
245 add r10, r1, r10, lsl #3
10206
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
246 vst2.32 {d24[0],d25[0]},[r10,:64]
87ab0f0e0baf ARM: merge two loops in ff_mdct_calc_neon
mru
parents: 10205
diff changeset
247 vst2.32 {d24[1],d25[1]},[r6,:64]
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
248
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
249 mov r4, r0 @ keep the context pointer for use after the call
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
250 mov r6, r1 @ keep the output pointer for use after the call
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
251 bl ff_fft_calc_neon @ r0 and r1 still hold the values passed in
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
252
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
253 mov r12, #1
10199
38ab367d4231 Merge FFTContext and MDCTContext
mru
parents: 10172
diff changeset
254 ldr lr, [r4, #28] @ mdct_bits
38ab367d4231 Merge FFTContext and MDCTContext
mru
parents: 10172
diff changeset
255 ldr r4, [r4, #32] @ tcos
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
256 lsl r12, r12, lr @ n = 1 << nbits
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
257 lsr lr, r12, #3 @ n8 = n >> 3
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
258
10205
89a852950c34 ARM: interleave cos/sin tables for improved NEON MDCT
mru
parents: 10199
diff changeset
259 add r4, r4, lr, lsl #3 @ r4 = tcos + 8*n8, read upwards
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
260 add r6, r6, lr, lsl #3 @ r6 = output + 8*n8, read upwards
10205
89a852950c34 ARM: interleave cos/sin tables for improved NEON MDCT
mru
parents: 10199
diff changeset
261 sub r1, r4, #16 @ r1 reads tcos downwards from just below r4
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
262 sub r3, r6, #16 @ r3 reads output downwards from just below r6
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
263
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
264 mov r7, #-16 @ negative post-index step for the downward pointers
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
265 mov r8, r6 @ r8 = upward store pointer, follows r6
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
266 mov r0, r3 @ r0 = downward store pointer, follows r3
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
267
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
268 vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
269 vld2.32 {d20-d21},[r6,:128]! @ d20=r2,i2 d21=r3,i3
10205
89a852950c34 ARM: interleave cos/sin tables for improved NEON MDCT
mru
parents: 10199
diff changeset
270 vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
271 1: @ second loop: four complex values per pass, two on each side of the middle
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
272 subs lr, lr, #2 @ Z is set on the last pass and tested by beq below
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
273 vmul.f32 d7, d0, d18 @ r1*s1,r0*s0
10205
89a852950c34 ARM: interleave cos/sin tables for improved NEON MDCT
mru
parents: 10199
diff changeset
274 vld2.32 {d17,d19},[r4,:128]! @ c2,c3 s2,s3
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
275 vmul.f32 d4, d1, d18 @ i1*s1,i0*s0
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
276 vmul.f32 d5, d21, d19 @ i2*s2,i3*s3
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
277 vmul.f32 d6, d20, d19 @ r2*s2,r3*s3
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
278 vmul.f32 d24, d0, d16 @ r1*c1,r0*c0
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
279 vmul.f32 d25, d20, d17 @ r2*c2,r3*c3
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
280 vmul.f32 d22, d21, d17 @ i2*c2,i3*c3
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
281 vmul.f32 d23, d1, d16 @ i1*c1,i0*c0
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
282 vadd.f32 d4, d4, d24 @ i1*s1+r1*c1,i0*s0+r0*c0
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
283 vadd.f32 d5, d5, d25 @ i2*s2+r2*c2,i3*s3+r3*c3
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
284 vsub.f32 d6, d22, d6 @ i2*c2-r2*s2,i3*c3-r3*s3
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
285 vsub.f32 d7, d23, d7 @ i1*c1-r1*s1,i0*c0-r0*s0
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
286 vneg.f32 q2, q2 @ negate both sums held in d4 and d5
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
287 beq 1f @ last pass: skip the loads for a further pass
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
288 vld2.32 {d0-d1}, [r3,:128], r7
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
289 vld2.32 {d20-d21},[r6,:128]!
10205
89a852950c34 ARM: interleave cos/sin tables for improved NEON MDCT
mru
parents: 10199
diff changeset
290 vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
10162
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
291 vrev64.32 q3, q3 @ swap the lanes within d6 and within d7
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
292 vst2.32 {d4,d6}, [r0,:128], r7 @ interleaved store below the middle, moving down
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
293 vst2.32 {d5,d7}, [r8,:128]! @ interleaved store above the middle, moving up
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
294 b 1b
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
295 1: @ stores for the final pass
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
296 vrev64.32 q3, q3
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
297 vst2.32 {d4,d6}, [r0,:128]
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
298 vst2.32 {d5,d7}, [r8,:128]
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
299
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
300 pop {r4-r10,pc} @ restore r4-r10 and return via the saved lr
8d369aee733f ARM: NEON optimised MDCT
mru
parents: 10160
diff changeset
301 .endfunc