Mercurial > libavcodec.hg
annotate arm/mdct_neon.S @ 11034:fd5921186064 libavcodec
Make the fast loop filter path work with unavailable left MBs.
This prevents the issue with having to switch between slow and
fast code paths in each row.
0.5% faster loopfilter for cathedral
author | michael |
---|---|
date | Thu, 28 Jan 2010 02:15:25 +0000 |
parents | be725249ea67 |
children | cbf3161706f4 |
rev | line source |
---|---|
10153 | 1 /* |
2 * ARM NEON optimised MDCT | |
3 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> | |
4 * | |
5 * This file is part of FFmpeg. | |
6 * | |
7 * FFmpeg is free software; you can redistribute it and/or | |
8 * modify it under the terms of the GNU Lesser General Public | |
9 * License as published by the Free Software Foundation; either | |
10 * version 2.1 of the License, or (at your option) any later version. | |
11 * | |
12 * FFmpeg is distributed in the hope that it will be useful, | |
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 * Lesser General Public License for more details. | |
16 * | |
17 * You should have received a copy of the GNU Lesser General Public | |
18 * License along with FFmpeg; if not, write to the Free Software | |
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 */ | |
21 | |
22 #include "asm.S" | |
23 | |
24 .text | |
25 | |
10346 | 26 #define ff_fft_calc_neon X(ff_fft_calc_neon) |
27 | |
/*
 * ff_imdct_half_neon
 *
 * Register interface as used by the code below:
 *   r0 = context; words are read at offset 28 (mdct_bits), 32 (tcos)
 *        and 8 (revtab)
 *   r1 = output buffer (filled here, then handed to ff_fft_calc_neon)
 *   r2 = input buffer
 * NOTE(review): presumably the C prototype is (context, output, input);
 * confirm against the declaration on the C side.
 *
 * With n = 1 << mdct_bits the routine has three stages:
 *   1. Pre-rotation.  The input is read from both ends towards the
 *      middle (r2 ascending, r7 descending from input + 2*n - 16 bytes),
 *      multiplied by the interleaved cos/sin table and scattered into
 *      the output as 8-byte pairs at the indices found in revtab.
 *      n/8 iterations, two pairs per iteration.
 *   2. ff_fft_calc_neon is called with r0/r1 untouched.
 *   3. Post-rotation, in place, starting at output + n bytes and working
 *      outwards in both directions.  n/16 iterations, two pairs written
 *      on each side per iteration.
 *
 * Both loops are software pipelined: the loads for the next iteration are
 * issued before the stores of the current one, and the final stores are
 * repeated after the loop.
 *
 * r4-r8 and lr are saved on the stack.  The routine itself writes
 * d0-d7 and d16-d25.  All vector accesses carry :128 or :64 alignment
 * qualifiers.
 */
function ff_imdct_half_neon, export=1
        push            {r4-r8,lr}

        @ ---- fetch the transform parameters --------------------------------
        mov             r12, #1
        ldr             lr,  [r0, #28]          @ lr  = mdct_bits
        ldr             r4,  [r0, #32]          @ r4  = tcos (interleaved cos/sin)
        ldr             r3,  [r0, #8]           @ r3  = revtab
        lsl             r12, r12, lr            @ r12 = n = 1 << mdct_bits
        lsr             lr,  r12, #2            @ lr  = n >> 2, stage 1 counter
        add             r7,  r2,  r12, lsl #1   @ r7  = input + 2*n bytes
        mov             r12, #-16               @ r12 = stride of the descending pointer
        sub             r7,  r7,  #16           @ r7  = last 16-byte group below that

        @ ---- stage 1 prologue: first loads and cos products ----------------
        @ vld2 de-interleaves: the first register of each pair receives
        @ floats 0 and 2 of the 16 bytes, the second receives floats 1 and 3.
        @ Only d17 (from the top) and d0 (from the bottom) are consumed.
        vld2.32         {d16-d17},[r7,:128],r12 @ top of input, walking down
        vld2.32         {d0-d1},  [r2,:128]!    @ bottom of input, walking up
        vrev64.32       d17, d17                @ put the top values in ascending order
        vld2.32         {d2,d3},  [r4,:128]!    @ d2 = c0,c1   d3 = s0,s1
        vmul.f32        d6,  d17, d2            @ top * cos
        vmul.f32        d7,  d0,  d2            @ bottom * cos
1:
        subs            lr,  lr,  #2            @ two outputs per pass; Z is tested by beq below
        ldr             r6,  [r3], #4           @ one revtab word = two 16-bit indices
        vmul.f32        d4,  d0,  d3            @ bottom * sin
        vmul.f32        d5,  d17, d3            @ top * sin
        vsub.f32        d4,  d6,  d4            @ d4 = top*cos - bottom*sin
        vadd.f32        d5,  d5,  d7            @ d5 = top*sin + bottom*cos
        uxth            r8,  r6,  ror #16       @ r8 = upper halfword index
        uxth            r6,  r6                 @ r6 = lower halfword index
        add             r8,  r1,  r8,  lsl #3   @ 8 bytes per output element
        add             r6,  r1,  r6,  lsl #3
        beq             2f                      @ last pass: skip the look-ahead loads
        vld2.32         {d16-d17},[r7,:128],r12 @ look-ahead loads for the next pass
        vld2.32         {d0-d1},  [r2,:128]!
        vrev64.32       d17, d17
        vld2.32         {d2,d3},  [r4,:128]!    @ d2 = c0,c1   d3 = s0,s1
        vmul.f32        d6,  d17, d2
        vmul.f32        d7,  d0,  d2
        vst2.32         {d4[0],d5[0]}, [r6,:64] @ pair from lane 0 -> lower index
        vst2.32         {d4[1],d5[1]}, [r8,:64] @ pair from lane 1 -> upper index
        b               1b
2:
        vst2.32         {d4[0],d5[0]}, [r6,:64] @ drain the pipeline
        vst2.32         {d4[1],d5[1]}, [r8,:64]

        @ ---- stage 2: FFT on the scattered data ----------------------------
        @ r4 and r6 keep the context and output pointers; the code after the
        @ call reads them again, so it relies on the callee preserving them.
        mov             r4,  r0
        mov             r6,  r1
        bl              ff_fft_calc_neon

        @ ---- stage 3 set-up ------------------------------------------------
        mov             r12, #1
        ldr             lr,  [r4, #28]          @ lr  = mdct_bits
        ldr             r4,  [r4, #32]          @ r4  = tcos
        lsl             r12, r12, lr            @ r12 = n = 1 << mdct_bits
        lsr             lr,  r12, #3            @ lr  = n >> 3, stage 3 counter

        add             r4,  r4,  lr,  lsl #3   @ r4 = tcos   + n bytes, walks up
        add             r6,  r6,  lr,  lsl #3   @ r6 = output + n bytes, walks up
        sub             r1,  r4,  #16           @ r1 = tcos cursor walking down
        sub             r3,  r6,  #16           @ r3 = output read cursor walking down

        mov             r7,  #-16               @ stride of the descending cursors
        mov             r8,  r6                 @ r8 = write cursor, upper side
        mov             r0,  r3                 @ r0 = write cursor, lower side

        vld2.32         {d0-d1},  [r3,:128], r7 @ lower side: d0 = floats 0,2  d1 = floats 1,3
        vld2.32         {d20-d21},[r6,:128]!    @ upper side: d20 = floats 0,2 d21 = floats 1,3
        vld2.32         {d16,d18},[r1,:128], r7 @ lower side: d16 = cos  d18 = sin
3:
        subs            lr,  lr,  #2            @ Z is tested by beq below
        vmul.f32        d7,  d0,  d18
        vld2.32         {d17,d19},[r4,:128]!    @ upper side: d17 = cos  d19 = sin
        vmul.f32        d4,  d1,  d18
        vmul.f32        d5,  d21, d19
        vmul.f32        d6,  d20, d19
        vmul.f32        d22, d1,  d16
        vmul.f32        d23, d21, d17
        vmul.f32        d24, d0,  d16
        vmul.f32        d25, d20, d17
        vadd.f32        d7,  d7,  d22           @ d7 = d0*sin  + d1*cos   (lower side)
        vadd.f32        d6,  d6,  d23           @ d6 = d20*sin + d21*cos  (upper side)
        vsub.f32        d4,  d4,  d24           @ d4 = d1*sin  - d0*cos   (lower side)
        vsub.f32        d5,  d5,  d25           @ d5 = d21*sin - d20*cos  (upper side)
        beq             4f                      @ last pass: skip the look-ahead loads
        vld2.32         {d0-d1},  [r3,:128], r7
        vld2.32         {d20-d21},[r6,:128]!
        vld2.32         {d16,d18},[r1,:128], r7 @ lower side: d16 = cos  d18 = sin
        vrev64.32       q3,  q3                 @ swap the lanes of d6 and d7
        vst2.32         {d4,d6},  [r0,:128], r7 @ lower side gets d4 interleaved with d6
        vst2.32         {d5,d7},  [r8,:128]!    @ upper side gets d5 interleaved with d7
        b               3b
4:
        vrev64.32       q3,  q3                 @ drain the pipeline
        vst2.32         {d4,d6},  [r0,:128]
        vst2.32         {d5,d7},  [r8,:128]

        pop             {r4-r8,pc}
.endfunc
124 | |
/*
 * ff_imdct_calc_neon
 *
 * Register interface as used by the code below:
 *   r0 = context; the word at offset 28 is read as the shift count for n
 *   r1 = output buffer
 *   r2 = input buffer (not touched here, passed through to the callee)
 *
 * With n = 1 << [r0 + 28]:
 *   1. ff_imdct_half_neon is called with its output pointer moved n bytes
 *      into the buffer.
 *   2. The loop then walks outwards from output + 2*n bytes, 16 bytes per
 *      side per pass:
 *        - the group below the middle is reversed, sign-flipped and
 *          written upwards from the start of the buffer;
 *        - the group above the middle is reversed and written downwards
 *          from output + 4*n - 8 bytes.
 *      r4 counts down from n in steps of 16.
 *
 * r4-r6 and lr are saved on the stack.  r4 and r5 must survive the call;
 * the code after it reads both.
 */
function ff_imdct_calc_neon, export=1
        push            {r4-r6,lr}

        ldr             r3,  [r0, #28]          @ shift count for n
        mov             r4,  #1
        mov             r5,  r1                 @ r5 = start of the output buffer
        lsl             r4,  r4,  r3            @ r4 = n
        add             r1,  r1,  r4            @ callee writes at output + n bytes

        bl              ff_imdct_half_neon

        add             r0,  r5,  r4,  lsl #2   @ r0 = output + 4*n bytes
        add             r1,  r5,  r4,  lsl #1   @ r1 = output + 2*n bytes, read cursor going up
        sub             r0,  r0,  #8            @ r0 = last 8 bytes, write cursor going down
        sub             r2,  r1,  #16           @ r2 = read cursor going down
        mov             r3,  #-16               @ stride for r2
        mov             r6,  #-8                @ stride for r0
        vmov.i32        d30, #1<<31             @ sign-bit mask for both float lanes
1:
        vld1.32         {d0-d1},  [r2,:128], r3 @ four floats below the middle
        pld             [r0, #-16]
        vrev64.32       q0,  q0                 @ swap lanes inside d0 and inside d1
        vld1.32         {d2-d3},  [r1,:128]!    @ four floats above the middle
        veor            d4,  d1,  d30           @ negate; d1 first completes the reversal
        pld             [r2, #-16]
        vrev64.32       q1,  q1
        veor            d5,  d0,  d30
        vst1.32         {d2},     [r0,:64], r6  @ d2 then d3 going down = reversed order
        vst1.32         {d3},     [r0,:64], r6
        vst1.32         {d4-d5},  [r5,:128]!    @ negated mirror at the front of the buffer
        subs            r4,  r4,  #16
        bgt             1b

        pop             {r4-r6,pc}
.endfunc
10162 | 160 |
/*
 * ff_mdct_calc_neon
 *
 * Register interface as used by the code below:
 *   r0 = context; words are read at offset 28 (mdct_bits), 32 (tcos)
 *        and 8 (revtab)
 *   r1 = output buffer (filled here, then handed to ff_fft_calc_neon)
 *   r2 = input buffer
 * NOTE(review): presumably the C prototype is (context, output, input);
 * confirm against the declaration on the C side.
 *
 * With n = 1 << mdct_bits the routine has three stages:
 *   1. Fold and pre-rotate.  Four input cursors start at input + n bytes
 *      and input + 3*n bytes, one ascending and one descending from each
 *      point.  Their sums and differences are multiplied by tcos, which
 *      is read from both ends (r4 ascending, r5 descending from
 *      tcos + 2*n - 16 bytes).  Results are scattered into the output as
 *      8-byte pairs at indices taken from revtab, which is also read
 *      from two places: r6 ascending from its start, r10 descending from
 *      revtab + n/2 - 4 bytes.  n/16 iterations, four pairs each.
 *   2. ff_fft_calc_neon is called with r0/r1 untouched.
 *   3. Post-rotation, in place, starting at output + n bytes and working
 *      outwards in both directions.  n/16 iterations.
 *
 * Both loops are software pipelined: the loads for the next iteration are
 * issued before the stores of the current one, and the final stores are
 * repeated after the loop.
 *
 * r4-r10 and lr are saved on the stack.  The routine itself writes
 * d0-d7, d16-d25, d30 and d31.
 */
function ff_mdct_calc_neon, export=1
        push            {r4-r10,lr}

        @ ---- fetch parameters and set up the cursors -----------------------
        mov             r12, #1
        ldr             lr,  [r0, #28]          @ lr = mdct_bits
        ldr             r4,  [r0, #32]          @ r4 = tcos, ascending cursor
        ldr             r3,  [r0, #8]           @ r3 = revtab
        lsl             lr,  r12, lr            @ lr = n = 1 << mdct_bits, also loop counter
        add             r7,  r2,  lr            @ r7 = in4u: input + n bytes, going up
        sub             r9,  r7,  #16           @ r9 = in4d: 16 bytes below, going down
        add             r2,  r7,  lr,  lsl #1   @ r2 = in3u: input + 3*n bytes, going up
        add             r8,  r9,  lr,  lsl #1   @ r8 = in3d: 16 bytes below, going down
        add             r5,  r4,  lr,  lsl #1   @ r5 = tcos + 2*n bytes ...
        sub             r5,  r5,  #16           @ ... minus 16, descending cursor
        sub             r3,  r3,  #4            @ bias for the pre-indexed load in the loop
        mov             r12, #-16               @ stride of the descending cursors

        @ ---- stage 1 prologue: first fold ----------------------------------
        vld2.32         {d16,d18},[r9,:128],r12 @ in0u0,in0u1  in4d1,in4d0
        vld2.32         {d17,d19},[r8,:128],r12 @ in2u0,in2u1  in3d1,in3d0
        vld2.32         {d0, d2}, [r7,:128]!    @ in4u0,in4u1  in2d1,in2d0
        vrev64.32       q9,  q9                 @ in4d0,in4d1  in3d0,in3d1
        vld2.32         {d1, d3}, [r2,:128]!    @ in3u0,in3u1  in1d1,in1d0
        vsub.f32        d0,  d18, d0            @ d0  = in4d - in4u  =  I
        vld2.32         {d20,d21},[r4,:128]!    @ d20 = c0,c1  d21 = s0,s1
        vrev64.32       q1,  q1                 @ in2d0,in2d1  in1d0,in1d1
        vld2.32         {d30,d31},[r5,:128],r12 @ d30 = c2,c3  d31 = s2,s3
        vadd.f32        d1,  d1,  d19           @ d1  = in3u + in3d  = -R
        vsub.f32        d16, d16, d2            @ d16 = in0u - in2d  =  R
        vadd.f32        d17, d17, d3            @ d17 = in2u + in1d  = -I
1:
        vmul.f32        d7,  d0,  d21           @  I*s
        ldr             r10, [r3, lr, lsr #1]   @ revtab word for the descending half
        vmul.f32        d6,  d1,  d20           @ -R*c
        ldr             r6,  [r3, #4]!          @ revtab word for the ascending half
        vmul.f32        d4,  d1,  d21           @ -R*s
        vmul.f32        d5,  d0,  d20           @  I*c
        vmul.f32        d24, d16, d30           @  R*c
        vmul.f32        d25, d17, d31           @ -I*s
        vmul.f32        d22, d16, d31           @  R*s
        vmul.f32        d23, d17, d30           @  I*c
        subs            lr,  lr,  #16           @ Z is tested by beq below
        vsub.f32        d6,  d6,  d7            @ -R*c-I*s
        vadd.f32        d7,  d4,  d5            @ -R*s+I*c, negated further down
        vsub.f32        d24, d25, d24           @ I*s-R*c
        vadd.f32        d25, d22, d23           @ R*s-I*c
        beq             2f                      @ last pass: skip the look-ahead loads
        mov             r12, #-16               @ r12 doubled as an address temp last pass
        vld2.32         {d16,d18},[r9,:128],r12 @ in0u0,in0u1  in4d1,in4d0
        vld2.32         {d17,d19},[r8,:128],r12 @ in2u0,in2u1  in3d1,in3d0
        vneg.f32        d7,  d7                 @ R*s-I*c
        vld2.32         {d0, d2}, [r7,:128]!    @ in4u0,in4u1  in2d1,in2d0
        vrev64.32       q9,  q9                 @ in4d0,in4d1  in3d0,in3d1
        vld2.32         {d1, d3}, [r2,:128]!    @ in3u0,in3u1  in1d1,in1d0
        vsub.f32        d0,  d18, d0            @ d0  = in4d - in4u  =  I
        vld2.32         {d20,d21},[r4,:128]!    @ d20 = c0,c1  d21 = s0,s1
        vrev64.32       q1,  q1                 @ in2d0,in2d1  in1d0,in1d1
        vld2.32         {d30,d31},[r5,:128],r12 @ d30 = c2,c3  d31 = s2,s3
        vadd.f32        d1,  d1,  d19           @ d1  = in3u + in3d  = -R
        vsub.f32        d16, d16, d2            @ d16 = in0u - in2d  =  R
        vadd.f32        d17, d17, d3            @ d17 = in2u + in1d  = -I
        uxth            r12, r6,  ror #16       @ upper halfword index of the ascending word
        uxth            r6,  r6                 @ lower halfword index
        add             r12, r1,  r12, lsl #3   @ 8 bytes per output element
        add             r6,  r1,  r6,  lsl #3
        vst2.32         {d6[0],d7[0]}, [r6,:64]
        vst2.32         {d6[1],d7[1]}, [r12,:64]
        uxth            r6,  r10, ror #16       @ upper halfword index of the descending word
        uxth            r10, r10                @ lower halfword index
        add             r6,  r1,  r6,  lsl #3
        add             r10, r1,  r10, lsl #3
        vst2.32         {d24[0],d25[0]},[r10,:64]
        vst2.32         {d24[1],d25[1]},[r6,:64]
        b               1b
2:
        vneg.f32        d7,  d7                 @ R*s-I*c; drain the pipeline
        uxth            r12, r6,  ror #16
        uxth            r6,  r6
        add             r12, r1,  r12, lsl #3
        add             r6,  r1,  r6,  lsl #3
        vst2.32         {d6[0],d7[0]}, [r6,:64]
        vst2.32         {d6[1],d7[1]}, [r12,:64]
        uxth            r6,  r10, ror #16
        uxth            r10, r10
        add             r6,  r1,  r6,  lsl #3
        add             r10, r1,  r10, lsl #3
        vst2.32         {d24[0],d25[0]},[r10,:64]
        vst2.32         {d24[1],d25[1]},[r6,:64]

        @ ---- stage 2: FFT on the scattered data ----------------------------
        @ r4 and r6 keep the context and output pointers; the code after the
        @ call reads them again, so it relies on the callee preserving them.
        mov             r4,  r0
        mov             r6,  r1
        bl              ff_fft_calc_neon

        @ ---- stage 3 set-up ------------------------------------------------
        mov             r12, #1
        ldr             lr,  [r4, #28]          @ lr  = mdct_bits
        ldr             r4,  [r4, #32]          @ r4  = tcos
        lsl             r12, r12, lr            @ r12 = n = 1 << mdct_bits
        lsr             lr,  r12, #3            @ lr  = n >> 3, stage 3 counter

        add             r4,  r4,  lr,  lsl #3   @ r4 = tcos   + n bytes, walks up
        add             r6,  r6,  lr,  lsl #3   @ r6 = output + n bytes, walks up
        sub             r1,  r4,  #16           @ r1 = tcos cursor walking down
        sub             r3,  r6,  #16           @ r3 = output read cursor walking down

        mov             r7,  #-16               @ stride of the descending cursors
        mov             r8,  r6                 @ r8 = write cursor, upper side
        mov             r0,  r3                 @ r0 = write cursor, lower side

        vld2.32         {d0-d1},  [r3,:128], r7 @ d0  = r1,r0  d1  = i1,i0
        vld2.32         {d20-d21},[r6,:128]!    @ d20 = r2,r3  d21 = i2,i3
        vld2.32         {d16,d18},[r1,:128], r7 @ d16 = c1,c0  d18 = s1,s0
3:
        subs            lr,  lr,  #2            @ Z is tested by beq below
        vmul.f32        d7,  d0,  d18           @ r1*s1,r0*s0
        vld2.32         {d17,d19},[r4,:128]!    @ d17 = c2,c3  d19 = s2,s3
        vmul.f32        d4,  d1,  d18           @ i1*s1,i0*s0
        vmul.f32        d5,  d21, d19           @ i2*s2,i3*s3
        vmul.f32        d6,  d20, d19           @ r2*s2,r3*s3
        vmul.f32        d24, d0,  d16           @ r1*c1,r0*c0
        vmul.f32        d25, d20, d17           @ r2*c2,r3*c3
        vmul.f32        d22, d21, d17           @ i2*c2,i3*c3
        vmul.f32        d23, d1,  d16           @ i1*c1,i0*c0
        vadd.f32        d4,  d4,  d24           @ i1*s1+r1*c1,i0*s0+r0*c0
        vadd.f32        d5,  d5,  d25           @ i2*s2+r2*c2,i3*s3+r3*c3
        vsub.f32        d6,  d22, d6            @ i2*c2-r2*s2,i3*c3-r3*s3
        vsub.f32        d7,  d23, d7            @ i1*c1-r1*s1,i0*c0-r0*s0
        vneg.f32        q2,  q2                 @ flip the sign of d4 and d5
        beq             4f                      @ last pass: skip the look-ahead loads
        vld2.32         {d0-d1},  [r3,:128], r7
        vld2.32         {d20-d21},[r6,:128]!
        vld2.32         {d16,d18},[r1,:128], r7 @ d16 = c1,c0  d18 = s1,s0
        vrev64.32       q3,  q3                 @ swap the lanes of d6 and d7
        vst2.32         {d4,d6},  [r0,:128], r7 @ lower side gets d4 interleaved with d6
        vst2.32         {d5,d7},  [r8,:128]!    @ upper side gets d5 interleaved with d7
        b               3b
4:
        vrev64.32       q3,  q3                 @ drain the pipeline
        vst2.32         {d4,d6},  [r0,:128]
        vst2.32         {d5,d7},  [r8,:128]

        pop             {r4-r10,pc}
.endfunc