Mercurial annotate view — repository: libavcodec.hg
File: arm/dsputil_neon.S @ revision 9467:0d6c8c99382e
Changeset description: Remove 2 unused variables from MPV_decode_mb_internal() found by CSA.
author:   michael
date:     Fri, 17 Apr 2009 14:51:00 +0000
parents:  93c20dd3da43
children: 51e8f5ab8f1e
(The annotated source follows; each line is shown as "rev | line source".)
8334 | 1 /* |
2 * ARM NEON optimised DSP functions | |
3 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> | |
4 * | |
5 * This file is part of FFmpeg. | |
6 * | |
7 * FFmpeg is free software; you can redistribute it and/or | |
8 * modify it under the terms of the GNU Lesser General Public | |
9 * License as published by the Free Software Foundation; either | |
10 * version 2.1 of the License, or (at your option) any later version. | |
11 * | |
12 * FFmpeg is distributed in the hope that it will be useful, | |
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 * Lesser General Public License for more details. | |
16 * | |
17 * You should have received a copy of the GNU Lesser General Public | |
18 * License along with FFmpeg; if not, write to the Free Software | |
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 */ | |
21 | |
22 #include "asm.S" | |
23 | |
24 preserve8 | |
25 .fpu neon | |
26 .text | |
27 | |
28 .macro pixels16 avg=0 | |
29 .if \avg | |
30 mov ip, r0 | |
31 .endif | |
32 1: vld1.64 {d0, d1}, [r1], r2 | |
33 vld1.64 {d2, d3}, [r1], r2 | |
34 vld1.64 {d4, d5}, [r1], r2 | |
35 pld [r1, r2, lsl #2] | |
36 vld1.64 {d6, d7}, [r1], r2 | |
37 pld [r1] | |
38 pld [r1, r2] | |
39 pld [r1, r2, lsl #1] | |
40 .if \avg | |
9451
93c20dd3da43
Add guaranteed alignment for loading dest pixels in avg_pixels16_neon
conrad
parents:
9345
diff
changeset
|
41 vld1.64 {d16,d17}, [ip,:128], r2 |
8334 | 42 vrhadd.u8 q0, q0, q8 |
9451
93c20dd3da43
Add guaranteed alignment for loading dest pixels in avg_pixels16_neon
conrad
parents:
9345
diff
changeset
|
43 vld1.64 {d18,d19}, [ip,:128], r2 |
8334 | 44 vrhadd.u8 q1, q1, q9 |
9451
93c20dd3da43
Add guaranteed alignment for loading dest pixels in avg_pixels16_neon
conrad
parents:
9345
diff
changeset
|
45 vld1.64 {d20,d21}, [ip,:128], r2 |
8334 | 46 vrhadd.u8 q2, q2, q10 |
9451
93c20dd3da43
Add guaranteed alignment for loading dest pixels in avg_pixels16_neon
conrad
parents:
9345
diff
changeset
|
47 vld1.64 {d22,d23}, [ip,:128], r2 |
8334 | 48 vrhadd.u8 q3, q3, q11 |
49 .endif | |
50 subs r3, r3, #4 | |
51 vst1.64 {d0, d1}, [r0,:128], r2 | |
52 vst1.64 {d2, d3}, [r0,:128], r2 | |
53 vst1.64 {d4, d5}, [r0,:128], r2 | |
54 vst1.64 {d6, d7}, [r0,:128], r2 | |
55 bne 1b | |
56 bx lr | |
57 .endm | |
58 | |
59 .macro pixels16_x2 vhadd=vrhadd.u8 | |
60 1: vld1.64 {d0-d2}, [r1], r2 | |
61 vld1.64 {d4-d6}, [r1], r2 | |
62 pld [r1] | |
63 pld [r1, r2] | |
64 subs r3, r3, #2 | |
65 vext.8 q1, q0, q1, #1 | |
66 \vhadd q0, q0, q1 | |
67 vext.8 q3, q2, q3, #1 | |
68 \vhadd q2, q2, q3 | |
69 vst1.64 {d0, d1}, [r0,:128], r2 | |
70 vst1.64 {d4, d5}, [r0,:128], r2 | |
71 bne 1b | |
72 bx lr | |
73 .endm | |
74 | |
75 .macro pixels16_y2 vhadd=vrhadd.u8 | |
76 push {lr} | |
77 add ip, r1, r2 | |
78 lsl lr, r2, #1 | |
79 vld1.64 {d0, d1}, [r1], lr | |
80 vld1.64 {d2, d3}, [ip], lr | |
81 1: subs r3, r3, #2 | |
82 \vhadd q2, q0, q1 | |
83 vld1.64 {d0, d1}, [r1], lr | |
84 \vhadd q3, q0, q1 | |
85 vld1.64 {d2, d3}, [ip], lr | |
86 pld [r1] | |
87 pld [ip] | |
88 vst1.64 {d4, d5}, [r0,:128], r2 | |
89 vst1.64 {d6, d7}, [r0,:128], r2 | |
90 bne 1b | |
91 pop {pc} | |
92 .endm | |
93 | |
94 .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0 | |
95 push {lr} | |
96 lsl lr, r2, #1 | |
97 add ip, r1, r2 | |
98 vld1.64 {d0-d2}, [r1], lr | |
99 vld1.64 {d4-d6}, [ip], lr | |
100 .if \no_rnd | |
101 vmov.i16 q13, #1 | |
102 .endif | |
103 pld [r1] | |
104 pld [ip] | |
105 vext.8 q1, q0, q1, #1 | |
106 vext.8 q3, q2, q3, #1 | |
107 vaddl.u8 q8, d0, d2 | |
108 vaddl.u8 q10, d1, d3 | |
109 vaddl.u8 q9, d4, d6 | |
110 vaddl.u8 q11, d5, d7 | |
111 1: subs r3, r3, #2 | |
112 vld1.64 {d0-d2}, [r1], lr | |
113 vadd.u16 q12, q8, q9 | |
114 pld [r1] | |
115 .if \no_rnd | |
116 vadd.u16 q12, q12, q13 | |
117 .endif | |
118 vext.8 q15, q0, q1, #1 | |
119 vadd.u16 q1 , q10, q11 | |
120 \vshrn d28, q12, #2 | |
121 .if \no_rnd | |
122 vadd.u16 q1, q1, q13 | |
123 .endif | |
124 \vshrn d29, q1, #2 | |
125 vaddl.u8 q8, d0, d30 | |
126 vld1.64 {d2-d4}, [ip], lr | |
127 vaddl.u8 q10, d1, d31 | |
128 vst1.64 {d28,d29}, [r0,:128], r2 | |
129 vadd.u16 q12, q8, q9 | |
130 pld [ip] | |
131 .if \no_rnd | |
132 vadd.u16 q12, q12, q13 | |
133 .endif | |
134 vext.8 q2, q1, q2, #1 | |
135 vadd.u16 q0, q10, q11 | |
136 \vshrn d30, q12, #2 | |
137 .if \no_rnd | |
138 vadd.u16 q0, q0, q13 | |
139 .endif | |
140 \vshrn d31, q0, #2 | |
141 vaddl.u8 q9, d2, d4 | |
142 vaddl.u8 q11, d3, d5 | |
143 vst1.64 {d30,d31}, [r0,:128], r2 | |
144 bgt 1b | |
145 pop {pc} | |
146 .endm | |
147 | |
148 .macro pixels8 | |
149 1: vld1.64 {d0}, [r1], r2 | |
150 vld1.64 {d1}, [r1], r2 | |
151 vld1.64 {d2}, [r1], r2 | |
152 pld [r1, r2, lsl #2] | |
153 vld1.64 {d3}, [r1], r2 | |
154 pld [r1] | |
155 pld [r1, r2] | |
156 pld [r1, r2, lsl #1] | |
157 subs r3, r3, #4 | |
158 vst1.64 {d0}, [r0,:64], r2 | |
159 vst1.64 {d1}, [r0,:64], r2 | |
160 vst1.64 {d2}, [r0,:64], r2 | |
161 vst1.64 {d3}, [r0,:64], r2 | |
162 bne 1b | |
163 bx lr | |
164 .endm | |
165 | |
166 .macro pixels8_x2 vhadd=vrhadd.u8 | |
167 1: vld1.64 {d0, d1}, [r1], r2 | |
168 vext.8 d1, d0, d1, #1 | |
169 vld1.64 {d2, d3}, [r1], r2 | |
170 vext.8 d3, d2, d3, #1 | |
171 pld [r1] | |
172 pld [r1, r2] | |
173 subs r3, r3, #2 | |
174 vswp d1, d2 | |
175 \vhadd q0, q0, q1 | |
176 vst1.64 {d0}, [r0,:64], r2 | |
177 vst1.64 {d1}, [r0,:64], r2 | |
178 bne 1b | |
179 bx lr | |
180 .endm | |
181 | |
182 .macro pixels8_y2 vhadd=vrhadd.u8 | |
183 push {lr} | |
184 add ip, r1, r2 | |
185 lsl lr, r2, #1 | |
186 vld1.64 {d0}, [r1], lr | |
187 vld1.64 {d1}, [ip], lr | |
188 1: subs r3, r3, #2 | |
189 \vhadd d4, d0, d1 | |
190 vld1.64 {d0}, [r1], lr | |
191 \vhadd d5, d0, d1 | |
192 vld1.64 {d1}, [ip], lr | |
193 pld [r1] | |
194 pld [ip] | |
195 vst1.64 {d4}, [r0,:64], r2 | |
196 vst1.64 {d5}, [r0,:64], r2 | |
197 bne 1b | |
198 pop {pc} | |
199 .endm | |
200 | |
201 .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0 | |
202 push {lr} | |
203 lsl lr, r2, #1 | |
204 add ip, r1, r2 | |
205 vld1.64 {d0, d1}, [r1], lr | |
206 vld1.64 {d2, d3}, [ip], lr | |
207 .if \no_rnd | |
208 vmov.i16 q11, #1 | |
209 .endif | |
210 pld [r1] | |
211 pld [ip] | |
212 vext.8 d4, d0, d1, #1 | |
213 vext.8 d6, d2, d3, #1 | |
214 vaddl.u8 q8, d0, d4 | |
215 vaddl.u8 q9, d2, d6 | |
216 1: subs r3, r3, #2 | |
217 vld1.64 {d0, d1}, [r1], lr | |
218 pld [r1] | |
219 vadd.u16 q10, q8, q9 | |
220 vext.8 d4, d0, d1, #1 | |
221 .if \no_rnd | |
222 vadd.u16 q10, q10, q11 | |
223 .endif | |
224 vaddl.u8 q8, d0, d4 | |
225 \vshrn d5, q10, #2 | |
226 vld1.64 {d2, d3}, [ip], lr | |
227 vadd.u16 q10, q8, q9 | |
228 pld [ip] | |
229 .if \no_rnd | |
230 vadd.u16 q10, q10, q11 | |
231 .endif | |
232 vst1.64 {d5}, [r0,:64], r2 | |
233 \vshrn d7, q10, #2 | |
234 vext.8 d6, d2, d3, #1 | |
235 vaddl.u8 q9, d2, d6 | |
236 vst1.64 {d7}, [r0,:64], r2 | |
237 bgt 1b | |
238 pop {pc} | |
239 .endm | |
240 | |
241 .macro pixfunc pfx name suf rnd_op args:vararg | |
242 function ff_\pfx\name\suf\()_neon, export=1 | |
243 \name \rnd_op \args | |
244 .endfunc | |
245 .endm | |
246 | |
247 .macro pixfunc2 pfx name args:vararg | |
248 pixfunc \pfx \name | |
249 pixfunc \pfx \name \args | |
250 .endm | |
251 | |
252 function ff_put_h264_qpel16_mc00_neon, export=1 | |
253 mov r3, #16 | |
254 .endfunc | |
255 | |
256 pixfunc put_ pixels16 | |
257 pixfunc2 put_ pixels16_x2, _no_rnd, vhadd.u8 | |
258 pixfunc2 put_ pixels16_y2, _no_rnd, vhadd.u8 | |
259 pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1 | |
260 | |
261 function ff_avg_h264_qpel16_mc00_neon, export=1 | |
262 mov r3, #16 | |
263 .endfunc | |
264 | |
265 pixfunc avg_ pixels16,, 1 | |
266 | |
267 function ff_put_h264_qpel8_mc00_neon, export=1 | |
268 mov r3, #8 | |
269 .endfunc | |
270 | |
271 pixfunc put_ pixels8 | |
272 pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8 | |
273 pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8 | |
274 pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1 | |
8492 | 275 |
9345 | 276 function ff_put_signed_pixels_clamped_neon, export=1 |
277 vmov.u8 d31, #128 | |
278 vld1.64 {d16-d17}, [r0,:128]! | |
279 vqmovn.s16 d0, q8 | |
280 vld1.64 {d18-d19}, [r0,:128]! | |
281 vqmovn.s16 d1, q9 | |
282 vld1.64 {d16-d17}, [r0,:128]! | |
283 vqmovn.s16 d2, q8 | |
284 vld1.64 {d18-d19}, [r0,:128]! | |
285 vadd.u8 d0, d0, d31 | |
286 vld1.64 {d20-d21}, [r0,:128]! | |
287 vadd.u8 d1, d1, d31 | |
288 vld1.64 {d22-d23}, [r0,:128]! | |
289 vadd.u8 d2, d2, d31 | |
290 vst1.64 {d0}, [r1,:64], r2 | |
291 vqmovn.s16 d3, q9 | |
292 vst1.64 {d1}, [r1,:64], r2 | |
293 vqmovn.s16 d4, q10 | |
294 vst1.64 {d2}, [r1,:64], r2 | |
295 vqmovn.s16 d5, q11 | |
296 vld1.64 {d24-d25}, [r0,:128]! | |
297 vadd.u8 d3, d3, d31 | |
298 vld1.64 {d26-d27}, [r0,:128]! | |
299 vadd.u8 d4, d4, d31 | |
300 vadd.u8 d5, d5, d31 | |
301 vst1.64 {d3}, [r1,:64], r2 | |
302 vqmovn.s16 d6, q12 | |
303 vst1.64 {d4}, [r1,:64], r2 | |
304 vqmovn.s16 d7, q13 | |
305 vst1.64 {d5}, [r1,:64], r2 | |
306 vadd.u8 d6, d6, d31 | |
307 vadd.u8 d7, d7, d31 | |
308 vst1.64 {d6}, [r1,:64], r2 | |
309 vst1.64 {d7}, [r1,:64], r2 | |
310 bx lr | |
311 .endfunc | |
312 | |
9344 | 313 function ff_add_pixels_clamped_neon, export=1 |
314 mov r3, r1 | |
315 vld1.64 {d16}, [r1,:64], r2 | |
316 vld1.64 {d0-d1}, [r0,:128]! | |
317 vaddw.u8 q0, q0, d16 | |
318 vld1.64 {d17}, [r1,:64], r2 | |
319 vld1.64 {d2-d3}, [r0,:128]! | |
320 vqmovun.s16 d0, q0 | |
321 vld1.64 {d18}, [r1,:64], r2 | |
322 vaddw.u8 q1, q1, d17 | |
323 vld1.64 {d4-d5}, [r0,:128]! | |
324 vaddw.u8 q2, q2, d18 | |
325 vst1.64 {d0}, [r3,:64], r2 | |
326 vqmovun.s16 d2, q1 | |
327 vld1.64 {d19}, [r1,:64], r2 | |
328 vld1.64 {d6-d7}, [r0,:128]! | |
329 vaddw.u8 q3, q3, d19 | |
330 vqmovun.s16 d4, q2 | |
331 vst1.64 {d2}, [r3,:64], r2 | |
332 vld1.64 {d16}, [r1,:64], r2 | |
333 vqmovun.s16 d6, q3 | |
334 vld1.64 {d0-d1}, [r0,:128]! | |
335 vaddw.u8 q0, q0, d16 | |
336 vst1.64 {d4}, [r3,:64], r2 | |
337 vld1.64 {d17}, [r1,:64], r2 | |
338 vld1.64 {d2-d3}, [r0,:128]! | |
339 vaddw.u8 q1, q1, d17 | |
340 vst1.64 {d6}, [r3,:64], r2 | |
341 vqmovun.s16 d0, q0 | |
342 vld1.64 {d18}, [r1,:64], r2 | |
343 vld1.64 {d4-d5}, [r0,:128]! | |
344 vaddw.u8 q2, q2, d18 | |
345 vst1.64 {d0}, [r3,:64], r2 | |
346 vqmovun.s16 d2, q1 | |
347 vld1.64 {d19}, [r1,:64], r2 | |
348 vqmovun.s16 d4, q2 | |
349 vld1.64 {d6-d7}, [r0,:128]! | |
350 vaddw.u8 q3, q3, d19 | |
351 vst1.64 {d2}, [r3,:64], r2 | |
352 vqmovun.s16 d6, q3 | |
353 vst1.64 {d4}, [r3,:64], r2 | |
354 vst1.64 {d6}, [r3,:64], r2 | |
355 bx lr | |
356 .endfunc | |
357 | |
8492 | 358 function ff_float_to_int16_neon, export=1 |
359 subs r2, r2, #8 | |
360 vld1.64 {d0-d1}, [r1,:128]! | |
361 vcvt.s32.f32 q8, q0, #16 | |
362 vld1.64 {d2-d3}, [r1,:128]! | |
363 vcvt.s32.f32 q9, q1, #16 | |
364 beq 3f | |
365 bics ip, r2, #15 | |
366 beq 2f | |
367 1: subs ip, ip, #16 | |
368 vshrn.s32 d4, q8, #16 | |
369 vld1.64 {d0-d1}, [r1,:128]! | |
370 vcvt.s32.f32 q0, q0, #16 | |
371 vshrn.s32 d5, q9, #16 | |
372 vld1.64 {d2-d3}, [r1,:128]! | |
373 vcvt.s32.f32 q1, q1, #16 | |
374 vshrn.s32 d6, q0, #16 | |
375 vst1.64 {d4-d5}, [r0,:128]! | |
376 vshrn.s32 d7, q1, #16 | |
377 vld1.64 {d16-d17},[r1,:128]! | |
378 vcvt.s32.f32 q8, q8, #16 | |
379 vld1.64 {d18-d19},[r1,:128]! | |
380 vcvt.s32.f32 q9, q9, #16 | |
381 vst1.64 {d6-d7}, [r0,:128]! | |
382 bne 1b | |
383 ands r2, r2, #15 | |
384 beq 3f | |
385 2: vld1.64 {d0-d1}, [r1,:128]! | |
386 vshrn.s32 d4, q8, #16 | |
387 vcvt.s32.f32 q0, q0, #16 | |
388 vld1.64 {d2-d3}, [r1,:128]! | |
389 vshrn.s32 d5, q9, #16 | |
390 vcvt.s32.f32 q1, q1, #16 | |
391 vshrn.s32 d6, q0, #16 | |
392 vst1.64 {d4-d5}, [r0,:128]! | |
393 vshrn.s32 d7, q1, #16 | |
394 vst1.64 {d6-d7}, [r0,:128]! | |
395 bx lr | |
396 3: vshrn.s32 d4, q8, #16 | |
397 vshrn.s32 d5, q9, #16 | |
398 vst1.64 {d4-d5}, [r0,:128]! | |
399 bx lr | |
400 .endfunc | |
401 | |
402 function ff_float_to_int16_interleave_neon, export=1 | |
403 cmp r3, #2 | |
404 ldrlt r1, [r1] | |
405 blt ff_float_to_int16_neon | |
406 bne 4f | |
407 | |
408 ldr r3, [r1] | |
409 ldr r1, [r1, #4] | |
410 | |
411 subs r2, r2, #8 | |
412 vld1.64 {d0-d1}, [r3,:128]! | |
413 vcvt.s32.f32 q8, q0, #16 | |
414 vld1.64 {d2-d3}, [r3,:128]! | |
415 vcvt.s32.f32 q9, q1, #16 | |
416 vld1.64 {d20-d21},[r1,:128]! | |
417 vcvt.s32.f32 q10, q10, #16 | |
418 vld1.64 {d22-d23},[r1,:128]! | |
419 vcvt.s32.f32 q11, q11, #16 | |
420 beq 3f | |
421 bics ip, r2, #15 | |
422 beq 2f | |
423 1: subs ip, ip, #16 | |
424 vld1.64 {d0-d1}, [r3,:128]! | |
425 vcvt.s32.f32 q0, q0, #16 | |
426 vsri.32 q10, q8, #16 | |
427 vld1.64 {d2-d3}, [r3,:128]! | |
428 vcvt.s32.f32 q1, q1, #16 | |
429 vld1.64 {d24-d25},[r1,:128]! | |
430 vcvt.s32.f32 q12, q12, #16 | |
431 vld1.64 {d26-d27},[r1,:128]! | |
432 vsri.32 q11, q9, #16 | |
433 vst1.64 {d20-d21},[r0,:128]! | |
434 vcvt.s32.f32 q13, q13, #16 | |
435 vst1.64 {d22-d23},[r0,:128]! | |
436 vsri.32 q12, q0, #16 | |
437 vld1.64 {d16-d17},[r3,:128]! | |
438 vsri.32 q13, q1, #16 | |
439 vst1.64 {d24-d25},[r0,:128]! | |
440 vcvt.s32.f32 q8, q8, #16 | |
441 vld1.64 {d18-d19},[r3,:128]! | |
442 vcvt.s32.f32 q9, q9, #16 | |
443 vld1.64 {d20-d21},[r1,:128]! | |
444 vcvt.s32.f32 q10, q10, #16 | |
445 vld1.64 {d22-d23},[r1,:128]! | |
446 vcvt.s32.f32 q11, q11, #16 | |
447 vst1.64 {d26-d27},[r0,:128]! | |
448 bne 1b | |
449 ands r2, r2, #15 | |
450 beq 3f | |
451 2: vsri.32 q10, q8, #16 | |
452 vld1.64 {d0-d1}, [r3,:128]! | |
453 vcvt.s32.f32 q0, q0, #16 | |
454 vld1.64 {d2-d3}, [r3,:128]! | |
455 vcvt.s32.f32 q1, q1, #16 | |
456 vld1.64 {d24-d25},[r1,:128]! | |
457 vcvt.s32.f32 q12, q12, #16 | |
458 vsri.32 q11, q9, #16 | |
459 vld1.64 {d26-d27},[r1,:128]! | |
460 vcvt.s32.f32 q13, q13, #16 | |
461 vst1.64 {d20-d21},[r0,:128]! | |
462 vsri.32 q12, q0, #16 | |
463 vst1.64 {d22-d23},[r0,:128]! | |
464 vsri.32 q13, q1, #16 | |
465 vst1.64 {d24-d27},[r0,:128]! | |
466 bx lr | |
467 3: vsri.32 q10, q8, #16 | |
468 vsri.32 q11, q9, #16 | |
469 vst1.64 {d20-d23},[r0,:128]! | |
470 bx lr | |
471 | |
472 4: push {r4-r8,lr} | |
473 cmp r3, #4 | |
474 lsl ip, r3, #1 | |
475 blt 4f | |
476 | |
477 @ 4 channels | |
478 5: ldmia r1!, {r4-r7} | |
479 mov lr, r2 | |
480 mov r8, r0 | |
481 vld1.64 {d16-d17},[r4,:128]! | |
482 vcvt.s32.f32 q8, q8, #16 | |
483 vld1.64 {d18-d19},[r5,:128]! | |
484 vcvt.s32.f32 q9, q9, #16 | |
485 vld1.64 {d20-d21},[r6,:128]! | |
486 vcvt.s32.f32 q10, q10, #16 | |
487 vld1.64 {d22-d23},[r7,:128]! | |
488 vcvt.s32.f32 q11, q11, #16 | |
489 6: subs lr, lr, #8 | |
490 vld1.64 {d0-d1}, [r4,:128]! | |
491 vcvt.s32.f32 q0, q0, #16 | |
492 vsri.32 q9, q8, #16 | |
493 vld1.64 {d2-d3}, [r5,:128]! | |
494 vcvt.s32.f32 q1, q1, #16 | |
495 vsri.32 q11, q10, #16 | |
496 vld1.64 {d4-d5}, [r6,:128]! | |
497 vcvt.s32.f32 q2, q2, #16 | |
498 vzip.32 d18, d22 | |
499 vld1.64 {d6-d7}, [r7,:128]! | |
500 vcvt.s32.f32 q3, q3, #16 | |
501 vzip.32 d19, d23 | |
502 vst1.64 {d18}, [r8], ip | |
503 vsri.32 q1, q0, #16 | |
504 vst1.64 {d22}, [r8], ip | |
505 vsri.32 q3, q2, #16 | |
506 vst1.64 {d19}, [r8], ip | |
507 vzip.32 d2, d6 | |
508 vst1.64 {d23}, [r8], ip | |
509 vzip.32 d3, d7 | |
510 beq 7f | |
511 vld1.64 {d16-d17},[r4,:128]! | |
512 vcvt.s32.f32 q8, q8, #16 | |
513 vst1.64 {d2}, [r8], ip | |
514 vld1.64 {d18-d19},[r5,:128]! | |
515 vcvt.s32.f32 q9, q9, #16 | |
516 vst1.64 {d6}, [r8], ip | |
517 vld1.64 {d20-d21},[r6,:128]! | |
518 vcvt.s32.f32 q10, q10, #16 | |
519 vst1.64 {d3}, [r8], ip | |
520 vld1.64 {d22-d23},[r7,:128]! | |
521 vcvt.s32.f32 q11, q11, #16 | |
522 vst1.64 {d7}, [r8], ip | |
523 b 6b | |
524 7: vst1.64 {d2}, [r8], ip | |
525 vst1.64 {d6}, [r8], ip | |
526 vst1.64 {d3}, [r8], ip | |
527 vst1.64 {d7}, [r8], ip | |
528 subs r3, r3, #4 | |
529 popeq {r4-r8,pc} | |
530 cmp r3, #4 | |
531 add r0, r0, #8 | |
532 bge 5b | |
533 | |
534 @ 2 channels | |
535 4: cmp r3, #2 | |
536 blt 4f | |
537 ldmia r1!, {r4-r5} | |
538 mov lr, r2 | |
539 mov r8, r0 | |
540 tst lr, #8 | |
541 vld1.64 {d16-d17},[r4,:128]! | |
542 vcvt.s32.f32 q8, q8, #16 | |
543 vld1.64 {d18-d19},[r5,:128]! | |
544 vcvt.s32.f32 q9, q9, #16 | |
545 vld1.64 {d20-d21},[r4,:128]! | |
546 vcvt.s32.f32 q10, q10, #16 | |
547 vld1.64 {d22-d23},[r5,:128]! | |
548 vcvt.s32.f32 q11, q11, #16 | |
549 beq 6f | |
550 subs lr, lr, #8 | |
551 beq 7f | |
552 vsri.32 d18, d16, #16 | |
553 vsri.32 d19, d17, #16 | |
554 vld1.64 {d16-d17},[r4,:128]! | |
555 vcvt.s32.f32 q8, q8, #16 | |
556 vst1.32 {d18[0]}, [r8], ip | |
557 vsri.32 d22, d20, #16 | |
558 vst1.32 {d18[1]}, [r8], ip | |
559 vsri.32 d23, d21, #16 | |
560 vst1.32 {d19[0]}, [r8], ip | |
561 vst1.32 {d19[1]}, [r8], ip | |
562 vld1.64 {d18-d19},[r5,:128]! | |
563 vcvt.s32.f32 q9, q9, #16 | |
564 vst1.32 {d22[0]}, [r8], ip | |
565 vst1.32 {d22[1]}, [r8], ip | |
566 vld1.64 {d20-d21},[r4,:128]! | |
567 vcvt.s32.f32 q10, q10, #16 | |
568 vst1.32 {d23[0]}, [r8], ip | |
569 vst1.32 {d23[1]}, [r8], ip | |
570 vld1.64 {d22-d23},[r5,:128]! | |
571 vcvt.s32.f32 q11, q11, #16 | |
572 6: subs lr, lr, #16 | |
573 vld1.64 {d0-d1}, [r4,:128]! | |
574 vcvt.s32.f32 q0, q0, #16 | |
575 vsri.32 d18, d16, #16 | |
576 vld1.64 {d2-d3}, [r5,:128]! | |
577 vcvt.s32.f32 q1, q1, #16 | |
578 vsri.32 d19, d17, #16 | |
579 vld1.64 {d4-d5}, [r4,:128]! | |
580 vcvt.s32.f32 q2, q2, #16 | |
581 vld1.64 {d6-d7}, [r5,:128]! | |
582 vcvt.s32.f32 q3, q3, #16 | |
583 vst1.32 {d18[0]}, [r8], ip | |
584 vsri.32 d22, d20, #16 | |
585 vst1.32 {d18[1]}, [r8], ip | |
586 vsri.32 d23, d21, #16 | |
587 vst1.32 {d19[0]}, [r8], ip | |
588 vsri.32 d2, d0, #16 | |
589 vst1.32 {d19[1]}, [r8], ip | |
590 vsri.32 d3, d1, #16 | |
591 vst1.32 {d22[0]}, [r8], ip | |
592 vsri.32 d6, d4, #16 | |
593 vst1.32 {d22[1]}, [r8], ip | |
594 vsri.32 d7, d5, #16 | |
595 vst1.32 {d23[0]}, [r8], ip | |
596 vst1.32 {d23[1]}, [r8], ip | |
597 beq 6f | |
598 vld1.64 {d16-d17},[r4,:128]! | |
599 vcvt.s32.f32 q8, q8, #16 | |
600 vst1.32 {d2[0]}, [r8], ip | |
601 vst1.32 {d2[1]}, [r8], ip | |
602 vld1.64 {d18-d19},[r5,:128]! | |
603 vcvt.s32.f32 q9, q9, #16 | |
604 vst1.32 {d3[0]}, [r8], ip | |
605 vst1.32 {d3[1]}, [r8], ip | |
606 vld1.64 {d20-d21},[r4,:128]! | |
607 vcvt.s32.f32 q10, q10, #16 | |
608 vst1.32 {d6[0]}, [r8], ip | |
609 vst1.32 {d6[1]}, [r8], ip | |
610 vld1.64 {d22-d23},[r5,:128]! | |
611 vcvt.s32.f32 q11, q11, #16 | |
612 vst1.32 {d7[0]}, [r8], ip | |
613 vst1.32 {d7[1]}, [r8], ip | |
614 bgt 6b | |
615 6: vst1.32 {d2[0]}, [r8], ip | |
616 vst1.32 {d2[1]}, [r8], ip | |
617 vst1.32 {d3[0]}, [r8], ip | |
618 vst1.32 {d3[1]}, [r8], ip | |
619 vst1.32 {d6[0]}, [r8], ip | |
620 vst1.32 {d6[1]}, [r8], ip | |
621 vst1.32 {d7[0]}, [r8], ip | |
622 vst1.32 {d7[1]}, [r8], ip | |
623 b 8f | |
624 7: vsri.32 d18, d16, #16 | |
625 vsri.32 d19, d17, #16 | |
626 vst1.32 {d18[0]}, [r8], ip | |
627 vsri.32 d22, d20, #16 | |
628 vst1.32 {d18[1]}, [r8], ip | |
629 vsri.32 d23, d21, #16 | |
630 vst1.32 {d19[0]}, [r8], ip | |
631 vst1.32 {d19[1]}, [r8], ip | |
632 vst1.32 {d22[0]}, [r8], ip | |
633 vst1.32 {d22[1]}, [r8], ip | |
634 vst1.32 {d23[0]}, [r8], ip | |
635 vst1.32 {d23[1]}, [r8], ip | |
636 8: subs r3, r3, #2 | |
637 add r0, r0, #4 | |
638 popeq {r4-r8,pc} | |
639 | |
640 @ 1 channel | |
641 4: ldr r4, [r1],#4 | |
642 tst r2, #8 | |
643 mov lr, r2 | |
644 mov r5, r0 | |
645 vld1.64 {d0-d1}, [r4,:128]! | |
646 vcvt.s32.f32 q0, q0, #16 | |
647 vld1.64 {d2-d3}, [r4,:128]! | |
648 vcvt.s32.f32 q1, q1, #16 | |
649 bne 8f | |
650 6: subs lr, lr, #16 | |
651 vld1.64 {d4-d5}, [r4,:128]! | |
652 vcvt.s32.f32 q2, q2, #16 | |
653 vld1.64 {d6-d7}, [r4,:128]! | |
654 vcvt.s32.f32 q3, q3, #16 | |
655 vst1.16 {d0[1]}, [r5,:16], ip | |
656 vst1.16 {d0[3]}, [r5,:16], ip | |
657 vst1.16 {d1[1]}, [r5,:16], ip | |
658 vst1.16 {d1[3]}, [r5,:16], ip | |
659 vst1.16 {d2[1]}, [r5,:16], ip | |
660 vst1.16 {d2[3]}, [r5,:16], ip | |
661 vst1.16 {d3[1]}, [r5,:16], ip | |
662 vst1.16 {d3[3]}, [r5,:16], ip | |
663 beq 7f | |
664 vld1.64 {d0-d1}, [r4,:128]! | |
665 vcvt.s32.f32 q0, q0, #16 | |
666 vld1.64 {d2-d3}, [r4,:128]! | |
667 vcvt.s32.f32 q1, q1, #16 | |
668 7: vst1.16 {d4[1]}, [r5,:16], ip | |
669 vst1.16 {d4[3]}, [r5,:16], ip | |
670 vst1.16 {d5[1]}, [r5,:16], ip | |
671 vst1.16 {d5[3]}, [r5,:16], ip | |
672 vst1.16 {d6[1]}, [r5,:16], ip | |
673 vst1.16 {d6[3]}, [r5,:16], ip | |
674 vst1.16 {d7[1]}, [r5,:16], ip | |
675 vst1.16 {d7[3]}, [r5,:16], ip | |
676 bgt 6b | |
677 pop {r4-r8,pc} | |
678 8: subs lr, lr, #8 | |
679 vst1.16 {d0[1]}, [r5,:16], ip | |
680 vst1.16 {d0[3]}, [r5,:16], ip | |
681 vst1.16 {d1[1]}, [r5,:16], ip | |
682 vst1.16 {d1[3]}, [r5,:16], ip | |
683 vst1.16 {d2[1]}, [r5,:16], ip | |
684 vst1.16 {d2[3]}, [r5,:16], ip | |
685 vst1.16 {d3[1]}, [r5,:16], ip | |
686 vst1.16 {d3[3]}, [r5,:16], ip | |
687 popeq {r4-r8,pc} | |
688 vld1.64 {d0-d1}, [r4,:128]! | |
689 vcvt.s32.f32 q0, q0, #16 | |
690 vld1.64 {d2-d3}, [r4,:128]! | |
691 vcvt.s32.f32 q1, q1, #16 | |
692 b 6b | |
693 .endfunc | |
8697 | 694 |
695 function ff_vector_fmul_neon, export=1 | |
696 mov r3, r0 | |
697 subs r2, r2, #8 | |
698 vld1.64 {d0-d3}, [r0,:128]! | |
699 vld1.64 {d4-d7}, [r1,:128]! | |
700 vmul.f32 q8, q0, q2 | |
701 vmul.f32 q9, q1, q3 | |
702 beq 3f | |
703 bics ip, r2, #15 | |
704 beq 2f | |
705 1: subs ip, ip, #16 | |
706 vld1.64 {d0-d1}, [r0,:128]! | |
707 vld1.64 {d4-d5}, [r1,:128]! | |
708 vmul.f32 q10, q0, q2 | |
709 vld1.64 {d2-d3}, [r0,:128]! | |
710 vld1.64 {d6-d7}, [r1,:128]! | |
711 vmul.f32 q11, q1, q3 | |
712 vst1.64 {d16-d19},[r3,:128]! | |
713 vld1.64 {d0-d1}, [r0,:128]! | |
714 vld1.64 {d4-d5}, [r1,:128]! | |
715 vmul.f32 q8, q0, q2 | |
716 vld1.64 {d2-d3}, [r0,:128]! | |
717 vld1.64 {d6-d7}, [r1,:128]! | |
718 vmul.f32 q9, q1, q3 | |
719 vst1.64 {d20-d23},[r3,:128]! | |
720 bne 1b | |
721 ands r2, r2, #15 | |
722 beq 3f | |
723 2: vld1.64 {d0-d1}, [r0,:128]! | |
724 vld1.64 {d4-d5}, [r1,:128]! | |
725 vst1.64 {d16-d17},[r3,:128]! | |
726 vmul.f32 q8, q0, q2 | |
727 vld1.64 {d2-d3}, [r0,:128]! | |
728 vld1.64 {d6-d7}, [r1,:128]! | |
729 vst1.64 {d18-d19},[r3,:128]! | |
730 vmul.f32 q9, q1, q3 | |
731 3: vst1.64 {d16-d19},[r3,:128]! | |
732 bx lr | |
733 .endfunc | |
8698 | 734 |
735 function ff_vector_fmul_window_neon, export=1 | |
736 vld1.32 {d16[],d17[]}, [sp,:32] | |
737 push {r4,r5,lr} | |
738 ldr lr, [sp, #16] | |
739 sub r2, r2, #8 | |
740 sub r5, lr, #2 | |
741 add r2, r2, r5, lsl #2 | |
742 add r4, r3, r5, lsl #3 | |
743 add ip, r0, r5, lsl #3 | |
744 mov r5, #-16 | |
745 vld1.64 {d0,d1}, [r1,:128]! | |
746 vld1.64 {d2,d3}, [r2,:128], r5 | |
747 vld1.64 {d4,d5}, [r3,:128]! | |
748 vld1.64 {d6,d7}, [r4,:128], r5 | |
749 1: subs lr, lr, #4 | |
750 vmov q11, q8 | |
751 vmla.f32 d22, d0, d4 | |
752 vmov q10, q8 | |
753 vmla.f32 d23, d1, d5 | |
754 vrev64.32 q3, q3 | |
755 vmla.f32 d20, d0, d7 | |
756 vrev64.32 q1, q1 | |
757 vmla.f32 d21, d1, d6 | |
758 beq 2f | |
759 vmla.f32 d22, d3, d7 | |
760 vld1.64 {d0,d1}, [r1,:128]! | |
761 vmla.f32 d23, d2, d6 | |
762 vld1.64 {d18,d19},[r2,:128], r5 | |
763 vmls.f32 d20, d3, d4 | |
764 vld1.64 {d24,d25},[r3,:128]! | |
765 vmls.f32 d21, d2, d5 | |
766 vld1.64 {d6,d7}, [r4,:128], r5 | |
767 vmov q1, q9 | |
768 vrev64.32 q11, q11 | |
769 vmov q2, q12 | |
770 vswp d22, d23 | |
771 vst1.64 {d20,d21},[r0,:128]! | |
772 vst1.64 {d22,d23},[ip,:128], r5 | |
773 b 1b | |
774 2: vmla.f32 d22, d3, d7 | |
775 vmla.f32 d23, d2, d6 | |
776 vmls.f32 d20, d3, d4 | |
777 vmls.f32 d21, d2, d5 | |
778 vrev64.32 q11, q11 | |
779 vswp d22, d23 | |
780 vst1.64 {d20,d21},[r0,:128]! | |
781 vst1.64 {d22,d23},[ip,:128], r5 | |
782 pop {r4,r5,pc} | |
783 .endfunc |