arm/dsputil_neon.S @ 10360:b72bb442a775 (libavcodec)

ARM: clean up file/function naming conventions

author:   mru
date:     Sun, 04 Oct 2009 13:13:02 +0000
parents:  arm/dsputil_neon_s.S@be725249ea67
children: 199949177888
comparing 10359:48be79afc72d with 10360:b72bb442a775

/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

        preserve8
        .text

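@ Block copy/average macros. Register mapping is assumed from the
@ dsputil op_pixels_func prototype (not declared in this file):
@ r0 = dst, r1 = src, r2 = line stride, r3 = height in rows.
@ pixels16 copies a 16-byte-wide block four rows per iteration;
@ with avg=1 each row is also rounding-averaged (vrhadd.u8) against
@ the data already at dst, giving the avg_ variant.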
.macro pixels16 avg=0
.if \avg
        mov ip, r0
.endif
1:      vld1.64 {d0, d1}, [r1], r2
        vld1.64 {d2, d3}, [r1], r2
        vld1.64 {d4, d5}, [r1], r2
        pld [r1, r2, lsl #2]
        vld1.64 {d6, d7}, [r1], r2
        pld [r1]
        pld [r1, r2]
        pld [r1, r2, lsl #1]
.if \avg
        vld1.64 {d16,d17}, [ip,:128], r2
        vrhadd.u8 q0, q0, q8
        vld1.64 {d18,d19}, [ip,:128], r2
        vrhadd.u8 q1, q1, q9
        vld1.64 {d20,d21}, [ip,:128], r2
        vrhadd.u8 q2, q2, q10
        vld1.64 {d22,d23}, [ip,:128], r2
        vrhadd.u8 q3, q3, q11
.endif
        subs r3, r3, #4
        vst1.64 {d0, d1}, [r0,:128], r2
        vst1.64 {d2, d3}, [r0,:128], r2
        vst1.64 {d4, d5}, [r0,:128], r2
        vst1.64 {d6, d7}, [r0,:128], r2
        bne 1b
        bx lr
.endm

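@ pixels16_x2: horizontal half-pel interpolation. vext pulls the
@ source shifted by one byte, so each output is the average of a
@ pixel and its right-hand neighbour. The no-rnd variants pass
@ vhadd.u8 (truncating) in place of vrhadd.u8 (rounding).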
.macro pixels16_x2 vhadd=vrhadd.u8
1:      vld1.64 {d0-d2}, [r1], r2
        vld1.64 {d4-d6}, [r1], r2
        pld [r1]
        pld [r1, r2]
        subs r3, r3, #2
        vext.8 q1, q0, q1, #1
        \vhadd q0, q0, q1
        vext.8 q3, q2, q3, #1
        \vhadd q2, q2, q3
        vst1.64 {d0, d1}, [r0,:128], r2
        vst1.64 {d4, d5}, [r0,:128], r2
        bne 1b
        bx lr
.endm

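@ pixels16_y2: vertical half-pel interpolation; averages each row
@ with the one below it, keeping the previous row in registers so
@ every source row is loaded only once.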
.macro pixels16_y2 vhadd=vrhadd.u8
        vld1.64 {d0, d1}, [r1], r2
        vld1.64 {d2, d3}, [r1], r2
1:      subs r3, r3, #2
        \vhadd q2, q0, q1
        vld1.64 {d0, d1}, [r1], r2
        \vhadd q3, q0, q1
        vld1.64 {d2, d3}, [r1], r2
        pld [r1]
        pld [r1, r2]
        vst1.64 {d4, d5}, [r0,:128], r2
        vst1.64 {d6, d7}, [r0,:128], r2
        bne 1b
        bx lr
.endm

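@ pixels16_xy2: 2D half-pel interpolation over a 2x2 neighbourhood.
@ vaddl.u8 widens and sums horizontal pairs, vadd.u16 adds the row
@ above, and vrshrn.u16 #2 narrows with rounding, i.e.
@ (a+b+c+d+2)>>2. With no_rnd=1 a constant 1 is added and the
@ truncating vshrn is used instead, giving (a+b+c+d+1)>>2.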
.macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64 {d0-d2}, [r1], r2
        vld1.64 {d4-d6}, [r1], r2
.if \no_rnd
        vmov.i16 q13, #1
.endif
        pld [r1]
        pld [r1, r2]
        vext.8 q1, q0, q1, #1
        vext.8 q3, q2, q3, #1
        vaddl.u8 q8, d0, d2
        vaddl.u8 q10, d1, d3
        vaddl.u8 q9, d4, d6
        vaddl.u8 q11, d5, d7
1:      subs r3, r3, #2
        vld1.64 {d0-d2}, [r1], r2
        vadd.u16 q12, q8, q9
        pld [r1]
.if \no_rnd
        vadd.u16 q12, q12, q13
.endif
        vext.8 q15, q0, q1, #1
        vadd.u16 q1, q10, q11
        \vshrn d28, q12, #2
.if \no_rnd
        vadd.u16 q1, q1, q13
.endif
        \vshrn d29, q1, #2
        vaddl.u8 q8, d0, d30
        vld1.64 {d2-d4}, [r1], r2
        vaddl.u8 q10, d1, d31
        vst1.64 {d28,d29}, [r0,:128], r2
        vadd.u16 q12, q8, q9
        pld [r1, r2]
.if \no_rnd
        vadd.u16 q12, q12, q13
.endif
        vext.8 q2, q1, q2, #1
        vadd.u16 q0, q10, q11
        \vshrn d30, q12, #2
.if \no_rnd
        vadd.u16 q0, q0, q13
.endif
        \vshrn d31, q0, #2
        vaddl.u8 q9, d2, d4
        vaddl.u8 q11, d3, d5
        vst1.64 {d30,d31}, [r0,:128], r2
        bgt 1b
        bx lr
.endm

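@ pixels8 and the _x2/_y2/_xy2 variants below are the 8-byte-wide
@ counterparts of the 16-byte macros above, using d registers and
@ 64-bit aligned loads/stores.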
.macro pixels8
1:      vld1.64 {d0}, [r1], r2
        vld1.64 {d1}, [r1], r2
        vld1.64 {d2}, [r1], r2
        pld [r1, r2, lsl #2]
        vld1.64 {d3}, [r1], r2
        pld [r1]
        pld [r1, r2]
        pld [r1, r2, lsl #1]
        subs r3, r3, #4
        vst1.64 {d0}, [r0,:64], r2
        vst1.64 {d1}, [r0,:64], r2
        vst1.64 {d2}, [r0,:64], r2
        vst1.64 {d3}, [r0,:64], r2
        bne 1b
        bx lr
.endm

.macro pixels8_x2 vhadd=vrhadd.u8
1:      vld1.64 {d0, d1}, [r1], r2
        vext.8 d1, d0, d1, #1
        vld1.64 {d2, d3}, [r1], r2
        vext.8 d3, d2, d3, #1
        pld [r1]
        pld [r1, r2]
        subs r3, r3, #2
        vswp d1, d2
        \vhadd q0, q0, q1
        vst1.64 {d0}, [r0,:64], r2
        vst1.64 {d1}, [r0,:64], r2
        bne 1b
        bx lr
.endm

.macro pixels8_y2 vhadd=vrhadd.u8
        vld1.64 {d0}, [r1], r2
        vld1.64 {d1}, [r1], r2
1:      subs r3, r3, #2
        \vhadd d4, d0, d1
        vld1.64 {d0}, [r1], r2
        \vhadd d5, d0, d1
        vld1.64 {d1}, [r1], r2
        pld [r1]
        pld [r1, r2]
        vst1.64 {d4}, [r0,:64], r2
        vst1.64 {d5}, [r0,:64], r2
        bne 1b
        bx lr
.endm

.macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64 {d0, d1}, [r1], r2
        vld1.64 {d2, d3}, [r1], r2
.if \no_rnd
        vmov.i16 q11, #1
.endif
        pld [r1]
        pld [r1, r2]
        vext.8 d4, d0, d1, #1
        vext.8 d6, d2, d3, #1
        vaddl.u8 q8, d0, d4
        vaddl.u8 q9, d2, d6
1:      subs r3, r3, #2
        vld1.64 {d0, d1}, [r1], r2
        pld [r1]
        vadd.u16 q10, q8, q9
        vext.8 d4, d0, d1, #1
.if \no_rnd
        vadd.u16 q10, q10, q11
.endif
        vaddl.u8 q8, d0, d4
        \vshrn d5, q10, #2
        vld1.64 {d2, d3}, [r1], r2
        vadd.u16 q10, q8, q9
        pld [r1, r2]
.if \no_rnd
        vadd.u16 q10, q10, q11
.endif
        vst1.64 {d5}, [r0,:64], r2
        \vshrn d7, q10, #2
        vext.8 d6, d2, d3, #1
        vaddl.u8 q9, d2, d6
        vst1.64 {d7}, [r0,:64], r2
        bgt 1b
        bx lr
.endm

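@ pixfunc expands one of the macros above into an exported function
@ named ff_<pfx><name><suf>_neon; pixfunc2 emits both the rounding
@ version and a _no_rnd variant using the supplied substitute
@ operation.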
.macro pixfunc pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name \rnd_op \args
.endfunc
.endm

.macro pixfunc2 pfx name args:vararg
        pixfunc \pfx \name
        pixfunc \pfx \name \args
.endm

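@ The h264 qpel mc00 (full-pel) cases are plain block copies: each
@ stub sets the row count in r3 and then falls through into the
@ pixels function generated immediately after it (the .endfunc with
@ no return is intentional).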
function ff_put_h264_qpel16_mc00_neon, export=1
        mov r3, #16
.endfunc

pixfunc put_ pixels16
pixfunc2 put_ pixels16_x2, _no_rnd, vhadd.u8
pixfunc2 put_ pixels16_y2, _no_rnd, vhadd.u8
pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov r3, #16
.endfunc

pixfunc avg_ pixels16,, 1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov r3, #8
.endfunc

pixfunc put_ pixels8
pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8
pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8
pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1

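@ ff_put_pixels_clamped_neon: presumably (r0, r1, r2) = (int16_t
@ *block, uint8_t *pixels, int line_size), per the dsputil
@ prototype. Reads 64 coefficients and stores them as an 8x8 pixel
@ block, with vqmovun.s16 saturating each value to 0..255.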
function ff_put_pixels_clamped_neon, export=1
        vld1.64 {d16-d19}, [r0,:128]!
        vqmovun.s16 d0, q8
        vld1.64 {d20-d23}, [r0,:128]!
        vqmovun.s16 d1, q9
        vld1.64 {d24-d27}, [r0,:128]!
        vqmovun.s16 d2, q10
        vld1.64 {d28-d31}, [r0,:128]!
        vqmovun.s16 d3, q11
        vst1.64 {d0}, [r1,:64], r2
        vqmovun.s16 d4, q12
        vst1.64 {d1}, [r1,:64], r2
        vqmovun.s16 d5, q13
        vst1.64 {d2}, [r1,:64], r2
        vqmovun.s16 d6, q14
        vst1.64 {d3}, [r1,:64], r2
        vqmovun.s16 d7, q15
        vst1.64 {d4}, [r1,:64], r2
        vst1.64 {d5}, [r1,:64], r2
        vst1.64 {d6}, [r1,:64], r2
        vst1.64 {d7}, [r1,:64], r2
        bx lr
.endfunc

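@ ff_put_signed_pixels_clamped_neon: same layout as above, but the
@ coefficients are narrowed with signed saturation to -128..127
@ (vqmovn.s16) and then biased by +128 to form unsigned pixels.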
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8 d31, #128
        vld1.64 {d16-d17}, [r0,:128]!
        vqmovn.s16 d0, q8
        vld1.64 {d18-d19}, [r0,:128]!
        vqmovn.s16 d1, q9
        vld1.64 {d16-d17}, [r0,:128]!
        vqmovn.s16 d2, q8
        vld1.64 {d18-d19}, [r0,:128]!
        vadd.u8 d0, d0, d31
        vld1.64 {d20-d21}, [r0,:128]!
        vadd.u8 d1, d1, d31
        vld1.64 {d22-d23}, [r0,:128]!
        vadd.u8 d2, d2, d31
        vst1.64 {d0}, [r1,:64], r2
        vqmovn.s16 d3, q9
        vst1.64 {d1}, [r1,:64], r2
        vqmovn.s16 d4, q10
        vst1.64 {d2}, [r1,:64], r2
        vqmovn.s16 d5, q11
        vld1.64 {d24-d25}, [r0,:128]!
        vadd.u8 d3, d3, d31
        vld1.64 {d26-d27}, [r0,:128]!
        vadd.u8 d4, d4, d31
        vadd.u8 d5, d5, d31
        vst1.64 {d3}, [r1,:64], r2
        vqmovn.s16 d6, q12
        vst1.64 {d4}, [r1,:64], r2
        vqmovn.s16 d7, q13
        vst1.64 {d5}, [r1,:64], r2
        vadd.u8 d6, d6, d31
        vadd.u8 d7, d7, d31
        vst1.64 {d6}, [r1,:64], r2
        vst1.64 {d7}, [r1,:64], r2
        bx lr
.endfunc

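@ ff_add_pixels_clamped_neon: adds the 8x8 coefficient block at r0
@ to the pixels at r1 (stride r2) in place, widening with vaddw.u8
@ and narrowing back with unsigned saturation. r3 keeps a second
@ copy of the pixel pointer so stores can trail the loads.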
function ff_add_pixels_clamped_neon, export=1
        mov r3, r1
        vld1.64 {d16}, [r1,:64], r2
        vld1.64 {d0-d1}, [r0,:128]!
        vaddw.u8 q0, q0, d16
        vld1.64 {d17}, [r1,:64], r2
        vld1.64 {d2-d3}, [r0,:128]!
        vqmovun.s16 d0, q0
        vld1.64 {d18}, [r1,:64], r2
        vaddw.u8 q1, q1, d17
        vld1.64 {d4-d5}, [r0,:128]!
        vaddw.u8 q2, q2, d18
        vst1.64 {d0}, [r3,:64], r2
        vqmovun.s16 d2, q1
        vld1.64 {d19}, [r1,:64], r2
        vld1.64 {d6-d7}, [r0,:128]!
        vaddw.u8 q3, q3, d19
        vqmovun.s16 d4, q2
        vst1.64 {d2}, [r3,:64], r2
        vld1.64 {d16}, [r1,:64], r2
        vqmovun.s16 d6, q3
        vld1.64 {d0-d1}, [r0,:128]!
        vaddw.u8 q0, q0, d16
        vst1.64 {d4}, [r3,:64], r2
        vld1.64 {d17}, [r1,:64], r2
        vld1.64 {d2-d3}, [r0,:128]!
        vaddw.u8 q1, q1, d17
        vst1.64 {d6}, [r3,:64], r2
        vqmovun.s16 d0, q0
        vld1.64 {d18}, [r1,:64], r2
        vld1.64 {d4-d5}, [r0,:128]!
        vaddw.u8 q2, q2, d18
        vst1.64 {d0}, [r3,:64], r2
        vqmovun.s16 d2, q1
        vld1.64 {d19}, [r1,:64], r2
        vqmovun.s16 d4, q2
        vld1.64 {d6-d7}, [r0,:128]!
        vaddw.u8 q3, q3, d19
        vst1.64 {d2}, [r3,:64], r2
        vqmovun.s16 d6, q3
        vst1.64 {d4}, [r3,:64], r2
        vst1.64 {d6}, [r3,:64], r2
        bx lr
.endfunc

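@ ff_float_to_int16_neon: r0 = int16_t *dst, r1 = const float *src,
@ r2 = len (assumed a multiple of 8). vcvt with #16 fraction bits
@ converts to saturating 16.16 fixed point, so the following
@ vshrn #16 yields a correctly clipped int16 without an explicit
@ clamp. The main loop handles 16 samples, with a tail for len % 16.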
function ff_float_to_int16_neon, export=1
        subs r2, r2, #8
        vld1.64 {d0-d1}, [r1,:128]!
        vcvt.s32.f32 q8, q0, #16
        vld1.64 {d2-d3}, [r1,:128]!
        vcvt.s32.f32 q9, q1, #16
        beq 3f
        bics ip, r2, #15
        beq 2f
1:      subs ip, ip, #16
        vshrn.s32 d4, q8, #16
        vld1.64 {d0-d1}, [r1,:128]!
        vcvt.s32.f32 q0, q0, #16
        vshrn.s32 d5, q9, #16
        vld1.64 {d2-d3}, [r1,:128]!
        vcvt.s32.f32 q1, q1, #16
        vshrn.s32 d6, q0, #16
        vst1.64 {d4-d5}, [r0,:128]!
        vshrn.s32 d7, q1, #16
        vld1.64 {d16-d17},[r1,:128]!
        vcvt.s32.f32 q8, q8, #16
        vld1.64 {d18-d19},[r1,:128]!
        vcvt.s32.f32 q9, q9, #16
        vst1.64 {d6-d7}, [r0,:128]!
        bne 1b
        ands r2, r2, #15
        beq 3f
2:      vld1.64 {d0-d1}, [r1,:128]!
        vshrn.s32 d4, q8, #16
        vcvt.s32.f32 q0, q0, #16
        vld1.64 {d2-d3}, [r1,:128]!
        vshrn.s32 d5, q9, #16
        vcvt.s32.f32 q1, q1, #16
        vshrn.s32 d6, q0, #16
        vst1.64 {d4-d5}, [r0,:128]!
        vshrn.s32 d7, q1, #16
        vst1.64 {d6-d7}, [r0,:128]!
        bx lr
3:      vshrn.s32 d4, q8, #16
        vshrn.s32 d5, q9, #16
        vst1.64 {d4-d5}, [r0,:128]!
        bx lr
.endfunc

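@ ff_float_to_int16_interleave_neon: r0 = dst, r1 = array of channel
@ pointers, r2 = len, r3 = channels. A single channel tail-calls
@ ff_float_to_int16_neon; exactly two channels take an inline fast
@ path where vsri.32 #16 merges the integer parts of a sample pair
@ into one 32-bit lane; otherwise channels are processed in groups
@ of 4, 2 and 1 with an output stride of 2*channels bytes kept in ip.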
function ff_float_to_int16_interleave_neon, export=1
        cmp r3, #2
        ldrlt r1, [r1]
        blt ff_float_to_int16_neon
        bne 4f

        ldr r3, [r1]
        ldr r1, [r1, #4]

        subs r2, r2, #8
        vld1.64 {d0-d1}, [r3,:128]!
        vcvt.s32.f32 q8, q0, #16
        vld1.64 {d2-d3}, [r3,:128]!
        vcvt.s32.f32 q9, q1, #16
        vld1.64 {d20-d21},[r1,:128]!
        vcvt.s32.f32 q10, q10, #16
        vld1.64 {d22-d23},[r1,:128]!
        vcvt.s32.f32 q11, q11, #16
        beq 3f
        bics ip, r2, #15
        beq 2f
1:      subs ip, ip, #16
        vld1.64 {d0-d1}, [r3,:128]!
        vcvt.s32.f32 q0, q0, #16
        vsri.32 q10, q8, #16
        vld1.64 {d2-d3}, [r3,:128]!
        vcvt.s32.f32 q1, q1, #16
        vld1.64 {d24-d25},[r1,:128]!
        vcvt.s32.f32 q12, q12, #16
        vld1.64 {d26-d27},[r1,:128]!
        vsri.32 q11, q9, #16
        vst1.64 {d20-d21},[r0,:128]!
        vcvt.s32.f32 q13, q13, #16
        vst1.64 {d22-d23},[r0,:128]!
        vsri.32 q12, q0, #16
        vld1.64 {d16-d17},[r3,:128]!
        vsri.32 q13, q1, #16
        vst1.64 {d24-d25},[r0,:128]!
        vcvt.s32.f32 q8, q8, #16
        vld1.64 {d18-d19},[r3,:128]!
        vcvt.s32.f32 q9, q9, #16
        vld1.64 {d20-d21},[r1,:128]!
        vcvt.s32.f32 q10, q10, #16
        vld1.64 {d22-d23},[r1,:128]!
        vcvt.s32.f32 q11, q11, #16
        vst1.64 {d26-d27},[r0,:128]!
        bne 1b
        ands r2, r2, #15
        beq 3f
2:      vsri.32 q10, q8, #16
        vld1.64 {d0-d1}, [r3,:128]!
        vcvt.s32.f32 q0, q0, #16
        vld1.64 {d2-d3}, [r3,:128]!
        vcvt.s32.f32 q1, q1, #16
        vld1.64 {d24-d25},[r1,:128]!
        vcvt.s32.f32 q12, q12, #16
        vsri.32 q11, q9, #16
        vld1.64 {d26-d27},[r1,:128]!
        vcvt.s32.f32 q13, q13, #16
        vst1.64 {d20-d21},[r0,:128]!
        vsri.32 q12, q0, #16
        vst1.64 {d22-d23},[r0,:128]!
        vsri.32 q13, q1, #16
        vst1.64 {d24-d27},[r0,:128]!
        bx lr
3:      vsri.32 q10, q8, #16
        vsri.32 q11, q9, #16
        vst1.64 {d20-d23},[r0,:128]!
        bx lr

4:      push {r4-r8,lr}
        cmp r3, #4
        lsl ip, r3, #1
        blt 4f

        @ 4 channels
5:      ldmia r1!, {r4-r7}
        mov lr, r2
        mov r8, r0
        vld1.64 {d16-d17},[r4,:128]!
        vcvt.s32.f32 q8, q8, #16
        vld1.64 {d18-d19},[r5,:128]!
        vcvt.s32.f32 q9, q9, #16
        vld1.64 {d20-d21},[r6,:128]!
        vcvt.s32.f32 q10, q10, #16
        vld1.64 {d22-d23},[r7,:128]!
        vcvt.s32.f32 q11, q11, #16
6:      subs lr, lr, #8
        vld1.64 {d0-d1}, [r4,:128]!
        vcvt.s32.f32 q0, q0, #16
        vsri.32 q9, q8, #16
        vld1.64 {d2-d3}, [r5,:128]!
        vcvt.s32.f32 q1, q1, #16
        vsri.32 q11, q10, #16
        vld1.64 {d4-d5}, [r6,:128]!
        vcvt.s32.f32 q2, q2, #16
        vzip.32 d18, d22
        vld1.64 {d6-d7}, [r7,:128]!
        vcvt.s32.f32 q3, q3, #16
        vzip.32 d19, d23
        vst1.64 {d18}, [r8], ip
        vsri.32 q1, q0, #16
        vst1.64 {d22}, [r8], ip
        vsri.32 q3, q2, #16
        vst1.64 {d19}, [r8], ip
        vzip.32 d2, d6
        vst1.64 {d23}, [r8], ip
        vzip.32 d3, d7
        beq 7f
        vld1.64 {d16-d17},[r4,:128]!
        vcvt.s32.f32 q8, q8, #16
        vst1.64 {d2}, [r8], ip
        vld1.64 {d18-d19},[r5,:128]!
        vcvt.s32.f32 q9, q9, #16
        vst1.64 {d6}, [r8], ip
        vld1.64 {d20-d21},[r6,:128]!
        vcvt.s32.f32 q10, q10, #16
        vst1.64 {d3}, [r8], ip
        vld1.64 {d22-d23},[r7,:128]!
        vcvt.s32.f32 q11, q11, #16
        vst1.64 {d7}, [r8], ip
        b 6b
7:      vst1.64 {d2}, [r8], ip
        vst1.64 {d6}, [r8], ip
        vst1.64 {d3}, [r8], ip
        vst1.64 {d7}, [r8], ip
        subs r3, r3, #4
        popeq {r4-r8,pc}
        cmp r3, #4
        add r0, r0, #8
        bge 5b

        @ 2 channels
4:      cmp r3, #2
        blt 4f
        ldmia r1!, {r4-r5}
        mov lr, r2
        mov r8, r0
        tst lr, #8
        vld1.64 {d16-d17},[r4,:128]!
        vcvt.s32.f32 q8, q8, #16
        vld1.64 {d18-d19},[r5,:128]!
        vcvt.s32.f32 q9, q9, #16
        vld1.64 {d20-d21},[r4,:128]!
        vcvt.s32.f32 q10, q10, #16
        vld1.64 {d22-d23},[r5,:128]!
        vcvt.s32.f32 q11, q11, #16
        beq 6f
        subs lr, lr, #8
        beq 7f
        vsri.32 d18, d16, #16
        vsri.32 d19, d17, #16
        vld1.64 {d16-d17},[r4,:128]!
        vcvt.s32.f32 q8, q8, #16
        vst1.32 {d18[0]}, [r8], ip
        vsri.32 d22, d20, #16
        vst1.32 {d18[1]}, [r8], ip
        vsri.32 d23, d21, #16
        vst1.32 {d19[0]}, [r8], ip
        vst1.32 {d19[1]}, [r8], ip
        vld1.64 {d18-d19},[r5,:128]!
        vcvt.s32.f32 q9, q9, #16
        vst1.32 {d22[0]}, [r8], ip
        vst1.32 {d22[1]}, [r8], ip
        vld1.64 {d20-d21},[r4,:128]!
        vcvt.s32.f32 q10, q10, #16
        vst1.32 {d23[0]}, [r8], ip
        vst1.32 {d23[1]}, [r8], ip
        vld1.64 {d22-d23},[r5,:128]!
        vcvt.s32.f32 q11, q11, #16
6:      subs lr, lr, #16
        vld1.64 {d0-d1}, [r4,:128]!
        vcvt.s32.f32 q0, q0, #16
        vsri.32 d18, d16, #16
        vld1.64 {d2-d3}, [r5,:128]!
        vcvt.s32.f32 q1, q1, #16
        vsri.32 d19, d17, #16
        vld1.64 {d4-d5}, [r4,:128]!
        vcvt.s32.f32 q2, q2, #16
        vld1.64 {d6-d7}, [r5,:128]!
        vcvt.s32.f32 q3, q3, #16
        vst1.32 {d18[0]}, [r8], ip
        vsri.32 d22, d20, #16
        vst1.32 {d18[1]}, [r8], ip
        vsri.32 d23, d21, #16
        vst1.32 {d19[0]}, [r8], ip
        vsri.32 d2, d0, #16
        vst1.32 {d19[1]}, [r8], ip
        vsri.32 d3, d1, #16
        vst1.32 {d22[0]}, [r8], ip
        vsri.32 d6, d4, #16
        vst1.32 {d22[1]}, [r8], ip
        vsri.32 d7, d5, #16
        vst1.32 {d23[0]}, [r8], ip
        vst1.32 {d23[1]}, [r8], ip
        beq 6f
        vld1.64 {d16-d17},[r4,:128]!
        vcvt.s32.f32 q8, q8, #16
        vst1.32 {d2[0]}, [r8], ip
        vst1.32 {d2[1]}, [r8], ip
        vld1.64 {d18-d19},[r5,:128]!
        vcvt.s32.f32 q9, q9, #16
        vst1.32 {d3[0]}, [r8], ip
        vst1.32 {d3[1]}, [r8], ip
        vld1.64 {d20-d21},[r4,:128]!
        vcvt.s32.f32 q10, q10, #16
        vst1.32 {d6[0]}, [r8], ip
        vst1.32 {d6[1]}, [r8], ip
        vld1.64 {d22-d23},[r5,:128]!
        vcvt.s32.f32 q11, q11, #16
        vst1.32 {d7[0]}, [r8], ip
        vst1.32 {d7[1]}, [r8], ip
        bgt 6b
6:      vst1.32 {d2[0]}, [r8], ip
        vst1.32 {d2[1]}, [r8], ip
        vst1.32 {d3[0]}, [r8], ip
        vst1.32 {d3[1]}, [r8], ip
        vst1.32 {d6[0]}, [r8], ip
        vst1.32 {d6[1]}, [r8], ip
        vst1.32 {d7[0]}, [r8], ip
        vst1.32 {d7[1]}, [r8], ip
        b 8f
7:      vsri.32 d18, d16, #16
        vsri.32 d19, d17, #16
        vst1.32 {d18[0]}, [r8], ip
        vsri.32 d22, d20, #16
        vst1.32 {d18[1]}, [r8], ip
        vsri.32 d23, d21, #16
        vst1.32 {d19[0]}, [r8], ip
        vst1.32 {d19[1]}, [r8], ip
        vst1.32 {d22[0]}, [r8], ip
        vst1.32 {d22[1]}, [r8], ip
        vst1.32 {d23[0]}, [r8], ip
        vst1.32 {d23[1]}, [r8], ip
8:      subs r3, r3, #2
        add r0, r0, #4
        popeq {r4-r8,pc}

        @ 1 channel
4:      ldr r4, [r1],#4
        tst r2, #8
        mov lr, r2
        mov r5, r0
        vld1.64 {d0-d1}, [r4,:128]!
        vcvt.s32.f32 q0, q0, #16
        vld1.64 {d2-d3}, [r4,:128]!
        vcvt.s32.f32 q1, q1, #16
        bne 8f
6:      subs lr, lr, #16
        vld1.64 {d4-d5}, [r4,:128]!
        vcvt.s32.f32 q2, q2, #16
        vld1.64 {d6-d7}, [r4,:128]!
        vcvt.s32.f32 q3, q3, #16
        vst1.16 {d0[1]}, [r5,:16], ip
        vst1.16 {d0[3]}, [r5,:16], ip
        vst1.16 {d1[1]}, [r5,:16], ip
        vst1.16 {d1[3]}, [r5,:16], ip
        vst1.16 {d2[1]}, [r5,:16], ip
        vst1.16 {d2[3]}, [r5,:16], ip
        vst1.16 {d3[1]}, [r5,:16], ip
        vst1.16 {d3[3]}, [r5,:16], ip
        beq 7f
        vld1.64 {d0-d1}, [r4,:128]!
        vcvt.s32.f32 q0, q0, #16
        vld1.64 {d2-d3}, [r4,:128]!
        vcvt.s32.f32 q1, q1, #16
7:      vst1.16 {d4[1]}, [r5,:16], ip
        vst1.16 {d4[3]}, [r5,:16], ip
        vst1.16 {d5[1]}, [r5,:16], ip
        vst1.16 {d5[3]}, [r5,:16], ip
        vst1.16 {d6[1]}, [r5,:16], ip
        vst1.16 {d6[3]}, [r5,:16], ip
        vst1.16 {d7[1]}, [r5,:16], ip
        vst1.16 {d7[3]}, [r5,:16], ip
        bgt 6b
        pop {r4-r8,pc}
8:      subs lr, lr, #8
        vst1.16 {d0[1]}, [r5,:16], ip
        vst1.16 {d0[3]}, [r5,:16], ip
        vst1.16 {d1[1]}, [r5,:16], ip
        vst1.16 {d1[3]}, [r5,:16], ip
        vst1.16 {d2[1]}, [r5,:16], ip
        vst1.16 {d2[3]}, [r5,:16], ip
        vst1.16 {d3[1]}, [r5,:16], ip
        vst1.16 {d3[3]}, [r5,:16], ip
        popeq {r4-r8,pc}
        vld1.64 {d0-d1}, [r4,:128]!
        vcvt.s32.f32 q0, q0, #16
        vld1.64 {d2-d3}, [r4,:128]!
        vcvt.s32.f32 q1, q1, #16
        b 6b
.endfunc

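@ ff_vector_fmul_neon: r0 = dst (also the first source), r1 =
@ second source, r2 = len; computes dst[i] *= src[i], 16 floats per
@ iteration, with a tail block for len % 16.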
function ff_vector_fmul_neon, export=1
        mov r3, r0
        subs r2, r2, #8
        vld1.64 {d0-d3}, [r0,:128]!
        vld1.64 {d4-d7}, [r1,:128]!
        vmul.f32 q8, q0, q2
        vmul.f32 q9, q1, q3
        beq 3f
        bics ip, r2, #15
        beq 2f
1:      subs ip, ip, #16
        vld1.64 {d0-d1}, [r0,:128]!
        vld1.64 {d4-d5}, [r1,:128]!
        vmul.f32 q10, q0, q2
        vld1.64 {d2-d3}, [r0,:128]!
        vld1.64 {d6-d7}, [r1,:128]!
        vmul.f32 q11, q1, q3
        vst1.64 {d16-d19},[r3,:128]!
        vld1.64 {d0-d1}, [r0,:128]!
        vld1.64 {d4-d5}, [r1,:128]!
        vmul.f32 q8, q0, q2
        vld1.64 {d2-d3}, [r0,:128]!
        vld1.64 {d6-d7}, [r1,:128]!
        vmul.f32 q9, q1, q3
        vst1.64 {d20-d23},[r3,:128]!
        bne 1b
        ands r2, r2, #15
        beq 3f
2:      vld1.64 {d0-d1}, [r0,:128]!
        vld1.64 {d4-d5}, [r1,:128]!
        vst1.64 {d16-d17},[r3,:128]!
        vmul.f32 q8, q0, q2
        vld1.64 {d2-d3}, [r0,:128]!
        vld1.64 {d6-d7}, [r1,:128]!
        vst1.64 {d18-d19},[r3,:128]!
        vmul.f32 q9, q1, q3
3:      vst1.64 {d16-d19},[r3,:128]!
        bx lr
.endfunc

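@ ff_vector_fmul_window_neon: MDCT overlap-add windowing. Assuming
@ the dsputil C reference of this era, with j = len-1-i it computes
@ dst[i] = src0[i]*win[j] - src1[j]*win[i] + add_bias and
@ dst[j] = src0[i]*win[i] + src1[j]*win[j] + add_bias, walking the
@ buffers from both ends at once (vrev64.32 reverses lanes, r5 = -16
@ steps the descending pointers). The VFP/NOVFP alternatives fetch
@ add_bias and len from VFP registers or the stack depending on the
@ float ABI selected in asm.S.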
function ff_vector_fmul_window_neon, export=1
VFP     vdup.32 q8, d0[0]
NOVFP   vld1.32 {d16[],d17[]}, [sp,:32]
        push {r4,r5,lr}
VFP     ldr lr, [sp, #12]
NOVFP   ldr lr, [sp, #16]
        sub r2, r2, #8
        sub r5, lr, #2
        add r2, r2, r5, lsl #2
        add r4, r3, r5, lsl #3
        add ip, r0, r5, lsl #3
        mov r5, #-16
        vld1.64 {d0,d1}, [r1,:128]!
        vld1.64 {d2,d3}, [r2,:128], r5
        vld1.64 {d4,d5}, [r3,:128]!
        vld1.64 {d6,d7}, [r4,:128], r5
1:      subs lr, lr, #4
        vmov q11, q8
        vmla.f32 d22, d0, d4
        vmov q10, q8
        vmla.f32 d23, d1, d5
        vrev64.32 q3, q3
        vmla.f32 d20, d0, d7
        vrev64.32 q1, q1
        vmla.f32 d21, d1, d6
        beq 2f
        vmla.f32 d22, d3, d7
        vld1.64 {d0,d1}, [r1,:128]!
        vmla.f32 d23, d2, d6
        vld1.64 {d18,d19},[r2,:128], r5
        vmls.f32 d20, d3, d4
        vld1.64 {d24,d25},[r3,:128]!
        vmls.f32 d21, d2, d5
        vld1.64 {d6,d7}, [r4,:128], r5
        vmov q1, q9
        vrev64.32 q11, q11
        vmov q2, q12
        vswp d22, d23
        vst1.64 {d20,d21},[r0,:128]!
        vst1.64 {d22,d23},[ip,:128], r5
        b 1b
2:      vmla.f32 d22, d3, d7
        vmla.f32 d23, d2, d6
        vmls.f32 d20, d3, d4
        vmls.f32 d21, d2, d5
        vrev64.32 q11, q11
        vswp d22, d23
        vst1.64 {d20,d21},[r0,:128]!
        vst1.64 {d22,d23},[ip,:128], r5
        pop {r4,r5,pc}
.endfunc

#if CONFIG_VORBIS_DECODER
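@ ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int
@ blocksize): Vorbis magnitude/angle decoupling. Masks from vcle
@ and the sign bit of the magnitude (q10 = 1<<31) steer a
@ branch-free add/subtract of the two channels instead of
@ per-element branches.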
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32 q10, #1<<31
        subs r2, r2, #4
        mov r3, r0
        mov r12, r1
        beq 3f

        vld1.32 {d24-d25},[r1,:128]!
        vld1.32 {d22-d23},[r0,:128]!
        vcle.s32 q8, q12, #0
        vand q9, q11, q10
        veor q12, q12, q9
        vand q2, q12, q8
        vbic q3, q12, q8
        vadd.f32 q12, q11, q2
        vsub.f32 q11, q11, q3
1:      vld1.32 {d2-d3}, [r1,:128]!
        vld1.32 {d0-d1}, [r0,:128]!
        vcle.s32 q8, q1, #0
        vand q9, q0, q10
        veor q1, q1, q9
        vst1.32 {d24-d25},[r3, :128]!
        vst1.32 {d22-d23},[r12,:128]!
        vand q2, q1, q8
        vbic q3, q1, q8
        vadd.f32 q1, q0, q2
        vsub.f32 q0, q0, q3
        subs r2, r2, #8
        ble 2f
        vld1.32 {d24-d25},[r1,:128]!
        vld1.32 {d22-d23},[r0,:128]!
        vcle.s32 q8, q12, #0
        vand q9, q11, q10
        veor q12, q12, q9
        vst1.32 {d2-d3}, [r3, :128]!
        vst1.32 {d0-d1}, [r12,:128]!
        vand q2, q12, q8
        vbic q3, q12, q8
        vadd.f32 q12, q11, q2
        vsub.f32 q11, q11, q3
        b 1b

2:      vst1.32 {d2-d3}, [r3, :128]!
        vst1.32 {d0-d1}, [r12,:128]!
        bxlt lr

3:      vld1.32 {d2-d3}, [r1,:128]
        vld1.32 {d0-d1}, [r0,:128]
        vcle.s32 q8, q1, #0
        vand q9, q0, q10
        veor q1, q1, q9
        vand q2, q1, q8
        vbic q3, q1, q8
        vadd.f32 q1, q0, q2
        vsub.f32 q0, q0, q3
        vst1.32 {d2-d3}, [r0,:128]!
        vst1.32 {d0-d1}, [r1,:128]!
        bx lr
.endfunc
#endif

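@ ff_vector_fmul_scalar_neon: dst[i] = src[i] * mul. With a VFP
@ float ABI the scalar arrives in s0 (d0[0]) and len in r2;
@ otherwise the scalar is in r2 and len in r3, which is what the
@ VFP/NOVFP alternatives select between.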
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32 q8, d0[0]
NOVFP   vdup.32 q8, r2
        bics r12, len, #15
        beq 3f
        vld1.32 {q0},[r1,:128]!
        vld1.32 {q1},[r1,:128]!
1:      vmul.f32 q0, q0, q8
        vld1.32 {q2},[r1,:128]!
        vmul.f32 q1, q1, q8
        vld1.32 {q3},[r1,:128]!
        vmul.f32 q2, q2, q8
        vst1.32 {q0},[r0,:128]!
        vmul.f32 q3, q3, q8
        vst1.32 {q1},[r0,:128]!
        subs r12, r12, #16
        beq 2f
        vld1.32 {q0},[r1,:128]!
        vst1.32 {q2},[r0,:128]!
        vld1.32 {q1},[r1,:128]!
        vst1.32 {q3},[r0,:128]!
        b 1b
2:      vst1.32 {q2},[r0,:128]!
        vst1.32 {q3},[r0,:128]!
        ands len, len, #15
        bxeq lr
3:      vld1.32 {q0},[r1,:128]!
        vmul.f32 q0, q0, q8
        vst1.32 {q0},[r0,:128]!
        subs len, len, #4
        bgt 3b
        bx lr
        .unreq len
.endfunc

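@ ff_vector_fmul_sv_scalar_{2,4}_neon: multiply a contiguous vector
@ by a scalar and by a second vector gathered through an array of
@ pointers in r2, two or four floats per pointer (sv presumably
@ "scattered vector"; the callers are not named in this file).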
function ff_vector_fmul_sv_scalar_2_neon, export=1
VFP     vdup.32 d16, d0[0]
NOVFP   vdup.32 d16, r3
NOVFP   ldr r3, [sp]
        vld1.32 {d0},[r1,:64]!
        vld1.32 {d1},[r1,:64]!
1:      subs r3, r3, #4
        vmul.f32 d4, d0, d16
        vmul.f32 d5, d1, d16
        ldr r12, [r2], #4
        vld1.32 {d2},[r12,:64]
        ldr r12, [r2], #4
        vld1.32 {d3},[r12,:64]
        vmul.f32 d4, d4, d2
        vmul.f32 d5, d5, d3
        beq 2f
        vld1.32 {d0},[r1,:64]!
        vld1.32 {d1},[r1,:64]!
        vst1.32 {d4},[r0,:64]!
        vst1.32 {d5},[r0,:64]!
        b 1b
2:      vst1.32 {d4},[r0,:64]!
        vst1.32 {d5},[r0,:64]!
        bx lr
.endfunc

function ff_vector_fmul_sv_scalar_4_neon, export=1
VFP     vdup.32 q10, d0[0]
NOVFP   vdup.32 q10, r3
NOVFP   ldr r3, [sp]
        push {lr}
        bics lr, r3, #7
        beq 3f
        vld1.32 {q0},[r1,:128]!
        vld1.32 {q2},[r1,:128]!
1:      ldr r12, [r2], #4
        vld1.32 {q1},[r12,:128]
        ldr r12, [r2], #4
        vld1.32 {q3},[r12,:128]
        vmul.f32 q8, q0, q10
        vmul.f32 q8, q8, q1
        vmul.f32 q9, q2, q10
        vmul.f32 q9, q9, q3
        subs lr, lr, #8
        beq 2f
        vld1.32 {q0},[r1,:128]!
        vld1.32 {q2},[r1,:128]!
        vst1.32 {q8},[r0,:128]!
        vst1.32 {q9},[r0,:128]!
        b 1b
2:      vst1.32 {q8},[r0,:128]!
        vst1.32 {q9},[r0,:128]!
        ands r3, r3, #7
        popeq {pc}
3:      vld1.32 {q0},[r1,:128]!
        ldr r12, [r2], #4
        vld1.32 {q1},[r12,:128]
        vmul.f32 q0, q0, q10
        vmul.f32 q0, q0, q1
        vst1.32 {q0},[r0,:128]!
        subs r3, r3, #4
        bgt 3b
        pop {pc}
.endfunc

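@ ff_sv_fmul_scalar_{2,4}_neon: as above but without the contiguous
@ source; gathers two or four floats per pointer from the scattered
@ vector and scales them by the scalar alone.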
function ff_sv_fmul_scalar_2_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32 q8, d0[0]
NOVFP   vdup.32 q8, r2
        ldr r12, [r1], #4
        vld1.32 {d0},[r12,:64]
        ldr r12, [r1], #4
        vld1.32 {d1},[r12,:64]
1:      vmul.f32 q1, q0, q8
        subs len, len, #4
        beq 2f
        ldr r12, [r1], #4
        vld1.32 {d0},[r12,:64]
        ldr r12, [r1], #4
        vld1.32 {d1},[r12,:64]
        vst1.32 {q1},[r0,:128]!
        b 1b
2:      vst1.32 {q1},[r0,:128]!
        bx lr
        .unreq len
.endfunc

function ff_sv_fmul_scalar_4_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32 q8, d0[0]
NOVFP   vdup.32 q8, r2
1:      ldr r12, [r1], #4
        vld1.32 {q0},[r12,:128]
        vmul.f32 q0, q0, q8
        vst1.32 {q0},[r0,:128]!
        subs len, len, #4
        bgt 1b
        bx lr
        .unreq len
.endfunc

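@ ff_butterflies_float_neon(float *v1, float *v2, int len):
@ in-place butterfly; v1[i] becomes v1[i]+v2[i] while v2[i] becomes
@ v1[i]-v2[i], four lanes per iteration.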
function ff_butterflies_float_neon, export=1
1:      vld1.32 {q0},[r0,:128]
        vld1.32 {q1},[r1,:128]
        vsub.f32 q2, q0, q1
        vadd.f32 q1, q0, q1
        vst1.32 {q2},[r1,:128]!
        vst1.32 {q1},[r0,:128]!
        subs r2, r2, #4
        bgt 1b
        bx lr
.endfunc

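@ ff_scalarproduct_float_neon: dot product of two float vectors,
@ accumulated four lanes at a time and reduced with vadd/vpadd.
@ The NOVFP line moves the result into r0 for a soft-float ABI;
@ with a VFP ABI it is already in s0 (d0[0]).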
function ff_scalarproduct_float_neon, export=1
        vmov.f32 q2, #0.0
1:      vld1.32 {q0},[r0,:128]!
        vld1.32 {q1},[r1,:128]!
        vmla.f32 q2, q0, q1
        subs r2, r2, #4
        bgt 1b
        vadd.f32 d0, d4, d5
        vpadd.f32 d0, d0, d0
NOVFP   vmov.32 r0, d0[0]
        bx lr
.endfunc

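@ ff_int32_to_float_fmul_scalar_neon: dst[i] = src[i] * mul with an
@ int32 source; vcvt.f32.s32 converts, then the result is scaled by
@ the broadcast scalar, eight elements per iteration.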
function ff_int32_to_float_fmul_scalar_neon, export=1
VFP     vdup.32 q0, d0[0]
VFP     len .req r2
NOVFP   vdup.32 q0, r2
NOVFP   len .req r3

        vld1.32 {q1},[r1,:128]!
        vcvt.f32.s32 q3, q1
        vld1.32 {q2},[r1,:128]!
        vcvt.f32.s32 q8, q2
1:      subs len, len, #8
        pld [r1, #16]
        vmul.f32 q9, q3, q0
        vmul.f32 q10, q8, q0
        beq 2f
        vld1.32 {q1},[r1,:128]!
        vcvt.f32.s32 q3, q1
        vld1.32 {q2},[r1,:128]!
        vcvt.f32.s32 q8, q2
        vst1.32 {q9}, [r0,:128]!
        vst1.32 {q10},[r0,:128]!
        b 1b
2:      vst1.32 {q9}, [r0,:128]!
        vst1.32 {q10},[r0,:128]!
        bx lr
        .unreq len
.endfunc

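@ ff_vector_fmul_reverse_neon: dst[i] = src0[i] * src1[len-1-i];
@ walks src1 backwards (r12 = -32) and flips lane order with
@ vrev64.32 plus crossed d-register operands.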
function ff_vector_fmul_reverse_neon, export=1
        add r2, r2, r3, lsl #2
        sub r2, r2, #32
        mov r12, #-32
        vld1.32 {q0-q1}, [r1,:128]!
        vld1.32 {q2-q3}, [r2,:128], r12
1:      pld [r1, #32]
        vrev64.32 q3, q3
        vmul.f32 d16, d0, d7
        vmul.f32 d17, d1, d6
        pld [r2, #-32]
        vrev64.32 q2, q2
        vmul.f32 d18, d2, d5
        vmul.f32 d19, d3, d4
        subs r3, r3, #8
        beq 2f
        vld1.32 {q0-q1}, [r1,:128]!
        vld1.32 {q2-q3}, [r2,:128], r12
        vst1.32 {q8-q9}, [r0,:128]!
        b 1b
2:      vst1.32 {q8-q9}, [r0,:128]!
        bx lr
.endfunc

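@ ff_vector_fmul_add_neon: dst[i] = src0[i]*src1[i] + src2[i], with
@ len passed on the stack as the fifth argument.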
function ff_vector_fmul_add_neon, export=1
        ldr r12, [sp]
        vld1.32 {q0-q1}, [r1,:128]!
        vld1.32 {q8-q9}, [r2,:128]!
        vld1.32 {q2-q3}, [r3,:128]!
        vmul.f32 q10, q0, q8
        vmul.f32 q11, q1, q9
1:      vadd.f32 q12, q2, q10
        vadd.f32 q13, q3, q11
        pld [r1, #16]
        pld [r2, #16]
        pld [r3, #16]
        subs r12, r12, #8
        beq 2f
        vld1.32 {q0}, [r1,:128]!
        vld1.32 {q8}, [r2,:128]!
        vmul.f32 q10, q0, q8
        vld1.32 {q1}, [r1,:128]!
        vld1.32 {q9}, [r2,:128]!
        vmul.f32 q11, q1, q9
        vld1.32 {q2-q3}, [r3,:128]!
        vst1.32 {q12-q13},[r0,:128]!
        b 1b
2:      vst1.32 {q12-q13},[r0,:128]!
        bx lr
.endfunc

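@ ff_vector_clipf_neon: clamps each element of src into [min, max]
@ with vmin/vmax. min and max arrive in d0 under a VFP float ABI,
@ or in r2/r3 with len on the stack otherwise.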
function ff_vector_clipf_neon, export=1
VFP     vdup.32 q1, d0[1]
VFP     vdup.32 q0, d0[0]
NOVFP   vdup.32 q0, r2
NOVFP   vdup.32 q1, r3
NOVFP   ldr r2, [sp]
        vld1.f32 {q2},[r1,:128]!
        vmin.f32 q10, q2, q1
        vld1.f32 {q3},[r1,:128]!
        vmin.f32 q11, q3, q1
1:      vmax.f32 q8, q10, q0
        vmax.f32 q9, q11, q0
        subs r2, r2, #8
        beq 2f
        vld1.f32 {q2},[r1,:128]!
        vmin.f32 q10, q2, q1
        vld1.f32 {q3},[r1,:128]!
        vmin.f32 q11, q3, q1
        vst1.f32 {q8},[r0,:128]!
        vst1.f32 {q9},[r0,:128]!
        b 1b
2:      vst1.f32 {q8},[r0,:128]!
        vst1.f32 {q9},[r0,:128]!
        bx lr
.endfunc