/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        preserve8
        .fpu neon
        .text

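@ pixels16: copy a 16-pixel-wide block, four rows per iteration; with
@ avg=1 the loaded rows are averaged into the existing destination
@ using vrhadd.u8 (rounding halving add).  Judging by the mc00
@ wrappers below, the register convention is the usual dsputil one:
@ r0 = dst, r1 = src, r2 = line_size, r3 = h (a multiple of 4).
@ Rough C equivalent of the avg=0 case, as a sketch:
@     for (i = 0; i < h; i++)
@         memcpy(dst + i * line_size, src + i * line_size, 16);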
.macro pixels16 avg=0
.if \avg
        mov             ip,  r0
.endif
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
        vld1.64         {d4, d5},  [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d6, d7},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d16,d17}, [ip], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.64         {d18,d19}, [ip], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.64         {d20,d21}, [ip], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.64         {d22,d23}, [ip], r2
        vrhadd.u8       q3,  q3,  q11
.endif
        subs            r3,  r3,  #4
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d2, d3},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
.endm

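@ pixels16_x2: horizontal half-pel interpolation, two rows at a time:
@ dst[x] = avg(src[x], src[x + 1]).  The default vrhadd.u8 rounds the
@ average up; the _no_rnd instantiations pass vhadd.u8, which
@ truncates.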
.macro pixels16_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        \vhadd          q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        \vhadd          q2,  q2,  q3
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        bne             1b
        bx              lr
.endm

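@ pixels16_y2: vertical half-pel interpolation: each output row is the
@ average of two consecutive input rows.  r1 and ip walk even and odd
@ rows with a doubled stride so every source row is loaded only once.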
.macro pixels16_y2 vhadd=vrhadd.u8
        push            {lr}
        add             ip,  r1,  r2
        lsl             lr,  r2,  #1
        vld1.64         {d0, d1},  [r1], lr
        vld1.64         {d2, d3},  [ip], lr
1:      subs            r3,  r3,  #2
        \vhadd          q2,  q0,  q1
        vld1.64         {d0, d1},  [r1], lr
        \vhadd          q3,  q0,  q1
        vld1.64         {d2, d3},  [ip], lr
        pld             [r1]
        pld             [ip]
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        pop             {pc}
.endm

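@ pixels16_xy2: 2D half-pel interpolation over a 2x2 window.  The
@ 16-bit horizontal pair sums live in q8-q11 and are carried from one
@ row to the next, so each row is widened only once.  Per pixel:
@   rounding: dst = (a + b + c + d + 2) >> 2  (vrshrn.u16)
@   no_rnd:   dst = (a + b + c + d + 1) >> 2  (bias of 1 in q13, then
@             a truncating vshrn.u16)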
.macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
        push            {lr}
        lsl             lr,  r2,  #1
        add             ip,  r1,  r2
        vld1.64         {d0-d2},   [r1], lr
        vld1.64         {d4-d6},   [ip], lr
.if \no_rnd
        vmov.i16        q13, #1
.endif
        pld             [r1]
        pld             [ip]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.64         {d0-d2},   [r1], lr
        vadd.u16        q12, q8,  q9
        pld             [r1]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        \vshrn          d28, q12, #2
.if \no_rnd
        vadd.u16        q1,  q1,  q13
.endif
        \vshrn          d29, q1,  #2
        vaddl.u8        q8,  d0,  d30
        vld1.64         {d2-d4},   [ip], lr
        vaddl.u8        q10, d1,  d31
        vst1.64         {d28,d29}, [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [ip]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        \vshrn          d30, q12, #2
.if \no_rnd
        vadd.u16        q0,  q0,  q13
.endif
        \vshrn          d31, q0,  #2
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.64         {d30,d31}, [r0,:128], r2
        bgt             1b
        pop             {pc}
.endm

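@ pixels8: 8-pixel-wide copy, four rows per iteration, one d register
@ per row; otherwise the same contract as pixels16.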
.macro pixels8
1:      vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
        subs            r3,  r3,  #4
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        bne             1b
        bx              lr
.endm

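@ pixels8_x2: 8-wide horizontal half-pel interpolation.  vswp packs
@ two rows into one q register so a single \vhadd averages both rows
@ at once.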
.macro pixels8_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0, d1},  [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.64         {d2, d3},  [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        \vhadd          q0,  q0,  q1
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        bne             1b
        bx              lr
.endm

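@ pixels8_y2: 8-wide vertical half-pel interpolation, using the same
@ two-pointer, double-stride scheme as pixels16_y2.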
.macro pixels8_y2 vhadd=vrhadd.u8
        push            {lr}
        add             ip,  r1,  r2
        lsl             lr,  r2,  #1
        vld1.64         {d0}, [r1], lr
        vld1.64         {d1}, [ip], lr
1:      subs            r3,  r3,  #2
        \vhadd          d4,  d0,  d1
        vld1.64         {d0}, [r1], lr
        \vhadd          d5,  d0,  d1
        vld1.64         {d1}, [ip], lr
        pld             [r1]
        pld             [ip]
        vst1.64         {d4}, [r0,:64], r2
        vst1.64         {d5}, [r0,:64], r2
        bne             1b
        pop             {pc}
.endm

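@ pixels8_xy2: 8-wide 2D half-pel interpolation with the same
@ accumulator reuse and rounding rules as pixels16_xy2; q8 and q9 hold
@ the pair sums of the current and previous rows.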
.macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
        push            {lr}
        lsl             lr,  r2,  #1
        add             ip,  r1,  r2
        vld1.64         {d0, d1},  [r1], lr
        vld1.64         {d2, d3},  [ip], lr
.if \no_rnd
        vmov.i16        q11, #1
.endif
        pld             [r1]
        pld             [ip]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.64         {d0, d1},  [r1], lr
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vaddl.u8        q8,  d0,  d4
        \vshrn          d5,  q10, #2
        vld1.64         {d2, d3},  [ip], lr
        vadd.u16        q10, q8,  q9
        pld             [ip]
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vst1.64         {d5}, [r0,:64], r2
        \vshrn          d7,  q10, #2
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.64         {d7}, [r0,:64], r2
        bgt             1b
        pop             {pc}
.endm

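@ pixfunc emits one exported wrapper around a macro body; pixfunc2
@ emits both the rounding variant and, from the extra arguments, the
@ _no_rnd variant of the same macro.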
.macro pixfunc pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name \rnd_op \args
.endfunc
.endm

.macro pixfunc2 pfx name args:vararg
        pixfunc \pfx \name
        pixfunc \pfx \name \args
.endm

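@ The h264 qpel mc00 functions only set h in r3 and then fall through
@ into the put/avg function emitted immediately below them, so each
@ pair must stay adjacent.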
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
.endfunc

        pixfunc  put_ pixels16
        pixfunc2 put_ pixels16_x2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_y2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
.endfunc

        pixfunc  avg_ pixels16,, 1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
.endfunc

        pixfunc  put_ pixels8
        pixfunc2 put_ pixels8_x2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_y2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1
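
@ ff_float_to_int16_neon: convert float samples (already scaled to the
@ int16 range) to signed 16-bit.  vcvt.s32.f32 ... #16 produces a
@ saturated 16.16 fixed-point value and vshrn.s32 #16 keeps its
@ integer half, so out-of-range input clips instead of wrapping.
@ Register convention, per the dsputil float_to_int16 prototype:
@ r0 = dst, r1 = src (16-byte aligned), r2 = len (a multiple of 8).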
function ff_float_to_int16_neon, export=1
        subs            r2,  r2,  #8
        vld1.64         {d0-d1},  [r1,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vshrn.s32       d4,  q8,  #16
        vld1.64         {d0-d1},  [r1,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vshrn.s32       d5,  q9,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vld1.64         {d16-d17},[r1,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r1,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6-d7},  [r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vld1.64         {d0-d1},  [r1,:128]!
        vshrn.s32       d4,  q8,  #16
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vshrn.s32       d5,  q9,  #16
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vst1.64         {d6-d7},  [r0,:128]!
        bx              lr
3:      vshrn.s32       d4,  q8,  #16
        vshrn.s32       d5,  q9,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        bx              lr
.endfunc

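@ ff_float_to_int16_interleave_neon: as above, but r1 points to an
@ array of per-channel source pointers and the results are written
@ interleaved.  r0 = dst, r1 = src, r2 = len, r3 = channels.  Mono
@ tail-calls the plain function; stereo packs two samples per 32-bit
@ lane with vsri.32; other channel counts use the generic code at 4:.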
function ff_float_to_int16_interleave_neon, export=1
        cmp             r3,  #2
        ldrlt           r1,  [r1]
        blt             ff_float_to_int16_neon
        bne             4f

        ldr             r3,  [r1]
        ldr             r1,  [r1, #4]

        subs            r2,  r2,  #8
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q10, q8,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vld1.64         {d26-d27},[r1,:128]!
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d21},[r0,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q12, q0,  #16
        vld1.64         {d16-d17},[r3,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d25},[r0,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r3,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d26-d27},[r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vsri.32         q10, q8,  #16
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vsri.32         q11, q9,  #16
        vld1.64         {d26-d27},[r1,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d20-d21},[r0,:128]!
        vsri.32         q12, q0,  #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d27},[r0,:128]!
        bx              lr
3:      vsri.32         q10, q8,  #16
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d23},[r0,:128]!
        bx              lr

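@ Generic path: interleave four channels per pass while at least four
@ remain, then fall through to the two- and one-channel code for the
@ remainder.  ip is the output stride in bytes (2 * channels).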
4:      push            {r4-r8,lr}
        cmp             r3,  #4
        lsl             ip,  r3,  #1
        blt             4f

        @ 4 channels
5:      ldmia           r1!, {r4-r7}
        mov             lr,  r2
        mov             r8,  r0
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr,  lr,  #8
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q9,  q8,  #16
        vld1.64         {d2-d3},  [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         q11, q10, #16
        vld1.64         {d4-d5},  [r6,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vzip.32         d18, d22
        vld1.64         {d6-d7},  [r7,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vzip.32         d19, d23
        vst1.64         {d18}, [r8], ip
        vsri.32         q1,  q0,  #16
        vst1.64         {d22}, [r8], ip
        vsri.32         q3,  q2,  #16
        vst1.64         {d19}, [r8], ip
        vzip.32         d2,  d6
        vst1.64         {d23}, [r8], ip
        vzip.32         d3,  d7
        beq             7f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.64         {d2},  [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6},  [r8], ip
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.64         {d3},  [r8], ip
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d7},  [r8], ip
        b               6b
7:      vst1.64         {d2},  [r8], ip
        vst1.64         {d6},  [r8], ip
        vst1.64         {d3},  [r8], ip
        vst1.64         {d7},  [r8], ip
        subs            r3,  r3,  #4
        popeq           {r4-r8,pc}
        cmp             r3,  #4
        add             r0,  r0,  #8
        bge             5b

        @ 2 channels
4:      cmp             r3,  #2
        blt             4f
        ldmia           r1!, {r4-r5}
        mov             lr,  r2
        mov             r8,  r0
        tst             lr,  #8
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             6f
        subs            lr,  lr,  #8
        beq             7f
        vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr,  lr,  #16
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         d18, d16, #16
        vld1.64         {d2-d3},  [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         d19, d17, #16
        vld1.64         {d4-d5},  [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},  [r5,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vsri.32         d2,  d0,  #16
        vst1.32         {d19[1]}, [r8], ip
        vsri.32         d3,  d1,  #16
        vst1.32         {d22[0]}, [r8], ip
        vsri.32         d6,  d4,  #16
        vst1.32         {d22[1]}, [r8], ip
        vsri.32         d7,  d5,  #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        beq             6f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d2[0]},  [r8], ip
        vst1.32         {d2[1]},  [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d3[0]},  [r8], ip
        vst1.32         {d3[1]},  [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d6[0]},  [r8], ip
        vst1.32         {d6[1]},  [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.32         {d7[0]},  [r8], ip
        vst1.32         {d7[1]},  [r8], ip
        bgt             6b
6:      vst1.32         {d2[0]},  [r8], ip
        vst1.32         {d2[1]},  [r8], ip
        vst1.32         {d3[0]},  [r8], ip
        vst1.32         {d3[1]},  [r8], ip
        vst1.32         {d6[0]},  [r8], ip
        vst1.32         {d6[1]},  [r8], ip
        vst1.32         {d7[0]},  [r8], ip
        vst1.32         {d7[1]},  [r8], ip
        b               8f
7:      vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
8:      subs            r3,  r3,  #2
        add             r0,  r0,  #4
        popeq           {r4-r8,pc}

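@ Mono remainder: the same conversion as ff_float_to_int16_neon, but
@ results are stored one 16-bit lane at a time with stride ip so they
@ land in their interleaved slots.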
        @ 1 channel
4:      ldr             r4,  [r1], #4
        tst             r2,  #8
        mov             lr,  r2
        mov             r5,  r0
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        bne             8f
6:      subs            lr,  lr,  #16
        vld1.64         {d4-d5},  [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},  [r4,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.16         {d0[1]},  [r5,:16], ip
        vst1.16         {d0[3]},  [r5,:16], ip
        vst1.16         {d1[1]},  [r5,:16], ip
        vst1.16         {d1[3]},  [r5,:16], ip
        vst1.16         {d2[1]},  [r5,:16], ip
        vst1.16         {d2[3]},  [r5,:16], ip
        vst1.16         {d3[1]},  [r5,:16], ip
        vst1.16         {d3[3]},  [r5,:16], ip
        beq             7f
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
7:      vst1.16         {d4[1]},  [r5,:16], ip
        vst1.16         {d4[3]},  [r5,:16], ip
        vst1.16         {d5[1]},  [r5,:16], ip
        vst1.16         {d5[3]},  [r5,:16], ip
        vst1.16         {d6[1]},  [r5,:16], ip
        vst1.16         {d6[3]},  [r5,:16], ip
        vst1.16         {d7[1]},  [r5,:16], ip
        vst1.16         {d7[3]},  [r5,:16], ip
        bgt             6b
        pop             {r4-r8,pc}
8:      subs            lr,  lr,  #8
        vst1.16         {d0[1]},  [r5,:16], ip
        vst1.16         {d0[3]},  [r5,:16], ip
        vst1.16         {d1[1]},  [r5,:16], ip
        vst1.16         {d1[3]},  [r5,:16], ip
        vst1.16         {d2[1]},  [r5,:16], ip
        vst1.16         {d2[3]},  [r5,:16], ip
        vst1.16         {d3[1]},  [r5,:16], ip
        vst1.16         {d3[3]},  [r5,:16], ip
        popeq           {r4-r8,pc}
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        b               6b
.endfunc
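
@ ff_vector_fmul_neon: elementwise dst[i] *= src[i] over len floats
@ (len a multiple of 8, both buffers 16-byte aligned), following the
@ dsputil vector_fmul contract: r0 = dst (also the first operand),
@ r1 = src, r2 = len.  Loads run one block ahead of the multiplies to
@ hide memory latency.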
function ff_vector_fmul_neon, export=1
        mov             r3,  r0
        subs            r2,  r2,  #8
        vld1.64         {d0-d3},  [r0,:128]!
        vld1.64         {d4-d7},  [r1,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vmul.f32        q10, q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vmul.f32        q11, q1,  q3
        vst1.64         {d16-d19},[r3,:128]!
        vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.64         {d20-d23},[r3,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vst1.64         {d16-d17},[r3,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vst1.64         {d18-d19},[r3,:128]!
        vmul.f32        q9,  q1,  q3
3:      vst1.64         {d16-d19},[r3,:128]!
        bx              lr
.endfunc
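
@ ff_vector_fmul_window_neon: windowed overlap-add, matching the
@ dsputil vector_fmul_window prototype: r0 = dst, r1 = src0,
@ r2 = src1, r3 = win, with add_bias and len passed on the stack
@ (q8 is preloaded with the bias).  A sketch of the C reference this
@ mirrors, with dst and win 2*len elements long:
@     for (i = 0; i < len; i++) {
@         dst[i]         = src0[i] * win[2*len-1-i]
@                        - src1[len-1-i] * win[i] + add_bias;
@         dst[2*len-1-i] = src0[i] * win[i]
@                        + src1[len-1-i] * win[2*len-1-i] + add_bias;
@     }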
function ff_vector_fmul_window_neon, export=1
        vld1.32         {d16[],d17[]}, [sp,:32]
        push            {r4,r5,lr}
        ldr             lr,  [sp, #16]
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5,  lsl #2
        add             r4,  r3,  r5,  lsl #3
        add             ip,  r0,  r5,  lsl #3
        mov             r5,  #-16
        vld1.64         {d0,d1},  [r1,:128]!
        vld1.64         {d2,d3},  [r2,:128], r5
        vld1.64         {d4,d5},  [r3,:128]!
        vld1.64         {d6,d7},  [r4,:128], r5
1:      subs            lr,  lr,  #4
        vmov            q11, q8
        vmla.f32        d22, d0,  d4
        vmov            q10, q8
        vmla.f32        d23, d1,  d5
        vrev64.32       q3,  q3
        vmla.f32        d20, d0,  d7
        vrev64.32       q1,  q1
        vmla.f32        d21, d1,  d6
        beq             2f
        vmla.f32        d22, d3,  d7
        vld1.64         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.64         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.64         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.64         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11
        vmov            q2,  q12
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
.endfunc