|
1 /*
|
|
2 * ARM NEON optimised DSP functions
|
|
3 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
|
4 *
|
|
5 * This file is part of FFmpeg.
|
|
6 *
|
|
7 * FFmpeg is free software; you can redistribute it and/or
|
|
8 * modify it under the terms of the GNU Lesser General Public
|
|
9 * License as published by the Free Software Foundation; either
|
|
10 * version 2.1 of the License, or (at your option) any later version.
|
|
11 *
|
|
12 * FFmpeg is distributed in the hope that it will be useful,
|
|
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
15 * Lesser General Public License for more details.
|
|
16 *
|
|
17 * You should have received a copy of the GNU Lesser General Public
|
|
18 * License along with FFmpeg; if not, write to the Free Software
|
|
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
20 */
|
|
21
|
|
22 #include "asm.S"
|
|
23
|
|
24 preserve8
|
|
25 .fpu neon
|
|
26 .text
|
|
27
|
|
.macro pixels16 avg=0
        @ Copy (avg=0) or average-into (avg=1) a 16-byte-wide block.
        @ r0 = dst (16-byte aligned), r1 = src, r2 = line stride in bytes,
        @ r3 = row count, decremented by 4 per iteration so it must be a
        @ multiple of 4.  NOTE(review): register roles inferred from the
        @ addressing below -- confirm against the C prototype.
.if \avg
        mov             ip,  r0                 @ second read pointer over dst for averaging
.endif
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
        vld1.64         {d4, d5},  [r1], r2
        pld             [r1, r2, lsl #2]        @ prefetch four source rows ahead
        vld1.64         {d6, d7},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d16,d17}, [ip], r2
        vrhadd.u8       q0,  q0,  q8            @ rounded average of src and existing dst
        vld1.64         {d18,d19}, [ip], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.64         {d20,d21}, [ip], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.64         {d22,d23}, [ip], r2
        vrhadd.u8       q3,  q3,  q11
.endif
        subs            r3,  r3,  #4            @ four rows per pass
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d2, d3},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
.endm
|
|
58
|
|
.macro pixels16_x2 vhadd=vrhadd.u8
        @ 16-wide horizontal half-pel: out[x] = avg(src[x], src[x+1]).
        @ \vhadd selects the averaging form: vrhadd.u8 rounds, vhadd.u8
        @ truncates (used by the _no_rnd variant).
        @ r0 = dst (16-byte aligned), r1 = src, r2 = stride, r3 = rows
        @ (multiple of 2).
1:      vld1.64         {d0-d2},  [r1], r2      @ load 24 bytes: row plus its +1 neighbour
        vld1.64         {d4-d6},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2            @ two rows per pass
        vext.8          q1,  q0,  q1,  #1       @ q1 = row shifted left by one byte
        \vhadd          q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        \vhadd          q2,  q2,  q3
        vst1.64         {d0, d1}, [r0,:128], r2
        vst1.64         {d4, d5}, [r0,:128], r2
        bne             1b
        bx              lr
.endm
|
|
74
|
|
.macro pixels16_y2 vhadd=vrhadd.u8
        @ 16-wide vertical half-pel: out[y] = avg(src[y], src[y+1]).
        @ r1 reads the even rows, ip the odd rows; each pointer advances
        @ two rows (lr) per load so every source row is loaded only once.
        push            {lr}
        add             ip,  r1,  r2            @ ip = one row below r1
        lsl             lr,  r2,  #1            @ lr = double stride
        vld1.64         {d0, d1}, [r1], lr
        vld1.64         {d2, d3}, [ip], lr
1:      subs            r3,  r3,  #2            @ two output rows per pass
        \vhadd          q2,  q0,  q1            @ avg(row y, row y+1)
        vld1.64         {d0, d1}, [r1], lr
        \vhadd          q3,  q0,  q1            @ avg(row y+2, row y+1)
        vld1.64         {d2, d3}, [ip], lr
        pld             [r1]
        pld             [ip]
        vst1.64         {d4, d5}, [r0,:128], r2
        vst1.64         {d6, d7}, [r0,:128], r2
        bne             1b
        pop             {pc}
.endm
|
|
93
|
|
.macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
        @ 16-wide 2x2 half-pel interpolation of four neighbours a,b,c,d:
        @   out = (a + b + c + d + 2) >> 2     (default: rounding vrshrn)
        @   out = (a + b + c + d + 1) >> 2     (no_rnd=1: +1 bias, truncating vshrn)
        @ q8/q10 hold the widened horizontal sums src[x]+src[x+1] of one
        @ row, q9/q11 those of the next row; each loop pass emits two rows
        @ while reloading one row per pointer (r1 even rows, ip odd rows).
        push            {lr}
        lsl             lr,  r2,  #1            @ pointers advance two rows at a time
        add             ip,  r1,  r2
        vld1.64         {d0-d2},  [r1], lr
        vld1.64         {d4-d6},  [ip], lr
.if \no_rnd
        vmov.i16        q13, #1                 @ bias applied before the truncating shift
.endif
        pld             [r1]
        pld             [ip]
        vext.8          q1,  q0,  q1,  #1       @ row shifted by one byte
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2            @ row 0 horizontal sums, low half
        vaddl.u8        q10, d1,  d3            @ row 0 horizontal sums, high half
        vaddl.u8        q9,  d4,  d6            @ row 1 horizontal sums, low half
        vaddl.u8        q11, d5,  d7            @ row 1 horizontal sums, high half
1:      subs            r3,  r3,  #2
        vld1.64         {d0-d2},  [r1], lr
        vadd.u16        q12, q8,  q9            @ four-neighbour sum, low half
        pld             [r1]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1 , q10, q11           @ four-neighbour sum, high half
        \vshrn          d28, q12, #2
.if \no_rnd
        vadd.u16        q1,  q1,  q13
.endif
        \vshrn          d29, q1,  #2
        vaddl.u8        q8,  d0,  d30           @ horizontal sums of the freshly loaded row
        vld1.64         {d2-d4},  [ip], lr
        vaddl.u8        q10, d1,  d31
        vst1.64         {d28,d29}, [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [ip]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        \vshrn          d30, q12, #2
.if \no_rnd
        vadd.u16        q0,  q0,  q13
.endif
        \vshrn          d31, q0,  #2
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.64         {d30,d31}, [r0,:128], r2
        bgt             1b
        pop             {pc}
.endm
|
|
147
|
|
.macro pixels8
        @ Copy an 8-byte-wide block: r0 = dst (8-byte aligned), r1 = src,
        @ r2 = stride, r3 = row count (multiple of 4).
1:      vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        pld             [r1, r2, lsl #2]        @ prefetch four source rows ahead
        vld1.64         {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
        subs            r3,  r3,  #4            @ four rows per pass
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        bne             1b
        bx              lr
.endm
|
|
165
|
|
.macro pixels8_x2 vhadd=vrhadd.u8
        @ 8-wide horizontal half-pel: out[x] = avg(src[x], src[x+1]).
        @ Two rows are packed into q0 and their one-byte-shifted copies
        @ into q1 so a single \vhadd produces both output rows.
1:      vld1.64         {d0, d1}, [r1], r2
        vext.8          d1,  d0,  d1,  #1       @ d1 = row 0 shifted by one byte
        vld1.64         {d2, d3}, [r1], r2
        vext.8          d3,  d2,  d3,  #1       @ d3 = row 1 shifted by one byte
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2            @ two rows per pass
        vswp            d1,  d2                 @ q0 = {row0,row1}, q1 = shifted pair
        \vhadd          q0,  q0,  q1
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        bne             1b
        bx              lr
.endm
|
|
181
|
|
.macro pixels8_y2 vhadd=vrhadd.u8
        @ 8-wide vertical half-pel: out[y] = avg(src[y], src[y+1]).
        @ r1 reads even rows, ip odd rows, each stepping two rows (lr).
        push            {lr}
        add             ip,  r1,  r2            @ ip = one row below r1
        lsl             lr,  r2,  #1            @ lr = double stride
        vld1.64         {d0}, [r1], lr
        vld1.64         {d1}, [ip], lr
1:      subs            r3,  r3,  #2            @ two output rows per pass
        \vhadd          d4,  d0,  d1            @ avg(row y, row y+1)
        vld1.64         {d0}, [r1], lr
        \vhadd          d5,  d0,  d1            @ avg(row y+2, row y+1)
        vld1.64         {d1}, [ip], lr
        pld             [r1]
        pld             [ip]
        vst1.64         {d4}, [r0,:64], r2
        vst1.64         {d5}, [r0,:64], r2
        bne             1b
        pop             {pc}
.endm
|
|
200
|
|
.macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
        @ 8-wide 2x2 half-pel interpolation of four neighbours a,b,c,d:
        @   out = (a + b + c + d + 2) >> 2     (default: rounding vrshrn)
        @   out = (a + b + c + d + 1) >> 2     (no_rnd=1: +1 bias, truncating vshrn)
        @ q8/q9 carry the widened horizontal sums src[x]+src[x+1] of the
        @ two most recent rows; each pass emits two output rows.
        push            {lr}
        lsl             lr,  r2,  #1            @ pointers advance two rows at a time
        add             ip,  r1,  r2            @ ip reads the odd rows
        vld1.64         {d0, d1}, [r1], lr
        vld1.64         {d2, d3}, [ip], lr
.if \no_rnd
        vmov.i16        q11, #1                 @ bias applied before the truncating shift
.endif
        pld             [r1]
        pld             [ip]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4            @ row 0 horizontal sums
        vaddl.u8        q9,  d2,  d6            @ row 1 horizontal sums
1:      subs            r3,  r3,  #2
        vld1.64         {d0, d1}, [r1], lr
        pld             [r1]
        vadd.u16        q10, q8,  q9            @ four-neighbour sum
        vext.8          d4,  d0,  d1,  #1
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vaddl.u8        q8,  d0,  d4            @ sums of the freshly loaded row
        \vshrn          d5,  q10, #2
        vld1.64         {d2, d3}, [ip], lr
        vadd.u16        q10, q8,  q9
        pld             [ip]
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vst1.64         {d5}, [r0,:64], r2
        \vshrn          d7,  q10, #2
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.64         {d7}, [r0,:64], r2
        bgt             1b
        pop             {pc}
.endm
|
|
240
|
|
.macro pixfunc pfx name suf rnd_op args:vararg
        @ Emit an exported function ff_\pfx\name\suf\()_neon whose body is
        @ the \name macro, expanded with \rnd_op and any extra arguments
        @ (e.g. the no-rounding opcode variants).
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd_op \args
.endfunc
.endm
|
|
246
|
|
.macro pixfunc2 pfx name args:vararg
        @ Emit both variants of a pixel function: the default (rounding)
        @ version with no extra arguments, and the variant described by
        @ \args (suffix plus substitute opcodes, e.g. _no_rnd + vhadd.u8).
        pixfunc         \pfx \name
        pixfunc         \pfx \name \args
.endm
|
|
251
|
|
function ff_put_h264_qpel16_mc00_neon, export=1
        @ mc00 = full-pel copy: preset the row count to 16 and fall
        @ through into ff_put_pixels16_neon, which the pixfunc invocation
        @ directly below this stub emits (no return here by design).
        mov             r3,  #16
.endfunc
|
|
255
|
|
@ Instantiate the put_pixels16 family: plain copy, plus rounding and
@ no-rounding variants of the x2/y2/xy2 half-pel interpolators.
pixfunc  put_ pixels16
pixfunc2 put_ pixels16_x2,  _no_rnd, vhadd.u8
pixfunc2 put_ pixels16_y2,  _no_rnd, vhadd.u8
pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1
|
|
260
|
|
function ff_avg_h264_qpel16_mc00_neon, export=1
        @ mc00 = full-pel average: preset the row count to 16 and fall
        @ through into ff_avg_pixels16_neon, emitted directly below.
        mov             r3,  #16
.endfunc
|
|
264
|
|
@ Instantiate ff_avg_pixels16_neon (pixels16 macro with avg=1).
pixfunc avg_ pixels16,, 1
|
|
266
|
|
function ff_put_h264_qpel8_mc00_neon, export=1
        @ mc00 = full-pel copy: preset the row count to 8 and fall
        @ through into ff_put_pixels8_neon, emitted directly below.
        mov             r3,  #8
.endfunc
|
|
270
|
|
@ Instantiate the put_pixels8 family: plain copy, plus rounding and
@ no-rounding variants of the x2/y2/xy2 half-pel interpolators.
pixfunc  put_ pixels8
pixfunc2 put_ pixels8_x2,  _no_rnd, vhadd.u8
pixfunc2 put_ pixels8_y2,  _no_rnd, vhadd.u8
pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1
|
|
function ff_put_signed_pixels_clamped_neon, export=1
        @ Store an 8x8 block of int16 coefficients as bytes: each value is
        @ saturated to [-128,127] (vqmovn.s16) and then biased by +128 to
        @ land in [0,255].
        @ r0 = int16 source (128 bytes, 16-byte aligned, consumed),
        @ r1 = byte destination, r2 = destination stride.
        @ Loads, narrows, bias-adds and stores are interleaved for
        @ pipelining; the order below is deliberate.
        vmov.u8         d31, #128               @ signed -> unsigned bias
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0,  q8
        vld1.64         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1,  q9
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2,  q8
        vld1.64         {d18-d19}, [r0,:128]!
        vadd.u8         d0,  d0,  d31
        vld1.64         {d20-d21}, [r0,:128]!
        vadd.u8         d1,  d1,  d31
        vld1.64         {d22-d23}, [r0,:128]!
        vadd.u8         d2,  d2,  d31
        vst1.64         {d0}, [r1,:64], r2
        vqmovn.s16      d3,  q9
        vst1.64         {d1}, [r1,:64], r2
        vqmovn.s16      d4,  q10
        vst1.64         {d2}, [r1,:64], r2
        vqmovn.s16      d5,  q11
        vld1.64         {d24-d25}, [r0,:128]!
        vadd.u8         d3,  d3,  d31
        vld1.64         {d26-d27}, [r0,:128]!
        vadd.u8         d4,  d4,  d31
        vadd.u8         d5,  d5,  d31
        vst1.64         {d3}, [r1,:64], r2
        vqmovn.s16      d6,  q12
        vst1.64         {d4}, [r1,:64], r2
        vqmovn.s16      d7,  q13
        vst1.64         {d5}, [r1,:64], r2
        vadd.u8         d6,  d6,  d31
        vadd.u8         d7,  d7,  d31
        vst1.64         {d6}, [r1,:64], r2
        vst1.64         {d7}, [r1,:64], r2
        bx              lr
.endfunc
|
|
312
|
|
function ff_add_pixels_clamped_neon, export=1
        @ For an 8x8 block: dst[y][x] = clip_uint8(dst[y][x] + block[y][x]).
        @ r0 = int16 block (128 bytes, 16-byte aligned, consumed),
        @ r1 = destination bytes (read pointer), r2 = stride.
        @ r3 shadows r1 as the write pointer so that stores always trail
        @ the corresponding loads; vaddw.u8 widens the pixels onto the
        @ coefficients and vqmovun.s16 saturates back to unsigned bytes.
        mov             r3,  r1
        vld1.64         {d16}, [r1,:64], r2
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0,  q0,  d16
        vld1.64         {d17}, [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vqmovun.s16     d0,  q0
        vld1.64         {d18}, [r1,:64], r2
        vaddw.u8        q1,  q1,  d17
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2,  q2,  d18
        vst1.64         {d0}, [r3,:64], r2
        vqmovun.s16     d2,  q1
        vld1.64         {d19}, [r1,:64], r2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3,  q3,  d19
        vqmovun.s16     d4,  q2
        vst1.64         {d2}, [r3,:64], r2
        vld1.64         {d16}, [r1,:64], r2
        vqmovun.s16     d6,  q3
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0,  q0,  d16
        vst1.64         {d4}, [r3,:64], r2
        vld1.64         {d17}, [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vaddw.u8        q1,  q1,  d17
        vst1.64         {d6}, [r3,:64], r2
        vqmovun.s16     d0,  q0
        vld1.64         {d18}, [r1,:64], r2
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2,  q2,  d18
        vst1.64         {d0}, [r3,:64], r2
        vqmovun.s16     d2,  q1
        vld1.64         {d19}, [r1,:64], r2
        vqmovun.s16     d4,  q2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3,  q3,  d19
        vst1.64         {d2}, [r3,:64], r2
        vqmovun.s16     d6,  q3
        vst1.64         {d4}, [r3,:64], r2
        vst1.64         {d6}, [r3,:64], r2
        bx              lr
.endfunc
|
|
357
|
|
function ff_float_to_int16_neon, export=1
        @ Convert r2 floats at r1 into int16 at r0 (both 16-byte aligned;
        @ r2 a multiple of 8).  vcvt produces 16.16 fixed point (saturating
        @ at 32 bits) and the >>16 narrowing shift keeps the integer part.
        @ NOTE(review): vshrn does not saturate to 16 bits, so inputs are
        @ presumably pre-scaled to int16 range -- confirm against callers.
        @ Loop structure: 1: unrolled body, 16 values/pass; 2: 8-value
        @ tail; 3: flush of the final 8 pre-converted values.
        subs            r2,  r2,  #8
        vld1.64         {d0-d1},  [r1,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        beq             3f                      @ exactly 8 values: just flush
        bics            ip,  r2,  #15           @ ip = count rounded down to 16
        beq             2f
1:      subs            ip,  ip,  #16
        vshrn.s32       d4,  q8,  #16
        vld1.64         {d0-d1},  [r1,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vshrn.s32       d5,  q9,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vld1.64         {d16-d17},[r1,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r1,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6-d7},  [r0,:128]!
        bne             1b
        ands            r2,  r2,  #15           @ remaining (0 or 8) values
        beq             3f
2:      vld1.64         {d0-d1},  [r1,:128]!
        vshrn.s32       d4,  q8,  #16
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vshrn.s32       d5,  q9,  #16
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vst1.64         {d6-d7},  [r0,:128]!
        bx              lr
3:      vshrn.s32       d4,  q8,  #16
        vshrn.s32       d5,  q9,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        bx              lr
.endfunc
|
|
401
|
|
function ff_float_to_int16_interleave_neon, export=1
        @ Convert and interleave multiple channels of floats into int16.
        @ r0 = dst, r1 = array of per-channel float pointers (all
        @ 16-byte aligned), r2 = samples per channel (multiple of 8),
        @ r3 = channel count.  NOTE(review): argument roles inferred from
        @ the code -- confirm against the C prototype.
        @ Conversion is float -> 16.16 fixed (vcvt) with the high halves
        @ discarded; vsri.32 packs two channels into one 32-bit lane
        @ (low 16 = first channel, high 16 = second), so a plain store
        @ yields interleaved int16 pairs.
        @ Paths: <2 channels tail-calls the mono converter; ==2 uses a
        @ dedicated vsri loop; otherwise the generic code handles 4
        @ channels per pass, then 2, then 1, with ip = 2*channels bytes
        @ as the output stride between samples of one channel.
        cmp             r3,  #2
        ldrlt           r1,  [r1]               @ 1 channel: fetch the single src pointer
        blt             ff_float_to_int16_neon  @ tail-call the mono converter
        bne             4f                      @ >2 channels: generic path

        @ dedicated 2-channel path: r3 = ch0 pointer, r1 = ch1 pointer
        ldr             r3,  [r1]
        ldr             r1,  [r1, #4]

        subs            r2,  r2,  #8
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16           @ main loop: 16 samples per channel/pass
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q10, q8,  #16           @ pack ch0 into low halves of ch1 lanes
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vld1.64         {d26-d27},[r1,:128]!
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d21},[r0,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q12, q0,  #16
        vld1.64         {d16-d17},[r3,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d25},[r0,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r3,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d26-d27},[r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vsri.32         q10, q8,  #16           @ 8-sample tail
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vsri.32         q11, q9,  #16
        vld1.64         {d26-d27},[r1,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d20-d21},[r0,:128]!
        vsri.32         q12, q0,  #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d27},[r0,:128]!
        bx              lr
3:      vsri.32         q10, q8,  #16           @ flush final 8 samples
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d23},[r0,:128]!
        bx              lr

4:      push            {r4-r8,lr}              @ generic multi-channel path
        cmp             r3,  #4
        lsl             ip,  r3,  #1            @ ip = 2*channels = output step in bytes
        blt             4f

        @ 4 channels
5:      ldmia           r1!, {r4-r7}            @ next four channel pointers
        mov             lr,  r2                 @ lr = samples left in this group
        mov             r8,  r0                 @ r8 = strided write cursor
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr,  lr,  #8
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q9,  q8,  #16           @ pair channels 0+1
        vld1.64         {d2-d3},  [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         q11, q10, #16           @ pair channels 2+3
        vld1.64         {d4-d5},  [r6,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vzip.32         d18, d22                @ interleave the pairs into 4-wide groups
        vld1.64         {d6-d7},  [r7,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vzip.32         d19, d23
        vst1.64         {d18}, [r8], ip
        vsri.32         q1,  q0,  #16
        vst1.64         {d22}, [r8], ip
        vsri.32         q3,  q2,  #16
        vst1.64         {d19}, [r8], ip
        vzip.32         d2,  d6
        vst1.64         {d23}, [r8], ip
        vzip.32         d3,  d7
        beq             7f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.64         {d2},  [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6},  [r8], ip
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.64         {d3},  [r8], ip
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d7},  [r8], ip
        b               6b
7:      vst1.64         {d2},  [r8], ip         @ flush last converted group
        vst1.64         {d6},  [r8], ip
        vst1.64         {d3},  [r8], ip
        vst1.64         {d7},  [r8], ip
        subs            r3,  r3,  #4
        popeq           {r4-r8,pc}              @ done if channel count was a multiple of 4
        cmp             r3,  #4
        add             r0,  r0,  #8            @ advance dst past the four finished channels
        bge             5b

        @ 2 channels
4:      cmp             r3,  #2
        blt             4f
        ldmia           r1!, {r4-r5}            @ next two channel pointers
        mov             lr,  r2
        mov             r8,  r0
        tst             lr,  #8                 @ odd half-block of 8 samples?
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             6f
        subs            lr,  lr,  #8
        beq             7f
        vsri.32         d18, d16, #16           @ consume the odd 8 first
        vsri.32         d19, d17, #16
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d18[0]}, [r8], ip      @ strided stores of ch0/ch1 pairs
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr,  lr,  #16           @ main loop: 16 samples per channel/pass
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         d18, d16, #16
        vld1.64         {d2-d3},  [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         d19, d17, #16
        vld1.64         {d4-d5},  [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},  [r5,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vsri.32         d2,  d0,  #16
        vst1.32         {d19[1]}, [r8], ip
        vsri.32         d3,  d1,  #16
        vst1.32         {d22[0]}, [r8], ip
        vsri.32         d6,  d4,  #16
        vst1.32         {d22[1]}, [r8], ip
        vsri.32         d7,  d5,  #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        beq             6f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d2[0]},  [r8], ip
        vst1.32         {d2[1]},  [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d3[0]},  [r8], ip
        vst1.32         {d3[1]},  [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d6[0]},  [r8], ip
        vst1.32         {d6[1]},  [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.32         {d7[0]},  [r8], ip
        vst1.32         {d7[1]},  [r8], ip
        bgt             6b
6:      vst1.32         {d2[0]},  [r8], ip      @ flush last converted block
        vst1.32         {d2[1]},  [r8], ip
        vst1.32         {d3[0]},  [r8], ip
        vst1.32         {d3[1]},  [r8], ip
        vst1.32         {d6[0]},  [r8], ip
        vst1.32         {d6[1]},  [r8], ip
        vst1.32         {d7[0]},  [r8], ip
        vst1.32         {d7[1]},  [r8], ip
        b               8f
7:      vsri.32         d18, d16, #16           @ exactly 8 samples: pack and store
        vsri.32         d19, d17, #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
8:      subs            r3,  r3,  #2
        add             r0,  r0,  #4            @ advance dst past the two finished channels
        popeq           {r4-r8,pc}

        @ 1 channel
4:      ldr             r4,  [r1],#4            @ last remaining channel pointer
        tst             r2,  #8                 @ odd half-block of 8 samples?
        mov             lr,  r2
        mov             r5,  r0
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        bne             8f
6:      subs            lr,  lr,  #16           @ 16 samples per pass
        vld1.64         {d4-d5},  [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},  [r4,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.16         {d0[1]},  [r5,:16], ip  @ odd 16-bit lanes hold the integer part
        vst1.16         {d0[3]},  [r5,:16], ip
        vst1.16         {d1[1]},  [r5,:16], ip
        vst1.16         {d1[3]},  [r5,:16], ip
        vst1.16         {d2[1]},  [r5,:16], ip
        vst1.16         {d2[3]},  [r5,:16], ip
        vst1.16         {d3[1]},  [r5,:16], ip
        vst1.16         {d3[3]},  [r5,:16], ip
        beq             7f
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
7:      vst1.16         {d4[1]},  [r5,:16], ip
        vst1.16         {d4[3]},  [r5,:16], ip
        vst1.16         {d5[1]},  [r5,:16], ip
        vst1.16         {d5[3]},  [r5,:16], ip
        vst1.16         {d6[1]},  [r5,:16], ip
        vst1.16         {d6[3]},  [r5,:16], ip
        vst1.16         {d7[1]},  [r5,:16], ip
        vst1.16         {d7[3]},  [r5,:16], ip
        bgt             6b
        pop             {r4-r8,pc}
8:      subs            lr,  lr,  #8            @ consume the odd 8 first
        vst1.16         {d0[1]},  [r5,:16], ip
        vst1.16         {d0[3]},  [r5,:16], ip
        vst1.16         {d1[1]},  [r5,:16], ip
        vst1.16         {d1[3]},  [r5,:16], ip
        vst1.16         {d2[1]},  [r5,:16], ip
        vst1.16         {d2[3]},  [r5,:16], ip
        vst1.16         {d3[1]},  [r5,:16], ip
        vst1.16         {d3[3]},  [r5,:16], ip
        popeq           {r4-r8,pc}
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        b               6b
.endfunc
|
|
|
function ff_vector_fmul_neon, export=1
        @ Element-wise multiply of two float vectors, result written over
        @ the first: dst[i] = dst[i] * src[i].
        @ r0 = dst / first input, r1 = second input, r2 = element count
        @ (multiple of 8); all buffers 16-byte aligned.
        @ r3 keeps the original r0 as the store pointer; stores trail the
        @ loads by one batch, which is what makes in-place operation safe.
        mov             r3,  r0
        subs            r2,  r2,  #8
        vld1.64         {d0-d3},  [r0,:128]!
        vld1.64         {d4-d7},  [r1,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f                      @ exactly 8 elements: just flush
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16           @ main loop: 16 elements per pass
        vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vmul.f32        q10, q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vmul.f32        q11, q1,  q3
        vst1.64         {d16-d19},[r3,:128]!
        vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.64         {d20-d23},[r3,:128]!
        bne             1b
        ands            r2,  r2,  #15           @ remaining (0 or 8) elements
        beq             3f
2:      vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vst1.64         {d16-d17},[r3,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vst1.64         {d18-d19},[r3,:128]!
        vmul.f32        q9,  q1,  q3
3:      vst1.64         {d16-d19},[r3,:128]!    @ flush the final products
        bx              lr
.endfunc
|
|
734
|
|
function ff_vector_fmul_window_neon, export=1
        @ Windowed overlap-add: combines src0 (read forwards) and src1
        @ (read backwards) through the window coefficients, producing the
        @ first half of dst forwards (r0) and the second half backwards
        @ (ip), with a bias added to every output.
        @ r0 = dst, r1 = src0, r2 = src1, r3 = window,
        @ [sp]    = add_bias (float, splat into q8),
        @ [sp,#4] = len.
        @ NOTE(review): argument layout inferred from the loads below --
        @ confirm against the C prototype of ff_vector_fmul_window.
        vld1.32         {d16[],d17[]}, [sp,:32] @ broadcast add_bias
        push            {r4,r5,lr}
        ldr             lr,  [sp, #16]          @ len (old sp + 4 after 3-reg push)
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5,  lsl #2   @ r2 -> last 16-byte block of src1
        add             r4,  r3,  r5,  lsl #3   @ r4 -> last block of the window
        add             ip,  r0,  r5,  lsl #3   @ ip -> last block of dst
        mov             r5,  #-16               @ stride for the backward pointers
        vld1.64         {d0,d1},  [r1,:128]!
        vld1.64         {d2,d3},  [r2,:128], r5
        vld1.64         {d4,d5},  [r3,:128]!
        vld1.64         {d6,d7},  [r4,:128], r5
1:      subs            lr,  lr,  #4            @ four samples from each end per pass
        vmov            q11, q8                 @ start both accumulators at the bias
        vmla.f32        d22, d0,  d4
        vmov            q10, q8
        vmla.f32        d23, d1,  d5
        vrev64.32       q3,  q3                 @ reverse backward-read operands lane-wise
        vmla.f32        d20, d0,  d7
        vrev64.32       q1,  q1
        vmla.f32        d21, d1,  d6
        beq             2f                      @ last iteration: finish without reloading
        vmla.f32        d22, d3,  d7
        vld1.64         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.64         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.64         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.64         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9                 @ rotate the freshly loaded data into place
        vrev64.32       q11, q11
        vmov            q2,  q12
        vswp            d22, d23                @ full reversal for the backward store
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7            @ tail: same math as above, no reloads
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
.endfunc
|