comparison arm/h264dsp_neon.S @ 8359:9281a8a9387a libavcodec

ARM: replace "armv4l" with "arm"
author mru
date Wed, 17 Dec 2008 00:54:54 +0000
parents armv4l/h264dsp_neon.S@b294a0d5bc50
children 8d425ee85ddb
comparison
equal deleted inserted replaced
8358:c30b92cf446b 8359:9281a8a9387a
1 /*
2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include "asm.S"
22
23 .fpu neon
24
@ transpose_8x8: in-place transpose of an 8x8 byte matrix held in the eight
@ 64-bit registers \r0-\r7 (one row each), done as a three-stage butterfly:
@ 32-bit, then 16-bit, then 8-bit element swaps.
25 .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
26         vtrn.32         \r0, \r4
27         vtrn.32         \r1, \r5
28         vtrn.32         \r2, \r6
29         vtrn.32         \r3, \r7
30         vtrn.16         \r0, \r2
31         vtrn.16         \r1, \r3
32         vtrn.16         \r4, \r6
33         vtrn.16         \r5, \r7
34         vtrn.8          \r0, \r1
35         vtrn.8          \r2, \r3
36         vtrn.8          \r4, \r5
37         vtrn.8          \r6, \r7
38 .endm
39
@ swap4: exchange the contents of four register pairs (\r0<->\r4 .. \r3<->\r7).
@ Used to pre-swap halves before transpose16_4x4 when transposing 16-bit data.
40 .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
41         vswp            \r0, \r4
42         vswp            \r1, \r5
43         vswp            \r2, \r6
44         vswp            \r3, \r7
45 .endm
46
@ transpose16_4x4: transpose 4x4 blocks of 16-bit elements across the eight
@ q-registers \r0-\r7 (two-stage: 32-bit then 16-bit vtrn).
47 .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
48         vtrn.32         \r0, \r2
49         vtrn.32         \r1, \r3
50         vtrn.32         \r4, \r6
51         vtrn.32         \r5, \r7
52         vtrn.16         \r0, \r1
53         vtrn.16         \r2, \r3
54         vtrn.16         \r4, \r5
55         vtrn.16         \r6, \r7
.endm placeholder removed
56 .endm
57
58 /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
@ 8x8 H.264 chroma bilinear MC, result rounded with >>6.
@ Weights (from x=r4, y=r5): A=(8-x)(8-y)=r4(recomputed), B=x(8-y)=ip,
@ C=(8-x)y=r6, D=xy=r7; dst = (A*a + B*b + C*c + D*d + 32) >> 6.
@ \avg=1 additionally averages the result with the existing dst (lr keeps dst).
59 .macro  h264_chroma_mc8 avg=0
60         push            {r4-r7, lr}
@ load x (r4) and y (r5) — args 5 and 6, on the stack above the pushed regs
61         ldrd            r4, [sp, #20]
62 .if \avg
63         mov             lr,  r0
64 .endif
65         pld             [r1]
66         pld             [r1, r2]
67
@ D = x*y; muls sets flags, Z set when x==0 or y==0 (1-D case below)
68         muls            r7,  r4,  r5
@ C = 8y - xy
69         rsb             r6,  r7,  r5,  lsl #3
@ B = 8x - xy
70         rsb             ip,  r7,  r4,  lsl #3
@ A = 64 - 8x - 8y + xy, built in r4
71         sub             r4,  r7,  r4,  lsl #3
72         sub             r4,  r4,  r5,  lsl #3
73         add             r4,  r4,  #64
74
@ x*y == 0: fall back to the 1-D (or copy) paths at 2:
75         beq             2f
76
77         add             r5,  r1,  r2
78
@ full 2-D filter: d0-d3 = A,B,C,D broadcast; two rows processed per loop
79         vdup.8          d0,  r4
80         lsl             r4,  r2,  #1
81         vdup.8          d1,  ip
82         vld1.64         {d4, d5}, [r1], r4
83         vdup.8          d2,  r6
84         vld1.64         {d6, d7}, [r5], r4
85         vdup.8          d3,  r7
86
@ d5/d7 = same rows shifted left one pixel (the "b"/"d" taps)
87         vext.8          d5,  d4,  d5,  #1
88         vext.8          d7,  d6,  d7,  #1
89
90 1:      pld             [r5]
91         vmull.u8        q8,  d4,  d0
92         vmlal.u8        q8,  d5,  d1
93         vld1.64         {d4, d5}, [r1], r4
94         vmlal.u8        q8,  d6,  d2
95         vext.8          d5,  d4,  d5,  #1
96         vmlal.u8        q8,  d7,  d3
97         vmull.u8        q9,  d6,  d0
98         subs            r3,  r3,  #2
99         vmlal.u8        q9,  d7,  d1
100         vmlal.u8        q9,  d4,  d2
101         vmlal.u8        q9,  d5,  d3
@ rounding narrow shift: (sum + 32) >> 6
102         vrshrn.u16      d16, q8,  #6
103         vld1.64         {d6, d7}, [r5], r4
104         pld             [r1]
105         vrshrn.u16      d17, q9,  #6
106 .if \avg
107         vld1.64         {d20}, [lr,:64], r2
108         vld1.64         {d21}, [lr,:64], r2
109         vrhadd.u8       q8,  q8,  q10
110 .endif
111         vext.8          d7,  d6,  d7,  #1
112         vst1.64         {d16}, [r0,:64], r2
113         vst1.64         {d17}, [r0,:64], r2
114         bgt             1b
115
116         pop             {r4-r7, pc}
117
@ 1-D case: fold B+C into a single second tap (one of them is zero here)
118 2:      tst             r6,  r6
119         add             ip,  ip,  r6
120         vdup.8          d0,  r4
121         vdup.8          d1,  ip
122
@ C == 0 as well -> horizontal-only (or pure copy) path at 4:
123         beq             4f
124
@ vertical-only filter, two rows per iteration
125         add             r5,  r1,  r2
126         lsl             r4,  r2,  #1
127         vld1.64         {d4}, [r1], r4
128         vld1.64         {d6}, [r5], r4
129
130 3:      pld             [r5]
131         vmull.u8        q8,  d4,  d0
132         vmlal.u8        q8,  d6,  d1
133         vld1.64         {d4}, [r1], r4
134         vmull.u8        q9,  d6,  d0
135         vmlal.u8        q9,  d4,  d1
136         vld1.64         {d6}, [r5], r4
137         vrshrn.u16      d16, q8,  #6
138         vrshrn.u16      d17, q9,  #6
139 .if \avg
140         vld1.64         {d20}, [lr,:64], r2
141         vld1.64         {d21}, [lr,:64], r2
142         vrhadd.u8       q8,  q8,  q10
143 .endif
144         subs            r3,  r3,  #2
145         pld             [r1]
146         vst1.64         {d16}, [r0,:64], r2
147         vst1.64         {d17}, [r0,:64], r2
148         bgt             3b
149
150         pop             {r4-r7, pc}
151
@ horizontal-only filter (taps d0/d1 on pixel and pixel+1)
152 4:      vld1.64         {d4, d5}, [r1], r2
153         vld1.64         {d6, d7}, [r1], r2
154         vext.8          d5,  d4,  d5,  #1
155         vext.8          d7,  d6,  d7,  #1
156
157 5:      pld             [r1]
158         subs            r3,  r3,  #2
159         vmull.u8        q8,  d4,  d0
160         vmlal.u8        q8,  d5,  d1
161         vld1.64         {d4, d5}, [r1], r2
162         vmull.u8        q9,  d6,  d0
163         vmlal.u8        q9,  d7,  d1
164         pld             [r1]
165         vext.8          d5,  d4,  d5,  #1
166         vrshrn.u16      d16, q8,  #6
167         vrshrn.u16      d17, q9,  #6
168 .if \avg
169         vld1.64         {d20}, [lr,:64], r2
170         vld1.64         {d21}, [lr,:64], r2
171         vrhadd.u8       q8,  q8,  q10
172 .endif
173         vld1.64         {d6, d7}, [r1], r2
174         vext.8          d7,  d6,  d7,  #1
175         vst1.64         {d16}, [r0,:64], r2
176         vst1.64         {d17}, [r0,:64], r2
177         bgt             5b
178
179         pop             {r4-r7, pc}
180 .endm
181
182 /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
@ 4-pixel-wide variant of h264_chroma_mc8.  Same weight derivation
@ (A=r4, B=ip, C=r6, D=r7); processes two 4-pixel rows packed into one
@ d-register per iteration, hence the vtrn.32 interleaving of rows and of
@ the coefficient registers.
183 .macro  h264_chroma_mc4 avg=0
184         push            {r4-r7, lr}
@ load x (r4) and y (r5) from the stack (args 5 and 6)
185         ldrd            r4, [sp, #20]
186 .if \avg
187         mov             lr,  r0
188 .endif
189         pld             [r1]
190         pld             [r1, r2]
191
@ D = x*y (flags: Z when x*y == 0), then B, C, A as in chroma_mc8
192         muls            r7,  r4,  r5
193         rsb             r6,  r7,  r5,  lsl #3
194         rsb             ip,  r7,  r4,  lsl #3
195         sub             r4,  r7,  r4,  lsl #3
196         sub             r4,  r4,  r5,  lsl #3
197         add             r4,  r4,  #64
198
199         beq             2f
200
201         add             r5,  r1,  r2
202
203         vdup.8          d0,  r4
204         lsl             r4,  r2,  #1
205         vdup.8          d1,  ip
206         vld1.64         {d4}, [r1], r4
207         vdup.8          d2,  r6
208         vld1.64         {d6}, [r5], r4
209         vdup.8          d3,  r7
210
@ pack pixel/pixel+1 pairs: low half = row, high half = row shifted by 1
211         vext.8          d5,  d4,  d5,  #1
212         vext.8          d7,  d6,  d7,  #1
213         vtrn.32         d4,  d5
214         vtrn.32         d6,  d7
215
@ pack coefficients to match: d0 = A|B, d2 = C|D per 32-bit half
216         vtrn.32         d0,  d1
217         vtrn.32         d2,  d3
218
219 1:      pld             [r5]
220         vmull.u8        q8,  d4,  d0
221         vmlal.u8        q8,  d6,  d2
222         vld1.64         {d4}, [r1], r4
223         vext.8          d5,  d4,  d5,  #1
224         vtrn.32         d4,  d5
225         vmull.u8        q9,  d6,  d0
226         vmlal.u8        q9,  d4,  d2
227         vld1.64         {d6}, [r5], r4
@ horizontal add of the two packed halves completes each row's sum
228         vadd.i16        d16, d16, d17
229         vadd.i16        d17, d18, d19
230         vrshrn.u16      d16, q8,  #6
231         subs            r3,  r3,  #2
232         pld             [r1]
233 .if \avg
234         vld1.32         {d20[0]}, [lr,:32], r2
235         vld1.32         {d20[1]}, [lr,:32], r2
236         vrhadd.u8       d16, d16, d20
237 .endif
238         vext.8          d7,  d6,  d7,  #1
239         vtrn.32         d6,  d7
240         vst1.32         {d16[0]}, [r0,:32], r2
241         vst1.32         {d16[1]}, [r0,:32], r2
242         bgt             1b
243
244         pop             {r4-r7, pc}
245
@ 1-D case: merge B+C into the second tap; branch to 4: when vertical
@ component is zero (horizontal-only / copy)
246 2:      tst             r6,  r6
247         add             ip,  ip,  r6
248         vdup.8          d0,  r4
249         vdup.8          d1,  ip
250         vtrn.32         d0,  d1
251
252         beq             4f
253
@ vertical-only: d0/d1 hold the two taps, rows packed two-at-a-time
254         vext.32         d1,  d0,  d1,  #1
255         add             r5,  r1,  r2
256         lsl             r4,  r2,  #1
257         vld1.32         {d4[0]}, [r1], r4
258         vld1.32         {d4[1]}, [r5], r4
259
260 3:      pld             [r5]
261         vmull.u8        q8,  d4,  d0
262         vld1.32         {d4[0]}, [r1], r4
263         vmull.u8        q9,  d4,  d1
264         vld1.32         {d4[1]}, [r5], r4
265         vadd.i16        d16, d16, d17
266         vadd.i16        d17, d18, d19
267         vrshrn.u16      d16, q8,  #6
268 .if \avg
269         vld1.32         {d20[0]}, [lr,:32], r2
270         vld1.32         {d20[1]}, [lr,:32], r2
271         vrhadd.u8       d16, d16, d20
272 .endif
273         subs            r3,  r3,  #2
274         pld             [r1]
275         vst1.32         {d16[0]}, [r0,:32], r2
276         vst1.32         {d16[1]}, [r0,:32], r2
277         bgt             3b
278
279         pop             {r4-r7, pc}
280
@ horizontal-only filter
281 4:      vld1.64         {d4}, [r1], r2
282         vld1.64         {d6}, [r1], r2
283         vext.8          d5,  d4,  d5,  #1
284         vext.8          d7,  d6,  d7,  #1
285         vtrn.32         d4,  d5
286         vtrn.32         d6,  d7
287
288 5:      vmull.u8        q8,  d4,  d0
289         vmull.u8        q9,  d6,  d0
290         subs            r3,  r3,  #2
291         vld1.64         {d4}, [r1], r2
292         vext.8          d5,  d4,  d5,  #1
293         vtrn.32         d4,  d5
294         vadd.i16        d16, d16, d17
295         vadd.i16        d17, d18, d19
296         pld             [r1]
297         vrshrn.u16      d16, q8,  #6
298 .if \avg
299         vld1.32         {d20[0]}, [lr,:32], r2
300         vld1.32         {d20[1]}, [lr,:32], r2
301         vrhadd.u8       d16, d16, d20
302 .endif
303         vld1.64         {d6}, [r1], r2
304         vext.8          d7,  d6,  d7,  #1
305         vtrn.32         d6,  d7
306         pld             [r1]
307         vst1.32         {d16[0]}, [r0,:32], r2
308         vst1.32         {d16[1]}, [r0,:32], r2
309         bgt             5b
310
311         pop             {r4-r7, pc}
312 .endm
313
314 .text
315 .align
316
@ Exported chroma MC entry points: thin instantiations of the two macros
@ above, in put (overwrite dst) and avg (average with dst) flavours.
317 function ff_put_h264_chroma_mc8_neon, export=1
318         h264_chroma_mc8
319         .endfunc
320
321 function ff_avg_h264_chroma_mc8_neon, export=1
322         h264_chroma_mc8 avg=1
323         .endfunc
324
325 function ff_put_h264_chroma_mc4_neon, export=1
326         h264_chroma_mc4
327         .endfunc
328
329 function ff_avg_h264_chroma_mc4_neon, export=1
330         h264_chroma_mc4 avg=1
331         .endfunc
332
333 /* H.264 loop filter */
334
@ Common loop-filter prologue: r2 = alpha, r3 = beta, [sp] = tc0 pointer.
@ Loads the four tc0 bytes into d24[0] and returns to the caller's caller
@ early when no filtering is needed.
335 .macro  h264_loop_filter_start
336         ldr             ip,  [sp]
@ Z flag set if alpha == 0 or beta == 0 -> nothing to do
337         tst             r2,  r2
338         ldr             ip,  [ip]
339         tstne           r3,  r3
340         vmov.32         d24[0], ip
@ AND-fold the four tc0 bytes: the result is negative (bit 31 set) only
@ when every tc0 byte has its sign bit set, i.e. all tc0 < 0 -> skip
341         and             ip,  ip,  ip,  lsl #16
342         bxeq            lr
343         ands            ip,  ip,  ip,  lsl #8
344         bxlt            lr
345 .endm
346
@ Save callee-saved NEON regs d8-d15 (AAPCS) on a 16-byte-aligned stack.
@ ip keeps the original misalignment + 32 so align_pop_regs can restore sp.
347 .macro  align_push_regs
348         and             ip,  sp,  #15
349         add             ip,  ip,  #32
350         sub             sp,  sp,  ip
351         vst1.64         {d12-d15}, [sp,:128]
352         sub             sp,  sp,  #32
353         vst1.64         {d8-d11},  [sp,:128]
354 .endm
355
@ Counterpart of align_push_regs: reload d8-d15 and undo the sp adjustment
@ recorded in ip (must still hold the value computed by align_push_regs).
356 .macro  align_pop_regs
357         vld1.64         {d8-d11},  [sp,:128]!
358         vld1.64         {d12-d15}, [sp,:128], ip
359 .endm
360
@ Luma deblocking filter core for 16 pixels.
@ In:  q10=p2 q9=p1 q8=p0  q0=q0 q1=q1 q2=q2, d24[0]=4 tc0 bytes,
@      r2=alpha, r3=beta.
@ Out: q8=p0', q0=q0', q4=p1' (via vbsl), q5=q1' (via vbsl) — the callers
@      store d8/d9, d16/d17, d0/d1, d10/d11.
361 .macro  h264_loop_filter_luma
362         vdup.8          q11, r2         @ alpha
@ expand each tc0 byte to cover its 4 pixels: widen twice, then vsli
@ replicates the value across the 16-bit/32-bit lanes
363         vmovl.u8        q12, d24
364         vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
365         vmovl.u16       q12, d24
366         vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
367         vsli.16         q12, q12, #8
368         vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
369         vsli.32         q12, q12, #16
370         vclt.u8         q6,  q6,  q11   @ < alpha
371         vdup.8          q11, r3         @ beta
372         vclt.s8         q7,  q12, #0
373         vclt.u8         q14, q14, q11   @ < beta
374         vclt.u8         q15, q15, q11   @ < beta
@ q6 becomes the per-pixel filter mask (alpha/beta tests, tc0 >= 0)
375         vbic            q6,  q6,  q7
376         vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
377         vand            q6,  q6,  q14
378         vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
379         vclt.u8         q4,  q4,  q11   @ < beta
380         vand            q6,  q6,  q15
381         vclt.u8         q5,  q5,  q11   @ < beta
@ q4/q5: masks selecting p1/q1 filtering (and tc increments via q6 below)
382         vand            q4,  q4,  q6
383         vand            q5,  q5,  q6
384         vand            q12, q12, q6
@ p1'/q1' candidates clamped to [p1-tc0, p1+tc0] / [q1-tc0, q1+tc0]
385         vrhadd.u8       q14, q8,  q0
386         vsub.i8         q6,  q12, q4
387         vqadd.u8        q7,  q9,  q12
388         vhadd.u8        q10, q10, q14
389         vsub.i8         q6,  q6,  q5
390         vhadd.u8        q14, q2,  q14
391         vmin.u8         q7,  q7,  q10
392         vqsub.u8        q11, q9,  q12
393         vqadd.u8        q2,  q1,  q12
394         vmax.u8         q7,  q7,  q11
395         vqsub.u8        q11, q1,  q12
396         vmin.u8         q14, q2,  q14
397         vmovl.u8        q2,  d0
398         vmax.u8         q14, q14, q11
399         vmovl.u8        q10, d1
@ delta = clip( ((q0-p0)<<2 + p1 - q1 + 4) >> 3 , -tc, tc )
400         vsubw.u8        q2,  q2,  d16
401         vsubw.u8        q10, q10, d17
402         vshl.i16        q2,  q2,  #2
403         vshl.i16        q10, q10, #2
404         vaddw.u8        q2,  q2,  d18
405         vaddw.u8        q10, q10, d19
406         vsubw.u8        q2,  q2,  d2
407         vsubw.u8        q10, q10, d3
408         vrshrn.i16      d4,  q2,  #3
409         vrshrn.i16      d5,  q10, #3
@ select filtered or original p1/q1 per mask
410         vbsl            q4,  q7,  q9
411         vbsl            q5,  q14, q1
412         vneg.s8         q7,  q6
413         vmovl.u8        q14, d16
414         vmin.s8         q2,  q2,  q6
415         vmovl.u8        q6,  d17
416         vmax.s8         q2,  q2,  q7
417         vmovl.u8        q11, d0
418         vmovl.u8        q12, d1
@ apply +/- delta to p0/q0 with saturating narrow back to u8
419         vaddw.s8        q14, q14, d4
420         vaddw.s8        q6,  q6,  d5
421         vsubw.s8        q11, q11, d4
422         vsubw.s8        q12, q12, d5
423         vqmovun.s16     d16, q14
424         vqmovun.s16     d17, q6
425         vqmovun.s16     d0,  q11
426         vqmovun.s16     d1,  q12
427 .endm
428
@ void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride,
@                                      int alpha, int beta, int8_t *tc0)
@ Vertical-edge variant: rows are contiguous, so load q0..q2 below the edge
@ and p0..p2 above it, run the filter, and store the four changed rows.
429 function ff_h264_v_loop_filter_luma_neon, export=1
430         h264_loop_filter_start
431
432         vld1.64         {d0, d1},  [r0,:128], r1
433         vld1.64         {d2, d3},  [r0,:128], r1
434         vld1.64         {d4, d5},  [r0,:128], r1
@ step back 6 rows to reach p2
435         sub             r0,  r0,  r1, lsl #2
436         sub             r0,  r0,  r1, lsl #1
437         vld1.64         {d20,d21}, [r0,:128], r1
438         vld1.64         {d18,d19}, [r0,:128], r1
439         vld1.64         {d16,d17}, [r0,:128], r1
440
441         align_push_regs
442
443         h264_loop_filter_luma
444
@ write back p1' (q4), p0' (q8), q0' (q0), q1' (q5)
445         sub             r0,  r0,  r1, lsl #1
446         vst1.64         {d8, d9},  [r0,:128], r1
447         vst1.64         {d16,d17}, [r0,:128], r1
448         vst1.64         {d0, d1},  [r0,:128], r1
449         vst1.64         {d10,d11}, [r0,:128]
450
451         align_pop_regs
452         bx              lr
453         .endfunc
454
@ void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride,
@                                      int alpha, int beta, int8_t *tc0)
@ Horizontal-edge variant: loads 16 rows of 8 pixels straddling the edge,
@ transposes so the filter sees them as columns, filters, transposes back
@ and stores.  q2/q10 are spilled around the filter because it clobbers
@ them internally.
455 function ff_h264_h_loop_filter_luma_neon, export=1
456         h264_loop_filter_start
457
458         sub             r0,  r0,  #4
459         vld1.64         {d6},  [r0], r1
460         vld1.64         {d20}, [r0], r1
461         vld1.64         {d18}, [r0], r1
462         vld1.64         {d16}, [r0], r1
463         vld1.64         {d0},  [r0], r1
464         vld1.64         {d2},  [r0], r1
465         vld1.64         {d4},  [r0], r1
466         vld1.64         {d26}, [r0], r1
467         vld1.64         {d7},  [r0], r1
468         vld1.64         {d21}, [r0], r1
469         vld1.64         {d19}, [r0], r1
470         vld1.64         {d17}, [r0], r1
471         vld1.64         {d1},  [r0], r1
472         vld1.64         {d3},  [r0], r1
473         vld1.64         {d5},  [r0], r1
474         vld1.64         {d27}, [r0], r1
475
476         transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13
477
478         align_push_regs
@ spill q2 (q2 pixels) and q10 (p2 pixels): the filter overwrites both
479         sub             sp,  sp,  #16
480         vst1.64         {d4, d5},  [sp,:128]
481         sub             sp,  sp,  #16
482         vst1.64         {d20,d21}, [sp,:128]
483
484         h264_loop_filter_luma
485
486         vld1.64         {d20,d21}, [sp,:128]!
487         vld1.64         {d4, d5},  [sp,:128]!
488
@ back to row order; q4/q5 now carry p1'/q1' in place of q9/q1
489         transpose_8x8   q3, q10, q4, q8, q0, q5, q2, q13
490
491         sub             r0,  r0,  r1, lsl #4
492         vst1.64         {d6},  [r0], r1
493         vst1.64         {d20}, [r0], r1
494         vst1.64         {d8},  [r0], r1
495         vst1.64         {d16}, [r0], r1
496         vst1.64         {d0},  [r0], r1
497         vst1.64         {d10}, [r0], r1
498         vst1.64         {d4},  [r0], r1
499         vst1.64         {d26}, [r0], r1
500         vst1.64         {d7},  [r0], r1
501         vst1.64         {d21}, [r0], r1
502         vst1.64         {d9},  [r0], r1
503         vst1.64         {d17}, [r0], r1
504         vst1.64         {d1},  [r0], r1
505         vst1.64         {d11}, [r0], r1
506         vst1.64         {d5},  [r0], r1
507         vst1.64         {d27}, [r0], r1
508
509         align_pop_regs
510         bx              lr
511         .endfunc
512
@ Chroma deblocking filter core for 8 pixels.
@ In:  d18=p1 d16=p0 d0=q0 d2=q1, d24=tc0 bytes, r2=alpha, r3=beta.
@ Out: d16=p0', d0=q0'.  Only p0/q0 are modified (chroma never filters p1/q1).
513 .macro  h264_loop_filter_chroma
514         vdup.8          d22, r2         @ alpha
515         vmovl.u8        q12, d24
516         vabd.u8         d26, d16, d0    @ abs(p0 - q0)
517         vmovl.u8        q2,  d0
518         vabd.u8         d28, d18, d16   @ abs(p1 - p0)
@ delta = clip( ((q0-p0)<<2 + p1 - q1 + 4) >> 3 , -tc, tc )
519         vsubw.u8        q2,  q2,  d16
@ replicate each widened tc0 byte across its 2-pixel span
520         vsli.16         d24, d24, #8
521         vshl.i16        q2,  q2,  #2
522         vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
523         vaddw.u8        q2,  q2,  d18
524         vclt.u8         d26, d26, d22   @ < alpha
525         vsubw.u8        q2,  q2,  d2
526         vdup.8          d22, r3         @ beta
527         vclt.s8         d25, d24, #0
528         vrshrn.i16      d4,  q2,  #3
529         vclt.u8         d28, d28, d22   @ < beta
@ d26 = filter mask (alpha/beta thresholds, tc0 >= 0)
530         vbic            d26, d26, d25
531         vclt.u8         d30, d30, d22   @ < beta
532         vand            d26, d26, d28
533         vneg.s8         d25, d24
534         vand            d26, d26, d30
535         vmin.s8         d4,  d4,  d24
536         vmovl.u8        q14, d16
537         vand            d4,  d4,  d26
538         vmax.s8         d4,  d4,  d25
539         vmovl.u8        q11, d0
@ p0 += delta, q0 -= delta, saturating narrow back to u8
540         vaddw.s8        q14, q14, d4
541         vsubw.s8        q11, q11, d4
542         vqmovun.s16     d16, q14
543         vqmovun.s16     d0,  q11
544 .endm
545
@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride,
@                                        int alpha, int beta, int8_t *tc0)
@ Vertical chroma edge: load p1,p0,q0,q1 rows, filter, store p0'/q0'.
546 function ff_h264_v_loop_filter_chroma_neon, export=1
547         h264_loop_filter_start
548
549         sub             r0,  r0,  r1, lsl #1
550         vld1.64         {d18}, [r0,:64], r1
551         vld1.64         {d16}, [r0,:64], r1
552         vld1.64         {d0},  [r0,:64], r1
553         vld1.64         {d2},  [r0,:64]
554
555         h264_loop_filter_chroma
556
557         sub             r0,  r0,  r1, lsl #1
558         vst1.64         {d16}, [r0,:64], r1
559         vst1.64         {d0},  [r0,:64], r1
560
561         bx              lr
562         .endfunc
563
@ void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride,
@                                        int alpha, int beta, int8_t *tc0)
@ Horizontal chroma edge: gather 8 rows of 4 pixels, 4x4-transpose them with
@ vtrn so the filter works on columns, filter, transpose back and scatter.
564 function ff_h264_h_loop_filter_chroma_neon, export=1
565         h264_loop_filter_start
566
567         sub             r0,  r0,  #2
568         vld1.32         {d18[0]}, [r0], r1
569         vld1.32         {d16[0]}, [r0], r1
570         vld1.32         {d0[0]},  [r0], r1
571         vld1.32         {d2[0]},  [r0], r1
572         vld1.32         {d18[1]}, [r0], r1
573         vld1.32         {d16[1]}, [r0], r1
574         vld1.32         {d0[1]},  [r0], r1
575         vld1.32         {d2[1]},  [r0], r1
576
577         vtrn.16         d18, d0
578         vtrn.16         d16, d2
579         vtrn.8          d18, d16
580         vtrn.8          d0,  d2
581
582         h264_loop_filter_chroma
583
@ same vtrn network is its own inverse
584         vtrn.16         d18, d0
585         vtrn.16         d16, d2
586         vtrn.8          d18, d16
587         vtrn.8          d0,  d2
588
589         sub             r0,  r0,  r1, lsl #3
590         vst1.32         {d18[0]}, [r0], r1
591         vst1.32         {d16[0]}, [r0], r1
592         vst1.32         {d0[0]},  [r0], r1
593         vst1.32         {d2[0]},  [r0], r1
594         vst1.32         {d18[1]}, [r0], r1
595         vst1.32         {d16[1]}, [r0], r1
596         vst1.32         {d0[1]},  [r0], r1
597         vst1.32         {d2[1]},  [r0], r1
598
599         bx              lr
600         .endfunc
601
602 /* H.264 qpel MC */
603
@ Load the 6-tap qpel filter constants into d6: \r = (20 << 16) | 5, so
@ after vmov d6[0]=5 and d6[1]=20 as 16-bit scalars for vmla/vmls .i16.
@ Clobbers \r.
604 .macro  lowpass_const r
605         movw            \r,  #5
606         movt            \r,  #20
607         vmov.32         d6[0], \r
608 .endm
609
@ Horizontal 6-tap lowpass (1,-5,20,20,-5,1) over two 8-pixel rows:
@ row A in \r0:\r1 (13 bytes), row B in \r2:\r3; results to \d0/\d1.
@ narrow=1: round+saturate to u8 with >>5.  narrow=0: leave 16-bit sums
@ in \d0/\d1 (q-regs) for a following vertical pass.
@ Clobbers q1, q2, q9, q10, d30, d31 (and q0/q8 when narrow=1).
610 .macro  lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
611 .if \narrow
612 t0 .req q0
613 t1 .req q8
614 .else
615 t0 .req \d0
616 t1 .req \d1
617 .endif
@ taps +2/+3 (x20), +1/+4 (x-5), 0/+5 (x1) via vext of the 13-byte row
618         vext.8          d2,  \r0, \r1, #2
619         vext.8          d3,  \r0, \r1, #3
620         vaddl.u8        q1,  d2,  d3
621         vext.8          d4,  \r0, \r1, #1
622         vext.8          d5,  \r0, \r1, #4
623         vaddl.u8        q2,  d4,  d5
624         vext.8          d30, \r0, \r1, #5
625         vaddl.u8        t0,  \r0, d30
626         vext.8          d18, \r2, \r3, #2
627         vmla.i16        t0,  q1,  d6[1]     @ += 20*(c+d)
628         vext.8          d19, \r2, \r3, #3
629         vaddl.u8        q9,  d18, d19
630         vext.8          d20, \r2, \r3, #1
631         vmls.i16        t0,  q2,  d6[0]     @ -= 5*(b+e)
632         vext.8          d21, \r2, \r3, #4
633         vaddl.u8        q10, d20, d21
634         vext.8          d31, \r2, \r3, #5
635         vaddl.u8        t1,  \r2, d31
636         vmla.i16        t1,  q9,  d6[1]
637         vmls.i16        t1,  q10, d6[0]
638 .if \narrow
@ (sum + 16) >> 5 with unsigned saturation
639         vqrshrun.s16    \d0, t0,  #5
640         vqrshrun.s16    \d1, t1,  #5
641 .endif
642 .unreq t0
643 .unreq t1
644 .endm
645
@ Single-row variant of lowpass_8: 6-tap filter on one 13-byte row
@ \r0:\r1, result in \d0 (u8 when narrow=1, else 16-bit sums).
@ Clobbers q1, q2, d30 (and q0 when narrow=1).
646 .macro  lowpass_8_1 r0, r1, d0, narrow=1
647 .if \narrow
648 t0 .req q0
649 .else
650 t0 .req \d0
651 .endif
652         vext.8          d2,  \r0, \r1, #2
653         vext.8          d3,  \r0, \r1, #3
654         vaddl.u8        q1,  d2,  d3
655         vext.8          d4,  \r0, \r1, #1
656         vext.8          d5,  \r0, \r1, #4
657         vaddl.u8        q2,  d4,  d5
658         vext.8          d30, \r0, \r1, #5
659         vaddl.u8        t0,  \r0, d30
660         vmla.i16        t0,  q1,  d6[1]     @ += 20*(c+d)
661         vmls.i16        t0,  q2,  d6[0]     @ -= 5*(b+e)
662 .if \narrow
663         vqrshrun.s16    \d0, t0,  #5
664 .endif
665 .unreq t0
666 .endm
667
@ 16-bit (second-pass) 6-tap lowpass for the hv case: input is a row of
@ 13 16-bit intermediate values in \r0:\r1; the multiplies by 20 and 5 are
@ done shift-and-add in 32 bits ((x<<4)+(x<<2) and x+(x<<2)), final
@ rounding is >>10, saturated to u8 in \d.
@ \l0/\h0 and \l1/\h1 are the d-halves of \r0/\r1 (taps 0 and +5).
@ Clobbers q0-q3, q8, q9, q15, and \r1.
668 .macro  lowpass_8.16 r0, r1, l0, h0, l1, h1, d
669         vext.16         q1,  \r0, \r1, #2
670         vext.16         q0,  \r0, \r1, #3
671         vaddl.s16       q9,  d2,  d0        @ low  half of (c+d)
672         vext.16         q2,  \r0, \r1, #1
673         vaddl.s16       q1,  d3,  d1        @ high half of (c+d)
674         vext.16         q3,  \r0, \r1, #4
675         vaddl.s16       q10, d4,  d6        @ low  half of (b+e)
676         vext.16         \r1, \r0, \r1, #5
677         vaddl.s16       q2,  d5,  d7        @ high half of (b+e)
678         vaddl.s16       q0,  \h0, \h1       @ (a+f), high
679         vaddl.s16       q8,  \l0, \l1       @ (a+f), low
680
@ q9 *= 20, q10 *= 5 (low halves)
681         vshl.i32        q3,  q9,  #4
682         vshl.i32        q9,  q9,  #2
683         vshl.i32        q15, q10, #2
684         vadd.i32        q9,  q9,  q3
685         vadd.i32        q10, q10, q15
686
@ q1 *= 20, q2 *= 5 (high halves)
687         vshl.i32        q3,  q1,  #4
688         vshl.i32        q1,  q1,  #2
689         vshl.i32        q15, q2,  #2
690         vadd.i32        q1,  q1,  q3
691         vadd.i32        q2,  q2,  q15
692
693         vadd.i32        q9,  q9,  q8
694         vsub.i32        q9,  q9,  q10
695
696         vadd.i32        q1,  q1,  q0
697         vsub.i32        q1,  q1,  q2
698
@ (sum + 512) >> 10, then saturate to u8
699         vrshrn.s32      d18, q9,  #10
700         vrshrn.s32      d19, q1,  #10
701
702         vqmovun.s16     \d,  q9
703 .endm
704
@ 16-wide horizontal lowpass producing packed 8-wide output (dst stride
@ r3=8): runs the 8-wide helper on the left half, rewinds src and shifts
@ right by 8, then tail-calls it for the right half (r4 holds return addr).
705 function put_h264_qpel16_h_lowpass_neon_packed
706         mov             r4,  lr
707         mov             ip,  #16
708         mov             r3,  #8
709         bl              put_h264_qpel8_h_lowpass_neon
710         sub             r1,  r1,  r2, lsl #4
711         add             r1,  r1,  #8
712         mov             ip,  #16
713         mov             lr,  r4
714         b               put_h264_qpel8_h_lowpass_neon
715         .endfunc
716
@ 16x16 horizontal lowpass: left 8 columns via the 8-wide helper, then
@ rewind both pointers, shift right by 8 and fall through into
@ put_h264_qpel8_h_lowpass_neon (no terminating branch — deliberate).
717 function put_h264_qpel16_h_lowpass_neon
718         push            {lr}
719         mov             ip,  #16
720         bl              put_h264_qpel8_h_lowpass_neon
721         sub             r0,  r0,  r3, lsl #4
722         sub             r1,  r1,  r2, lsl #4
723         add             r0,  r0,  #8
724         add             r1,  r1,  #8
725         mov             ip,  #16
726         pop             {lr}
727         .endfunc
728
@ 8-wide horizontal 6-tap lowpass.  r0=dst, r1=src (13 bytes/row read),
@ r2=src stride, r3=dst stride, ip=row count (two rows per iteration).
729 function put_h264_qpel8_h_lowpass_neon
730 1:      vld1.64         {d0, d1},  [r1], r2
731         vld1.64         {d16,d17}, [r1], r2
732         subs            ip,  ip,  #2
733         lowpass_8       d0,  d1,  d16, d17, d0,  d16
734         vst1.64         {d0},  [r0,:64], r3
735         vst1.64         {d16}, [r0,:64], r3
736         bne             1b
737         bx              lr
738         .endfunc
739
@ 16x16 horizontal lowpass averaged with a second source (r3): left half via
@ the 8-wide l2 helper, rewind/offset by 8, fall through for the right half.
740 function put_h264_qpel16_h_lowpass_l2_neon
741         push            {lr}
742         mov             ip,  #16
743         bl              put_h264_qpel8_h_lowpass_l2_neon
744         sub             r0,  r0,  r2, lsl #4
745         sub             r1,  r1,  r2, lsl #4
746         sub             r3,  r3,  r2, lsl #4
747         add             r0,  r0,  #8
748         add             r1,  r1,  #8
749         add             r3,  r3,  #8
750         mov             ip,  #16
751         pop             {lr}
752         .endfunc
753
@ 8-wide horizontal lowpass, result rounding-averaged (vrhadd) with the
@ second source r3.  r0=dst, r1=src, r2=stride, ip=row count.
754 function put_h264_qpel8_h_lowpass_l2_neon
755 1:      vld1.64         {d0, d1},  [r1], r2
756         vld1.64         {d16,d17}, [r1], r2
757         vld1.64         {d28}, [r3], r2
758         vld1.64         {d29}, [r3], r2
759         subs            ip,  ip,  #2
760         lowpass_8       d0,  d1,  d16, d17, d0,  d1
761         vrhadd.u8       q0,  q0,  q14
762         vst1.64         {d0},  [r0,:64], r2
763         vst1.64         {d1},  [r0,:64], r2
764         bne             1b
765         bx              lr
766         .endfunc
767
@ 16x16 vertical lowpass to packed output (dst stride r2=8): four 8x8 tiles,
@ walking src (r1, stride r3) back up between tiles; tail-calls the last.
768 function put_h264_qpel16_v_lowpass_neon_packed
769         mov             r4,  lr
770         mov             r2,  #8
771         bl              put_h264_qpel8_v_lowpass_neon
772         sub             r1,  r1,  r3, lsl #2
773         bl              put_h264_qpel8_v_lowpass_neon
774         sub             r1,  r1,  r3, lsl #4
775         sub             r1,  r1,  r3, lsl #2
776         add             r1,  r1,  #8
777         bl              put_h264_qpel8_v_lowpass_neon
778         sub             r1,  r1,  r3, lsl #2
779         mov             lr,  r4
780         b               put_h264_qpel8_v_lowpass_neon
781         .endfunc
782
@ 16x16 vertical lowpass: four 8x8 tiles via the 8-wide helper; the last
@ tile is reached by falling through into put_h264_qpel8_v_lowpass_neon
@ after restoring lr (deliberate fallthrough, no final branch).
783 function put_h264_qpel16_v_lowpass_neon
784         mov             r4,  lr
785         bl              put_h264_qpel8_v_lowpass_neon
786         sub             r1,  r1,  r3, lsl #2
787         bl              put_h264_qpel8_v_lowpass_neon
788         sub             r0,  r0,  r2, lsl #4
789         add             r0,  r0,  #8
790         sub             r1,  r1,  r3, lsl #4
791         sub             r1,  r1,  r3, lsl #2
792         add             r1,  r1,  #8
793         bl              put_h264_qpel8_v_lowpass_neon
794         sub             r1,  r1,  r3, lsl #2
795         mov             lr,  r4
796         .endfunc
797
@ 8x8 vertical 6-tap lowpass.  r0=dst (stride r2), r1=src (stride r3).
@ Loads 13 rows, transposes so columns become rows, reuses the horizontal
@ lowpass_8 macro, transposes back and stores 8 rows.
@ Uses d8-d15: callers must save them (see vpush in the mc entry points).
798 function put_h264_qpel8_v_lowpass_neon
799         vld1.64         {d8},  [r1], r3
800         vld1.64         {d10}, [r1], r3
801         vld1.64         {d12}, [r1], r3
802         vld1.64         {d14}, [r1], r3
803         vld1.64         {d22}, [r1], r3
804         vld1.64         {d24}, [r1], r3
805         vld1.64         {d26}, [r1], r3
806         vld1.64         {d28}, [r1], r3
807         vld1.64         {d9},  [r1], r3
808         vld1.64         {d11}, [r1], r3
809         vld1.64         {d13}, [r1], r3
810         vld1.64         {d15}, [r1], r3
811         vld1.64         {d23}, [r1]
812
813         transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
814         lowpass_8       d8,  d9,  d10, d11, d8,  d10
815         lowpass_8       d12, d13, d14, d15, d12, d14
816         lowpass_8       d22, d23, d24, d25, d22, d24
817         lowpass_8       d26, d27, d28, d29, d26, d28
818         transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28
819
820         vst1.64         {d8},  [r0,:64], r2
821         vst1.64         {d10}, [r0,:64], r2
822         vst1.64         {d12}, [r0,:64], r2
823         vst1.64         {d14}, [r0,:64], r2
824         vst1.64         {d22}, [r0,:64], r2
825         vst1.64         {d24}, [r0,:64], r2
826         vst1.64         {d26}, [r0,:64], r2
827         vst1.64         {d28}, [r0,:64], r2
828
829         bx              lr
830         .endfunc
831
@ 16x16 vertical lowpass averaged with second source (ip): four 8x8 tiles,
@ last tile via fallthrough into put_h264_qpel8_v_lowpass_l2_neon.
832 function put_h264_qpel16_v_lowpass_l2_neon
833         mov             r4,  lr
834         bl              put_h264_qpel8_v_lowpass_l2_neon
835         sub             r1,  r1,  r3, lsl #2
836         bl              put_h264_qpel8_v_lowpass_l2_neon
837         sub             r0,  r0,  r3, lsl #4
838         sub             ip,  ip,  r2, lsl #4
839         add             r0,  r0,  #8
840         add             ip,  ip,  #8
841         sub             r1,  r1,  r3, lsl #4
842         sub             r1,  r1,  r3, lsl #2
843         add             r1,  r1,  #8
844         bl              put_h264_qpel8_v_lowpass_l2_neon
845         sub             r1,  r1,  r3, lsl #2
846         mov             lr,  r4
847         .endfunc
848
@ 8x8 vertical lowpass rounding-averaged with a second source.
@ r0=dst (stride r3), r1=src (stride r3 for loads), ip=second source
@ (stride r2).  Same transpose/filter/transpose scheme as the non-l2
@ version, then vrhadd with the rows loaded from ip.  Uses d8-d15.
849 function put_h264_qpel8_v_lowpass_l2_neon
850         vld1.64         {d8},  [r1], r3
851         vld1.64         {d10}, [r1], r3
852         vld1.64         {d12}, [r1], r3
853         vld1.64         {d14}, [r1], r3
854         vld1.64         {d22}, [r1], r3
855         vld1.64         {d24}, [r1], r3
856         vld1.64         {d26}, [r1], r3
857         vld1.64         {d28}, [r1], r3
858         vld1.64         {d9},  [r1], r3
859         vld1.64         {d11}, [r1], r3
860         vld1.64         {d13}, [r1], r3
861         vld1.64         {d15}, [r1], r3
862         vld1.64         {d23}, [r1]
863
864         transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
865         lowpass_8       d8,  d9,  d10, d11, d8,  d9
866         lowpass_8       d12, d13, d14, d15, d12, d13
867         lowpass_8       d22, d23, d24, d25, d22, d23
868         lowpass_8       d26, d27, d28, d29, d26, d27
869         transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27
870
871         vld1.64         {d0},  [ip], r2
872         vld1.64         {d1},  [ip], r2
873         vld1.64         {d2},  [ip], r2
874         vld1.64         {d3},  [ip], r2
875         vld1.64         {d4},  [ip], r2
876         vrhadd.u8       q0,  q0,  q4
877         vld1.64         {d5},  [ip], r2
878         vrhadd.u8       q1,  q1,  q6
879         vld1.64         {d10}, [ip], r2
880         vrhadd.u8       q2,  q2,  q11
881         vld1.64         {d11}, [ip], r2
882
883         vst1.64         {d0},  [r0,:64], r3
884         vst1.64         {d1},  [r0,:64], r3
885         vrhadd.u8       q5,  q5,  q13
886         vst1.64         {d2},  [r0,:64], r3
887         vst1.64         {d3},  [r0,:64], r3
888         vst1.64         {d4},  [r0,:64], r3
889         vst1.64         {d5},  [r0,:64], r3
890         vst1.64         {d10}, [r0,:64], r3
891         vst1.64         {d11}, [r0,:64], r3
892
893         bx              lr
894         .endfunc
895
@ Core of the 8x8 h+v (center) qpel filter.  r1=src (stride r3), r4=scratch
@ buffer.  Pass 1: horizontal lowpass of 13 rows, 16-bit results stored to
@ the scratch at r4.  Then the 16-bit rows are reloaded (walking backwards,
@ ip = -16), transposed with swap4/transpose16_4x4, and run through the
@ 16-bit vertical filter lowpass_8.16.  Final 8x8 u8 result left in d8-d15
@ (transposed back to row order).  Uses d8-d15: callers must preserve them.
896 function put_h264_qpel8_hv_lowpass_neon_top
897         lowpass_const   ip
898         mov             ip,  #12
899 1:      vld1.64         {d0, d1},  [r1], r3
900         vld1.64         {d16,d17}, [r1], r3
901         subs            ip,  ip,  #2
902         lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
903         vst1.64         {d22-d25}, [r4,:128]!
904         bne             1b
905
@ 13th row
906         vld1.64         {d0, d1},  [r1]
907         lowpass_8_1     d0,  d1,  q12, narrow=0
908
909         mov             ip,  #-16
910         add             r4,  r4,  ip
911         vld1.64         {d30,d31}, [r4,:128], ip
912         vld1.64         {d20,d21}, [r4,:128], ip
913         vld1.64         {d18,d19}, [r4,:128], ip
914         vld1.64         {d16,d17}, [r4,:128], ip
915         vld1.64         {d14,d15}, [r4,:128], ip
916         vld1.64         {d12,d13}, [r4,:128], ip
917         vld1.64         {d10,d11}, [r4,:128], ip
918         vld1.64         {d8, d9},  [r4,:128], ip
919         vld1.64         {d6, d7},  [r4,:128], ip
920         vld1.64         {d4, d5},  [r4,:128], ip
921         vld1.64         {d2, d3},  [r4,:128], ip
922         vld1.64         {d0, d1},  [r4,:128]
923
@ transpose the 16-bit intermediate block (columns become rows)
924         swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
925         transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7
926
927         swap4           d17, d19, d21, d31, d24, d26, d28, d22
928         transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11
929
@ spill the upper rows back to scratch; they are re-read pairwise below
930         vst1.64         {d30,d31}, [r4,:128]!
931         vst1.64         {d6, d7},  [r4,:128]!
932         vst1.64         {d20,d21}, [r4,:128]!
933         vst1.64         {d4, d5},  [r4,:128]!
934         vst1.64         {d18,d19}, [r4,:128]!
935         vst1.64         {d2, d3},  [r4,:128]!
936         vst1.64         {d16,d17}, [r4,:128]!
937         vst1.64         {d0, d1},  [r4,:128]
938
939         lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
940         lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
941         lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
942         lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11
943
944         vld1.64         {d16,d17}, [r4,:128], ip
945         vld1.64         {d30,d31}, [r4,:128], ip
946         lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
947         vld1.64         {d16,d17}, [r4,:128], ip
948         vld1.64         {d30,d31}, [r4,:128], ip
949         lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
950         vld1.64         {d16,d17}, [r4,:128], ip
951         vld1.64         {d30,d31}, [r4,:128], ip
952         lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
953         vld1.64         {d16,d17}, [r4,:128], ip
954         vld1.64         {d30,d31}, [r4,:128]
955         lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15
956
957         transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11
958
959         bx              lr
960         .endfunc
961
@ 8x8 center (h+v) qpel: run the _top core, then store the d12-d15/d8-d11
@ result rows to dst (r0, stride r2).  r10 preserves lr across the bl.
962 function put_h264_qpel8_hv_lowpass_neon
963         mov             r10, lr
964         bl              put_h264_qpel8_hv_lowpass_neon_top
965         vst1.64         {d12}, [r0,:64], r2
966         vst1.64         {d13}, [r0,:64], r2
967         vst1.64         {d14}, [r0,:64], r2
968         vst1.64         {d15}, [r0,:64], r2
969         vst1.64         {d8},  [r0,:64], r2
970         vst1.64         {d9},  [r0,:64], r2
971         vst1.64         {d10}, [r0,:64], r2
972         vst1.64         {d11}, [r0,:64], r2
973
974         mov             lr,  r10
975         bx              lr
976         .endfunc
977
@ 8x8 center qpel averaged with a second source: run the _top core, then
@ vrhadd the d8-d15 result with 8 rows read contiguously from r2 and store
@ to dst (r0, stride r3).  r10 preserves lr across the bl.
978 function put_h264_qpel8_hv_lowpass_l2_neon
979         mov             r10, lr
980         bl              put_h264_qpel8_hv_lowpass_neon_top
981
982         vld1.64         {d0, d1},  [r2,:128]!
983         vld1.64         {d2, d3},  [r2,:128]!
984         vrhadd.u8       q0,  q0,  q6
985         vld1.64         {d4, d5},  [r2,:128]!
986         vrhadd.u8       q1,  q1,  q7
987         vld1.64         {d6, d7},  [r2,:128]!
988         vrhadd.u8       q2,  q2,  q4
989
990         vst1.64         {d0},  [r0,:64], r3
991         vrhadd.u8       q3,  q3,  q5
992         vst1.64         {d1},  [r0,:64], r3
993         vst1.64         {d2},  [r0,:64], r3
994         vst1.64         {d3},  [r0,:64], r3
995         vst1.64         {d4},  [r0,:64], r3
996         vst1.64         {d5},  [r0,:64], r3
997         vst1.64         {d6},  [r0,:64], r3
998         vst1.64         {d7},  [r0,:64], r3
999
1000         mov             lr,  r10
1001         bx              lr
1002         .endfunc
1003
@ 16x16 center qpel: four 8x8 tiles via put_h264_qpel8_hv_lowpass_neon,
@ tail-calling the last one.  r9 preserves lr.
1004 function put_h264_qpel16_hv_lowpass_neon
1005         mov             r9,  lr
1006         bl              put_h264_qpel8_hv_lowpass_neon
1007         sub             r1,  r1,  r3, lsl #2
1008         bl              put_h264_qpel8_hv_lowpass_neon
1009         sub             r1,  r1,  r3, lsl #4
1010         sub             r1,  r1,  r3, lsl #2
1011         add             r1,  r1,  #8
1012         sub             r0,  r0,  r2, lsl #4
1013         add             r0,  r0,  #8
1014         bl              put_h264_qpel8_hv_lowpass_neon
1015         sub             r1,  r1,  r3, lsl #2
1016         mov             lr,  r9
1017         b               put_h264_qpel8_hv_lowpass_neon
1018         .endfunc
1019
@ 16x16 center qpel with l2 averaging: second source sits 256 bytes below
@ the scratch pointer r4; four 8x8 tiles, last via tail call.  r9 holds lr.
1020 function put_h264_qpel16_hv_lowpass_l2_neon
1021         mov             r9,  lr
1022         sub             r2,  r4,  #256
1023         bl              put_h264_qpel8_hv_lowpass_l2_neon
1024         sub             r1,  r1,  r3, lsl #2
1025         bl              put_h264_qpel8_hv_lowpass_l2_neon
1026         sub             r1,  r1,  r3, lsl #4
1027         sub             r1,  r1,  r3, lsl #2
1028         add             r1,  r1,  #8
1029         sub             r0,  r0,  r3, lsl #4
1030         add             r0,  r0,  #8
1031         bl              put_h264_qpel8_hv_lowpass_l2_neon
1032         sub             r1,  r1,  r3, lsl #2
1033         mov             lr,  r9
1034         b               put_h264_qpel8_hv_lowpass_l2_neon
1035         .endfunc
1036
@ Exported 8x8 qpel MC entry points ff_put_h264_qpel8_mcXY_neon
@ (X = horizontal quarter-pel phase, Y = vertical).  Each sets up source
@ offsets, scratch space and the lowpass constants, then branches into the
@ shared lowpass helpers above.  The mcX1/mcX3 pairs share internal labels
@ (put_h264_qpel8_mc01/mc11/mc21/mc12) after adjusting r1/ip, so these
@ functions must stay together and in this order.
@ d8-d15 are saved with vpush around helpers that use them (AAPCS).
1037 function ff_put_h264_qpel8_mc10_neon, export=1
1038         lowpass_const   r3
1039         mov             r3,  r1
1040         sub             r1,  r1,  #2
1041         mov             ip,  #8
1042         b               put_h264_qpel8_h_lowpass_l2_neon
1043         .endfunc
1044
1045 function ff_put_h264_qpel8_mc20_neon, export=1
1046         lowpass_const   r3
1047         sub             r1,  r1,  #2
1048         mov             r3,  r2
1049         mov             ip,  #8
1050         b               put_h264_qpel8_h_lowpass_neon
1051         .endfunc
1052
1053 function ff_put_h264_qpel8_mc30_neon, export=1
1054         lowpass_const   r3
1055         add             r3,  r1,  #1
1056         sub             r1,  r1,  #2
1057         mov             ip,  #8
1058         b               put_h264_qpel8_h_lowpass_l2_neon
1059         .endfunc
1060
1061 function ff_put_h264_qpel8_mc01_neon, export=1
1062         push            {lr}
1063         mov             ip,  r1
1064 put_h264_qpel8_mc01:
1065         lowpass_const   r3
1066         mov             r3,  r2
1067         sub             r1,  r1,  r2, lsl #1
1068         vpush           {d8-d15}
1069         bl              put_h264_qpel8_v_lowpass_l2_neon
1070         vpop            {d8-d15}
1071         pop             {pc}
1072         .endfunc
1073
1074 function ff_put_h264_qpel8_mc11_neon, export=1
1075         push            {r0, r1, r2, lr}
1076 put_h264_qpel8_mc11:
1077         lowpass_const   r3
@ 64-byte stack temp holds the horizontal pass output
1078         sub             sp,  sp,  #64
1079         mov             r0,  sp
1080         sub             r1,  r1,  #2
1081         mov             r3,  #8
1082         mov             ip,  #8
1083         vpush           {d8-d15}
1084         bl              put_h264_qpel8_h_lowpass_neon
@ reload saved dst/src (pushed r0/r1; 64 vpush + 64 temp = sp+128)
1085         ldrd            r0,  [sp, #128]
1086         mov             r3,  r2
1087         add             ip,  sp,  #64
1088         sub             r1,  r1,  r2, lsl #1
1089         mov             r2,  #8
1090         bl              put_h264_qpel8_v_lowpass_l2_neon
1091         vpop            {d8-d15}
@ drop temp (64) + pushed r0,r1,r2 (12) = 76; lr already consumed via pop? no:
@ 64 temp + r0,r1,r2 = 76 bytes, then pop pc returns
1092         add             sp,  sp,  #76
1093         pop             {pc}
1094         .endfunc
1095
1096 function ff_put_h264_qpel8_mc21_neon, export=1
1097         push            {r0, r1, r4, r10, r11, lr}
1098 put_h264_qpel8_mc21:
1099         lowpass_const   r3
@ r11 = frame pointer to the pushed regs; sp is then 16-aligned and a
@ scratch of 8*8 bytes + 16*12 (hv intermediate) is reserved
1100         mov             r11, sp
1101         bic             sp,  sp,  #15
1102         sub             sp,  sp,  #(8*8+16*12)
1103         sub             r1,  r1,  #2
1104         mov             r3,  #8
1105         mov             r0,  sp
1106         mov             ip,  #8
1107         vpush           {d8-d15}
1108         bl              put_h264_qpel8_h_lowpass_neon
1109         mov             r4,  r0
1110         ldrd            r0,  [r11]
1111         sub             r1,  r1,  r2, lsl #1
1112         sub             r1,  r1,  #2
1113         mov             r3,  r2
1114         sub             r2,  r4,  #64
1115         bl              put_h264_qpel8_hv_lowpass_l2_neon
1116         vpop            {d8-d15}
1117         add             sp,  r11, #8
1118         pop             {r4, r10, r11, pc}
1119         .endfunc
1120
1121 function ff_put_h264_qpel8_mc31_neon, export=1
1122         add             r1,  r1,  #1
1123         push            {r0, r1, r2, lr}
1124         sub             r1,  r1,  #1
1125         b               put_h264_qpel8_mc11
1126         .endfunc
1127
1128 function ff_put_h264_qpel8_mc02_neon, export=1
1129         push            {lr}
1130         lowpass_const   r3
1131         sub             r1,  r1,  r2, lsl #1
1132         mov             r3,  r2
1133         vpush           {d8-d15}
1134         bl              put_h264_qpel8_v_lowpass_neon
1135         vpop            {d8-d15}
1136         pop             {pc}
1137         .endfunc
1138
1139 function ff_put_h264_qpel8_mc12_neon, export=1
1140         push            {r0, r1, r4, r10, r11, lr}
1141 put_h264_qpel8_mc12:
1142         lowpass_const   r3
1143         mov             r11, sp
1144         bic             sp,  sp,  #15
1145         sub             sp,  sp,  #(8*8+16*12)
1146         sub             r1,  r1,  r2, lsl #1
1147         mov             r3,  r2
1148         mov             r2,  #8
1149         mov             r0,  sp
1150         vpush           {d8-d15}
1151         bl              put_h264_qpel8_v_lowpass_neon
1152         mov             r4,  r0
1153         ldrd            r0,  [r11]
1154         sub             r1,  r1,  r3, lsl #1
1155         sub             r1,  r1,  #2
1156         sub             r2,  r4,  #64
1157         bl              put_h264_qpel8_hv_lowpass_l2_neon
1158         vpop            {d8-d15}
1159         add             sp,  r11, #8
1160         pop             {r4, r10, r11, pc}
1161         .endfunc
1162
1163 function ff_put_h264_qpel8_mc22_neon, export=1
1164         push            {r4, r10, r11, lr}
1165         mov             r11, sp
1166         bic             sp,  sp,  #15
1167         sub             r1,  r1,  r2, lsl #1
1168         sub             r1,  r1,  #2
1169         mov             r3,  r2
1170         sub             sp,  sp,  #(16*12)
1171         mov             r4,  sp
1172         vpush           {d8-d15}
1173         bl              put_h264_qpel8_hv_lowpass_neon
1174         vpop            {d8-d15}
1175         mov             sp,  r11
1176         pop             {r4, r10, r11, pc}
1177         .endfunc
1178
1179 function ff_put_h264_qpel8_mc32_neon, export=1
1180         push            {r0, r1, r4, r10, r11, lr}
1181         add             r1,  r1,  #1
1182         b               put_h264_qpel8_mc12
1183         .endfunc
1184
1185 function ff_put_h264_qpel8_mc03_neon, export=1
1186         push            {lr}
1187         add             ip,  r1,  r2
1188         b               put_h264_qpel8_mc01
1189         .endfunc
1190
1191 function ff_put_h264_qpel8_mc13_neon, export=1
1192         push            {r0, r1, r2, lr}
1193         add             r1,  r1,  r2
1194         b               put_h264_qpel8_mc11
1195         .endfunc
1196
1197 function ff_put_h264_qpel8_mc23_neon, export=1
1198         push            {r0, r1, r4, r10, r11, lr}
1199         add             r1,  r1,  r2
1200         b               put_h264_qpel8_mc21
1201         .endfunc
1202
1203 function ff_put_h264_qpel8_mc33_neon, export=1
1204         add             r1,  r1,  #1
1205         push            {r0, r1, r2, lr}
1206         add             r1,  r1,  r2
1207         sub             r1,  r1,  #1
1208         b               put_h264_qpel8_mc11
1209         .endfunc
1210
@ Exported 16x16 qpel MC entry points ff_put_h264_qpel16_mcXY_neon.
@ Mirrors the 8x8 dispatch block above, using the 16-wide lowpass helpers
@ and larger scratch areas.  The mcX1/mcX3 pairs share internal labels
@ (put_h264_qpel16_mc01/mc11/mc21/mc12) — keep these together, in order.
1211 function ff_put_h264_qpel16_mc10_neon, export=1
1212         lowpass_const   r3
1213         mov             r3,  r1
1214         sub             r1,  r1,  #2
1215         b               put_h264_qpel16_h_lowpass_l2_neon
1216         .endfunc
1217
1218 function ff_put_h264_qpel16_mc20_neon, export=1
1219         lowpass_const   r3
1220         sub             r1,  r1,  #2
1221         mov             r3,  r2
1222         b               put_h264_qpel16_h_lowpass_neon
1223         .endfunc
1224
1225 function ff_put_h264_qpel16_mc30_neon, export=1
1226         lowpass_const   r3
1227         add             r3,  r1,  #1
1228         sub             r1,  r1,  #2
1229         b               put_h264_qpel16_h_lowpass_l2_neon
1230         .endfunc
1231
1232 function ff_put_h264_qpel16_mc01_neon, export=1
1233         push            {r4, lr}
1234         mov             ip,  r1
1235 put_h264_qpel16_mc01:
1236         lowpass_const   r3
1237         mov             r3,  r2
1238         sub             r1,  r1,  r2, lsl #1
1239         vpush           {d8-d15}
1240         bl              put_h264_qpel16_v_lowpass_l2_neon
1241         vpop            {d8-d15}
1242         pop             {r4, pc}
1243         .endfunc
1244
1245 function ff_put_h264_qpel16_mc11_neon, export=1
1246         push            {r0, r1, r4, lr}
1247 put_h264_qpel16_mc11:
1248         lowpass_const   r3
@ 256-byte stack temp for the 16x16 horizontal pass output
1249         sub             sp,  sp,  #256
1250         mov             r0,  sp
1251         sub             r1,  r1,  #2
1252         mov             r3,  #16
1253         vpush           {d8-d15}
1254         bl              put_h264_qpel16_h_lowpass_neon
@ recover saved dst/src from above the temp + vpush area
1255         add             r0,  sp,  #256
1256         ldrd            r0,  [r0, #64]
1257         mov             r3,  r2
1258         add             ip,  sp,  #64
1259         sub             r1,  r1,  r2, lsl #1
1260         mov             r2,  #16
1261         bl              put_h264_qpel16_v_lowpass_l2_neon
1262         vpop            {d8-d15}
1263         add             sp,  sp,  #(256+8)
1264         pop             {r4, pc}
1265         .endfunc
1266
1267 function ff_put_h264_qpel16_mc21_neon, export=1
1268         push            {r0, r1, r4-r5, r9-r11, lr}
1269 put_h264_qpel16_mc21:
1270         lowpass_const   r3
@ r11 = frame pointer; aligned scratch of 16*16 (h pass) + 16*12 (hv) bytes
1271         mov             r11, sp
1272         bic             sp,  sp,  #15
1273         sub             sp,  sp,  #(16*16+16*12)
1274         sub             r1,  r1,  #2
1275         mov             r0,  sp
1276         vpush           {d8-d15}
1277         bl              put_h264_qpel16_h_lowpass_neon_packed
1278         mov             r4,  r0
1279         ldrd            r0,  [r11]
1280         sub             r1,  r1,  r2, lsl #1
1281         sub             r1,  r1,  #2
1282         mov             r3,  r2
1283         bl              put_h264_qpel16_hv_lowpass_l2_neon
1284         vpop            {d8-d15}
1285         add             sp,  r11, #8
1286         pop             {r4-r5, r9-r11, pc}
1287         .endfunc
1288
1289 function ff_put_h264_qpel16_mc31_neon, export=1
1290         add             r1,  r1,  #1
1291         push            {r0, r1, r4, lr}
1292         sub             r1,  r1,  #1
1293         b               put_h264_qpel16_mc11
1294         .endfunc
1295
1296 function ff_put_h264_qpel16_mc02_neon, export=1
1297         push            {r4, lr}
1298         lowpass_const   r3
1299         sub             r1,  r1,  r2, lsl #1
1300         mov             r3,  r2
1301         vpush           {d8-d15}
1302         bl              put_h264_qpel16_v_lowpass_neon
1303         vpop            {d8-d15}
1304         pop             {r4, pc}
1305         .endfunc
1306
1307 function ff_put_h264_qpel16_mc12_neon, export=1
1308         push            {r0, r1, r4-r5, r9-r11, lr}
1309 put_h264_qpel16_mc12:
1310         lowpass_const   r3
1311         mov             r11, sp
1312         bic             sp,  sp,  #15
1313         sub             sp,  sp,  #(16*16+16*12)
1314         sub             r1,  r1,  r2, lsl #1
1315         mov             r0,  sp
1316         mov             r3,  r2
1317         vpush           {d8-d15}
1318         bl              put_h264_qpel16_v_lowpass_neon_packed
1319         mov             r4,  r0
1320         ldrd            r0,  [r11]
1321         sub             r1,  r1,  r3, lsl #1
1322         sub             r1,  r1,  #2
1323         mov             r2,  r3
1324         bl              put_h264_qpel16_hv_lowpass_l2_neon
1325         vpop            {d8-d15}
1326         add             sp,  r11, #8
1327         pop             {r4-r5, r9-r11, pc}
1328         .endfunc
1329
1330 function ff_put_h264_qpel16_mc22_neon, export=1
1331         push            {r4, r9-r11, lr}
1332         lowpass_const   r3
1333         mov             r11, sp
1334         bic             sp,  sp,  #15
1335         sub             r1,  r1,  r2, lsl #1
1336         sub             r1,  r1,  #2
1337         mov             r3,  r2
1338         sub             sp,  sp,  #(16*12)
1339         mov             r4,  sp
1340         vpush           {d8-d15}
1341         bl              put_h264_qpel16_hv_lowpass_neon
1342         vpop            {d8-d15}
1343         mov             sp,  r11
1344         pop             {r4, r9-r11, pc}
1345         .endfunc
1346
1347 function ff_put_h264_qpel16_mc32_neon, export=1
1348         push            {r0, r1, r4-r5, r9-r11, lr}
1349         add             r1,  r1,  #1
1350         b               put_h264_qpel16_mc12
1351         .endfunc
1352
1353 function ff_put_h264_qpel16_mc03_neon, export=1
1354         push            {r4, lr}
1355         add             ip,  r1,  r2
1356         b               put_h264_qpel16_mc01
1357         .endfunc
1358
1359 function ff_put_h264_qpel16_mc13_neon, export=1
1360         push            {r0, r1, r4, lr}
1361         add             r1,  r1,  r2
1362         b               put_h264_qpel16_mc11
1363         .endfunc
1364
1365 function ff_put_h264_qpel16_mc23_neon, export=1
1366         push            {r0, r1, r4-r5, r9-r11, lr}
1367         add             r1,  r1,  r2
1368         b               put_h264_qpel16_mc21
1369         .endfunc
1370
1371 function ff_put_h264_qpel16_mc33_neon, export=1
1372         add             r1,  r1,  #1
1373         push            {r0, r1, r4, lr}
1374         add             r1,  r1,  r2
1375         sub             r1,  r1,  #1
1376         b               put_h264_qpel16_mc11
1377         .endfunc