arm/h264dsp_neon.S @ 8359:9281a8a9387a (libavcodec)
ARM: replace "armv4l" with "arm"

author:     mru
date:       Wed, 17 Dec 2008 00:54:54 +0000
parents:    armv4l/h264dsp_neon.S@b294a0d5bc50
children:   8d425ee85ddb
comparison: 8358:c30b92cf446b -> 8359:9281a8a9387a

/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        .fpu neon

.macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32 \r0, \r4
        vtrn.32 \r1, \r5
        vtrn.32 \r2, \r6
        vtrn.32 \r3, \r7
        vtrn.16 \r0, \r2
        vtrn.16 \r1, \r3
        vtrn.16 \r4, \r6
        vtrn.16 \r5, \r7
        vtrn.8 \r0, \r1
        vtrn.8 \r2, \r3
        vtrn.8 \r4, \r5
        vtrn.8 \r6, \r7
.endm
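
/* The macro above is a three-stage butterfly transpose of an 8x8 block of
 * bytes held in eight d-registers: vtrn.32 exchanges 4x4 sub-blocks,
 * vtrn.16 exchanges 2x2 sub-blocks, and vtrn.8 exchanges single elements.
 * A scalar C sketch of the same end result, for illustration only:
 *
 *     static void transpose_8x8_c(uint8_t m[8][8])
 *     {
 *         for (int i = 0; i < 8; i++)
 *             for (int j = 0; j < i; j++) {
 *                 uint8_t t = m[i][j];
 *                 m[i][j] = m[j][i];
 *                 m[j][i] = t;
 *             }
 *     }
 */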

.macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
        vswp \r0, \r4
        vswp \r1, \r5
        vswp \r2, \r6
        vswp \r3, \r7
.endm

.macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32 \r0, \r2
        vtrn.32 \r1, \r3
        vtrn.32 \r4, \r6
        vtrn.32 \r5, \r7
        vtrn.16 \r0, \r1
        vtrn.16 \r2, \r3
        vtrn.16 \r4, \r5
        vtrn.16 \r6, \r7
.endm

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
.macro h264_chroma_mc8 avg=0
        push {r4-r7, lr}
        ldrd r4, [sp, #20]
.if \avg
        mov lr, r0
.endif
        pld [r1]
        pld [r1, r2]

        muls r7, r4, r5
        rsb r6, r7, r5, lsl #3
        rsb ip, r7, r4, lsl #3
        sub r4, r7, r4, lsl #3
        sub r4, r4, r5, lsl #3
        add r4, r4, #64

        beq 2f

        add r5, r1, r2

        vdup.8 d0, r4
        lsl r4, r2, #1
        vdup.8 d1, ip
        vld1.64 {d4, d5}, [r1], r4
        vdup.8 d2, r6
        vld1.64 {d6, d7}, [r5], r4
        vdup.8 d3, r7

        vext.8 d5, d4, d5, #1
        vext.8 d7, d6, d7, #1

1:      pld [r5]
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d5, d1
        vld1.64 {d4, d5}, [r1], r4
        vmlal.u8 q8, d6, d2
        vext.8 d5, d4, d5, #1
        vmlal.u8 q8, d7, d3
        vmull.u8 q9, d6, d0
        subs r3, r3, #2
        vmlal.u8 q9, d7, d1
        vmlal.u8 q9, d4, d2
        vmlal.u8 q9, d5, d3
        vrshrn.u16 d16, q8, #6
        vld1.64 {d6, d7}, [r5], r4
        pld [r1]
        vrshrn.u16 d17, q9, #6
.if \avg
        vld1.64 {d20}, [lr,:64], r2
        vld1.64 {d21}, [lr,:64], r2
        vrhadd.u8 q8, q8, q10
.endif
        vext.8 d7, d6, d7, #1
        vst1.64 {d16}, [r0,:64], r2
        vst1.64 {d17}, [r0,:64], r2
        bgt 1b

        pop {r4-r7, pc}

2:      tst r6, r6
        add ip, ip, r6
        vdup.8 d0, r4
        vdup.8 d1, ip

        beq 4f

        add r5, r1, r2
        lsl r4, r2, #1
        vld1.64 {d4}, [r1], r4
        vld1.64 {d6}, [r5], r4

3:      pld [r5]
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d6, d1
        vld1.64 {d4}, [r1], r4
        vmull.u8 q9, d6, d0
        vmlal.u8 q9, d4, d1
        vld1.64 {d6}, [r5], r4
        vrshrn.u16 d16, q8, #6
        vrshrn.u16 d17, q9, #6
.if \avg
        vld1.64 {d20}, [lr,:64], r2
        vld1.64 {d21}, [lr,:64], r2
        vrhadd.u8 q8, q8, q10
.endif
        subs r3, r3, #2
        pld [r1]
        vst1.64 {d16}, [r0,:64], r2
        vst1.64 {d17}, [r0,:64], r2
        bgt 3b

        pop {r4-r7, pc}

4:      vld1.64 {d4, d5}, [r1], r2
        vld1.64 {d6, d7}, [r1], r2
        vext.8 d5, d4, d5, #1
        vext.8 d7, d6, d7, #1

5:      pld [r1]
        subs r3, r3, #2
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d5, d1
        vld1.64 {d4, d5}, [r1], r2
        vmull.u8 q9, d6, d0
        vmlal.u8 q9, d7, d1
        pld [r1]
        vext.8 d5, d4, d5, #1
        vrshrn.u16 d16, q8, #6
        vrshrn.u16 d17, q9, #6
.if \avg
        vld1.64 {d20}, [lr,:64], r2
        vld1.64 {d21}, [lr,:64], r2
        vrhadd.u8 q8, q8, q10
.endif
        vld1.64 {d6, d7}, [r1], r2
        vext.8 d7, d6, d7, #1
        vst1.64 {d16}, [r0,:64], r2
        vst1.64 {d17}, [r0,:64], r2
        bgt 5b

        pop {r4-r7, pc}
.endm
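
/* The three loops above specialize the H.264 chroma interpolation: loop 1
 * handles the full bilinear case (x and y both non-zero), loop 3 the
 * vertical-only case (x == 0), and loop 5 the horizontal-only case (y == 0).
 * The weights set up in r4/ip/r6/r7 are (8-x)(8-y), x(8-y), (8-x)y and xy,
 * and vrshrn #6 supplies the +32 rounding. A scalar sketch of the general
 * case (the mc4 variant below computes the same thing on 4-pixel rows; the
 * avg variants additionally round-average with the old dst, as vrhadd does):
 *
 *     static void put_chroma_mc8_c(uint8_t *dst, uint8_t *src, int stride,
 *                                  int h, int x, int y)
 *     {
 *         const int A = (8 - x) * (8 - y);
 *         const int B =      x  * (8 - y);
 *         const int C = (8 - x) *      y;
 *         const int D =      x  *      y;
 *         for (int i = 0; i < h; i++) {
 *             for (int j = 0; j < 8; j++)
 *                 dst[j] = (A * src[j]          + B * src[j + 1] +
 *                           C * src[j + stride] + D * src[j + stride + 1] +
 *                           32) >> 6;
 *             dst += stride;
 *             src += stride;
 *         }
 *     }
 */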

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
.macro h264_chroma_mc4 avg=0
        push {r4-r7, lr}
        ldrd r4, [sp, #20]
.if \avg
        mov lr, r0
.endif
        pld [r1]
        pld [r1, r2]

        muls r7, r4, r5
        rsb r6, r7, r5, lsl #3
        rsb ip, r7, r4, lsl #3
        sub r4, r7, r4, lsl #3
        sub r4, r4, r5, lsl #3
        add r4, r4, #64

        beq 2f

        add r5, r1, r2

        vdup.8 d0, r4
        lsl r4, r2, #1
        vdup.8 d1, ip
        vld1.64 {d4}, [r1], r4
        vdup.8 d2, r6
        vld1.64 {d6}, [r5], r4
        vdup.8 d3, r7

        vext.8 d5, d4, d5, #1
        vext.8 d7, d6, d7, #1
        vtrn.32 d4, d5
        vtrn.32 d6, d7

        vtrn.32 d0, d1
        vtrn.32 d2, d3

1:      pld [r5]
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d6, d2
        vld1.64 {d4}, [r1], r4
        vext.8 d5, d4, d5, #1
        vtrn.32 d4, d5
        vmull.u8 q9, d6, d0
        vmlal.u8 q9, d4, d2
        vld1.64 {d6}, [r5], r4
        vadd.i16 d16, d16, d17
        vadd.i16 d17, d18, d19
        vrshrn.u16 d16, q8, #6
        subs r3, r3, #2
        pld [r1]
.if \avg
        vld1.32 {d20[0]}, [lr,:32], r2
        vld1.32 {d20[1]}, [lr,:32], r2
        vrhadd.u8 d16, d16, d20
.endif
        vext.8 d7, d6, d7, #1
        vtrn.32 d6, d7
        vst1.32 {d16[0]}, [r0,:32], r2
        vst1.32 {d16[1]}, [r0,:32], r2
        bgt 1b

        pop {r4-r7, pc}

2:      tst r6, r6
        add ip, ip, r6
        vdup.8 d0, r4
        vdup.8 d1, ip
        vtrn.32 d0, d1

        beq 4f

        vext.32 d1, d0, d1, #1
        add r5, r1, r2
        lsl r4, r2, #1
        vld1.32 {d4[0]}, [r1], r4
        vld1.32 {d4[1]}, [r5], r4

3:      pld [r5]
        vmull.u8 q8, d4, d0
        vld1.32 {d4[0]}, [r1], r4
        vmull.u8 q9, d4, d1
        vld1.32 {d4[1]}, [r5], r4
        vadd.i16 d16, d16, d17
        vadd.i16 d17, d18, d19
        vrshrn.u16 d16, q8, #6
.if \avg
        vld1.32 {d20[0]}, [lr,:32], r2
        vld1.32 {d20[1]}, [lr,:32], r2
        vrhadd.u8 d16, d16, d20
.endif
        subs r3, r3, #2
        pld [r1]
        vst1.32 {d16[0]}, [r0,:32], r2
        vst1.32 {d16[1]}, [r0,:32], r2
        bgt 3b

        pop {r4-r7, pc}

4:      vld1.64 {d4}, [r1], r2
        vld1.64 {d6}, [r1], r2
        vext.8 d5, d4, d5, #1
        vext.8 d7, d6, d7, #1
        vtrn.32 d4, d5
        vtrn.32 d6, d7

5:      vmull.u8 q8, d4, d0
        vmull.u8 q9, d6, d0
        subs r3, r3, #2
        vld1.64 {d4}, [r1], r2
        vext.8 d5, d4, d5, #1
        vtrn.32 d4, d5
        vadd.i16 d16, d16, d17
        vadd.i16 d17, d18, d19
        pld [r1]
        vrshrn.u16 d16, q8, #6
.if \avg
        vld1.32 {d20[0]}, [lr,:32], r2
        vld1.32 {d20[1]}, [lr,:32], r2
        vrhadd.u8 d16, d16, d20
.endif
        vld1.64 {d6}, [r1], r2
        vext.8 d7, d6, d7, #1
        vtrn.32 d6, d7
        pld [r1]
        vst1.32 {d16[0]}, [r0,:32], r2
        vst1.32 {d16[1]}, [r0,:32], r2
        bgt 5b

        pop {r4-r7, pc}
.endm

        .text
        .align

function ff_put_h264_chroma_mc8_neon, export=1
        h264_chroma_mc8
.endfunc

function ff_avg_h264_chroma_mc8_neon, export=1
        h264_chroma_mc8 avg=1
.endfunc

function ff_put_h264_chroma_mc4_neon, export=1
        h264_chroma_mc4
.endfunc

function ff_avg_h264_chroma_mc4_neon, export=1
        h264_chroma_mc4 avg=1
.endfunc

/* H.264 loop filter */

.macro h264_loop_filter_start
        ldr ip, [sp]
        tst r2, r2
        ldr ip, [ip]
        tstne r3, r3
        vmov.32 d24[0], ip
        and ip, ip, ip, lsl #16
        bxeq lr
        ands ip, ip, ip, lsl #8
        bxlt lr
.endm
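
/* h264_loop_filter_start loads alpha (r2), beta (r3) and the four tc0 bytes
 * (through the pointer passed on the stack) and bails out early when no
 * filtering can happen: bxeq fires when alpha or beta is zero, and the
 * shift-and-AND chain leaves the sign bit set only when all four tc0 values
 * are negative, so bxlt returns in that case. A scalar sketch of the same
 * test (filter_needed is a hypothetical helper, for illustration only):
 *
 *     static int filter_needed(int alpha, int beta, const int8_t tc0[4])
 *     {
 *         if (alpha == 0 || beta == 0)
 *             return 0;
 *         return tc0[0] >= 0 || tc0[1] >= 0 || tc0[2] >= 0 || tc0[3] >= 0;
 *     }
 */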

/* Align sp to 16 bytes and spill q4-q7 (the AAPCS only guarantees 8-byte
 * stack alignment at function entry, while the :128-aligned vst1/vld1 need
 * 16); ip records the extra adjustment so align_pop_regs can restore sp. */
.macro align_push_regs
        and ip, sp, #15
        add ip, ip, #32
        sub sp, sp, ip
        vst1.64 {d12-d15}, [sp,:128]
        sub sp, sp, #32
        vst1.64 {d8-d11}, [sp,:128]
.endm

.macro align_pop_regs
        vld1.64 {d8-d11}, [sp,:128]!
        vld1.64 {d12-d15}, [sp,:128], ip
.endm

.macro h264_loop_filter_luma
        vdup.8 q11, r2 @ alpha
        vmovl.u8 q12, d24
        vabd.u8 q6, q8, q0 @ abs(p0 - q0)
        vmovl.u16 q12, d24
        vabd.u8 q14, q9, q8 @ abs(p1 - p0)
        vsli.16 q12, q12, #8
        vabd.u8 q15, q1, q0 @ abs(q1 - q0)
        vsli.32 q12, q12, #16
        vclt.u8 q6, q6, q11 @ < alpha
        vdup.8 q11, r3 @ beta
        vclt.s8 q7, q12, #0
        vclt.u8 q14, q14, q11 @ < beta
        vclt.u8 q15, q15, q11 @ < beta
        vbic q6, q6, q7
        vabd.u8 q4, q10, q8 @ abs(p2 - p0)
        vand q6, q6, q14
        vabd.u8 q5, q2, q0 @ abs(q2 - q0)
        vclt.u8 q4, q4, q11 @ < beta
        vand q6, q6, q15
        vclt.u8 q5, q5, q11 @ < beta
        vand q4, q4, q6
        vand q5, q5, q6
        vand q12, q12, q6
        vrhadd.u8 q14, q8, q0
        vsub.i8 q6, q12, q4
        vqadd.u8 q7, q9, q12
        vhadd.u8 q10, q10, q14
        vsub.i8 q6, q6, q5
        vhadd.u8 q14, q2, q14
        vmin.u8 q7, q7, q10
        vqsub.u8 q11, q9, q12
        vqadd.u8 q2, q1, q12
        vmax.u8 q7, q7, q11
        vqsub.u8 q11, q1, q12
        vmin.u8 q14, q2, q14
        vmovl.u8 q2, d0
        vmax.u8 q14, q14, q11
        vmovl.u8 q10, d1
        vsubw.u8 q2, q2, d16
        vsubw.u8 q10, q10, d17
        vshl.i16 q2, q2, #2
        vshl.i16 q10, q10, #2
        vaddw.u8 q2, q2, d18
        vaddw.u8 q10, q10, d19
        vsubw.u8 q2, q2, d2
        vsubw.u8 q10, q10, d3
        vrshrn.i16 d4, q2, #3
        vrshrn.i16 d5, q10, #3
        vbsl q4, q7, q9
        vbsl q5, q14, q1
        vneg.s8 q7, q6
        vmovl.u8 q14, d16
        vmin.s8 q2, q2, q6
        vmovl.u8 q6, d17
        vmax.s8 q2, q2, q7
        vmovl.u8 q11, d0
        vmovl.u8 q12, d1
        vaddw.s8 q14, q14, d4
        vaddw.s8 q6, q6, d5
        vsubw.s8 q11, q11, d4
        vsubw.s8 q12, q12, d5
        vqmovun.s16 d16, q14
        vqmovun.s16 d17, q6
        vqmovun.s16 d0, q11
        vqmovun.s16 d1, q12
.endm
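
/* The macro above is the standard H.264 normal-strength luma edge filter,
 * vectorized over 16 pixel positions: p2..q2 live in q10/q9/q8/q0/q1/q2 and
 * the four tc0 bytes are broadcast across q12 by the vmovl/vsli sequence.
 * A scalar sketch of one pixel position, with a hypothetical clip() helper:
 *
 *     static inline int clip(int x, int lo, int hi)
 *     {
 *         return x < lo ? lo : x > hi ? hi : x;
 *     }
 *
 *     static void filter_luma_pixel(uint8_t *pix, int stride,  // q0 at pix[0]
 *                                   int alpha, int beta, int tc0)
 *     {
 *         int p2 = pix[-3*stride], p1 = pix[-2*stride], p0 = pix[-stride];
 *         int q0 = pix[0],         q1 = pix[ stride],   q2 = pix[2*stride];
 *         int tc = tc0;
 *
 *         if (tc0 < 0 || abs(p0 - q0) >= alpha ||
 *             abs(p1 - p0) >= beta || abs(q1 - q0) >= beta)
 *             return;
 *
 *         if (abs(p2 - p0) < beta) {          // p side is smooth: fix p1 too
 *             int avg = (p0 + q0 + 1) >> 1;
 *             pix[-2*stride] = p1 + clip(((p2 + avg) >> 1) - p1, -tc0, tc0);
 *             tc++;
 *         }
 *         if (abs(q2 - q0) < beta) {          // likewise for q1
 *             int avg = (p0 + q0 + 1) >> 1;
 *             pix[stride] = q1 + clip(((q2 + avg) >> 1) - q1, -tc0, tc0);
 *             tc++;
 *         }
 *
 *         int delta = clip(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc);
 *         pix[-stride] = clip(p0 + delta, 0, 255);
 *         pix[0]       = clip(q0 - delta, 0, 255);
 *     }
 */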

function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        vld1.64 {d0, d1}, [r0,:128], r1
        vld1.64 {d2, d3}, [r0,:128], r1
        vld1.64 {d4, d5}, [r0,:128], r1
        sub r0, r0, r1, lsl #2
        sub r0, r0, r1, lsl #1
        vld1.64 {d20,d21}, [r0,:128], r1
        vld1.64 {d18,d19}, [r0,:128], r1
        vld1.64 {d16,d17}, [r0,:128], r1

        align_push_regs

        h264_loop_filter_luma

        sub r0, r0, r1, lsl #1
        vst1.64 {d8, d9}, [r0,:128], r1
        vst1.64 {d16,d17}, [r0,:128], r1
        vst1.64 {d0, d1}, [r0,:128], r1
        vst1.64 {d10,d11}, [r0,:128]

        align_pop_regs
        bx lr
.endfunc

function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        sub r0, r0, #4
        vld1.64 {d6}, [r0], r1
        vld1.64 {d20}, [r0], r1
        vld1.64 {d18}, [r0], r1
        vld1.64 {d16}, [r0], r1
        vld1.64 {d0}, [r0], r1
        vld1.64 {d2}, [r0], r1
        vld1.64 {d4}, [r0], r1
        vld1.64 {d26}, [r0], r1
        vld1.64 {d7}, [r0], r1
        vld1.64 {d21}, [r0], r1
        vld1.64 {d19}, [r0], r1
        vld1.64 {d17}, [r0], r1
        vld1.64 {d1}, [r0], r1
        vld1.64 {d3}, [r0], r1
        vld1.64 {d5}, [r0], r1
        vld1.64 {d27}, [r0], r1

        transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13

        align_push_regs
        sub sp, sp, #16
        vst1.64 {d4, d5}, [sp,:128]
        sub sp, sp, #16
        vst1.64 {d20,d21}, [sp,:128]

        h264_loop_filter_luma

        vld1.64 {d20,d21}, [sp,:128]!
        vld1.64 {d4, d5}, [sp,:128]!

        transpose_8x8 q3, q10, q4, q8, q0, q5, q2, q13

        sub r0, r0, r1, lsl #4
        vst1.64 {d6}, [r0], r1
        vst1.64 {d20}, [r0], r1
        vst1.64 {d8}, [r0], r1
        vst1.64 {d16}, [r0], r1
        vst1.64 {d0}, [r0], r1
        vst1.64 {d10}, [r0], r1
        vst1.64 {d4}, [r0], r1
        vst1.64 {d26}, [r0], r1
        vst1.64 {d7}, [r0], r1
        vst1.64 {d21}, [r0], r1
        vst1.64 {d9}, [r0], r1
        vst1.64 {d17}, [r0], r1
        vst1.64 {d1}, [r0], r1
        vst1.64 {d11}, [r0], r1
        vst1.64 {d5}, [r0], r1
        vst1.64 {d27}, [r0], r1

        align_pop_regs
        bx lr
.endfunc

.macro h264_loop_filter_chroma
        vdup.8 d22, r2 @ alpha
        vmovl.u8 q12, d24
        vabd.u8 d26, d16, d0 @ abs(p0 - q0)
        vmovl.u8 q2, d0
        vabd.u8 d28, d18, d16 @ abs(p1 - p0)
        vsubw.u8 q2, q2, d16
        vsli.16 d24, d24, #8
        vshl.i16 q2, q2, #2
        vabd.u8 d30, d2, d0 @ abs(q1 - q0)
        vaddw.u8 q2, q2, d18
        vclt.u8 d26, d26, d22 @ < alpha
        vsubw.u8 q2, q2, d2
        vdup.8 d22, r3 @ beta
        vclt.s8 d25, d24, #0
        vrshrn.i16 d4, q2, #3
        vclt.u8 d28, d28, d22 @ < beta
        vbic d26, d26, d25
        vclt.u8 d30, d30, d22 @ < beta
        vand d26, d26, d28
        vneg.s8 d25, d24
        vand d26, d26, d30
        vmin.s8 d4, d4, d24
        vmovl.u8 q14, d16
        vand d4, d4, d26
        vmax.s8 d4, d4, d25
        vmovl.u8 q11, d0
        vaddw.s8 q14, q14, d4
        vsubw.s8 q11, q11, d4
        vqmovun.s16 d16, q14
        vqmovun.s16 d0, q11
.endm
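
/* The chroma filter only touches p0 and q0 and uses the same delta formula
 * as the luma filter, with each tc0 byte covering two pixel positions (the
 * vmovl/vsli pair duplicates the bytes). A scalar sketch, reusing the clip()
 * helper from the luma sketch above and following the scalar reference
 * implementation, which clips the delta to tc0 + 1:
 *
 *     static void filter_chroma_pixel(uint8_t *pix, int stride, // q0 at pix[0]
 *                                     int alpha, int beta, int tc0)
 *     {
 *         int p1 = pix[-2*stride], p0 = pix[-stride];
 *         int q0 = pix[0],         q1 = pix[ stride];
 *         int tc = tc0 + 1;
 *
 *         if (tc0 < 0 || abs(p0 - q0) >= alpha ||
 *             abs(p1 - p0) >= beta || abs(q1 - q0) >= beta)
 *             return;
 *
 *         int delta = clip(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc);
 *         pix[-stride] = clip(p0 + delta, 0, 255);
 *         pix[0]       = clip(q0 - delta, 0, 255);
 *     }
 */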

function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub r0, r0, r1, lsl #1
        vld1.64 {d18}, [r0,:64], r1
        vld1.64 {d16}, [r0,:64], r1
        vld1.64 {d0}, [r0,:64], r1
        vld1.64 {d2}, [r0,:64]

        h264_loop_filter_chroma

        sub r0, r0, r1, lsl #1
        vst1.64 {d16}, [r0,:64], r1
        vst1.64 {d0}, [r0,:64], r1

        bx lr
.endfunc

function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub r0, r0, #2
        vld1.32 {d18[0]}, [r0], r1
        vld1.32 {d16[0]}, [r0], r1
        vld1.32 {d0[0]}, [r0], r1
        vld1.32 {d2[0]}, [r0], r1
        vld1.32 {d18[1]}, [r0], r1
        vld1.32 {d16[1]}, [r0], r1
        vld1.32 {d0[1]}, [r0], r1
        vld1.32 {d2[1]}, [r0], r1

        vtrn.16 d18, d0
        vtrn.16 d16, d2
        vtrn.8 d18, d16
        vtrn.8 d0, d2

        h264_loop_filter_chroma

        vtrn.16 d18, d0
        vtrn.16 d16, d2
        vtrn.8 d18, d16
        vtrn.8 d0, d2

        sub r0, r0, r1, lsl #3
        vst1.32 {d18[0]}, [r0], r1
        vst1.32 {d16[0]}, [r0], r1
        vst1.32 {d0[0]}, [r0], r1
        vst1.32 {d2[0]}, [r0], r1
        vst1.32 {d18[1]}, [r0], r1
        vst1.32 {d16[1]}, [r0], r1
        vst1.32 {d0[1]}, [r0], r1
        vst1.32 {d2[1]}, [r0], r1

        bx lr
.endfunc

/* H.264 qpel MC */

.macro lowpass_const r
        movw \r, #5
        movt \r, #20
        vmov.32 d6[0], \r
.endm

.macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
.if \narrow
t0 .req q0
t1 .req q8
.else
t0 .req \d0
t1 .req \d1
.endif
        vext.8 d2, \r0, \r1, #2
        vext.8 d3, \r0, \r1, #3
        vaddl.u8 q1, d2, d3
        vext.8 d4, \r0, \r1, #1
        vext.8 d5, \r0, \r1, #4
        vaddl.u8 q2, d4, d5
        vext.8 d30, \r0, \r1, #5
        vaddl.u8 t0, \r0, d30
        vext.8 d18, \r2, \r3, #2
        vmla.i16 t0, q1, d6[1]
        vext.8 d19, \r2, \r3, #3
        vaddl.u8 q9, d18, d19
        vext.8 d20, \r2, \r3, #1
        vmls.i16 t0, q2, d6[0]
        vext.8 d21, \r2, \r3, #4
        vaddl.u8 q10, d20, d21
        vext.8 d31, \r2, \r3, #5
        vaddl.u8 t1, \r2, d31
        vmla.i16 t1, q9, d6[1]
        vmls.i16 t1, q10, d6[0]
.if \narrow
        vqrshrun.s16 \d0, t0, #5
        vqrshrun.s16 \d1, t1, #5
.endif
.unreq t0
.unreq t1
.endm
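
/* lowpass_const packs the two filter taps into d6 (movw/movt leave 5 in the
 * low halfword and 20 in the high one, so d6[0] = 5 and d6[1] = 20 as .16
 * scalars for vmls/vmla), and lowpass_8 applies the H.264 6-tap half-pel
 * filter (1, -5, 20, 20, -5, 1) to two 8-pixel rows at once; vqrshrun #5
 * adds the +16 rounding, shifts, and saturates to unsigned 8 bits. A scalar
 * sketch of one output pixel (callers pre-adjust src by -2, as the vext
 * offsets assume):
 *
 *     static uint8_t lowpass_pixel(const uint8_t *src)  // reads src[-2..3]
 *     {
 *         int v = (src[-2] + src[3])
 *               - 5 * (src[-1] + src[2])
 *               + 20 * (src[0] + src[1]);
 *         v = (v + 16) >> 5;
 *         return v < 0 ? 0 : v > 255 ? 255 : v;
 *     }
 */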

.macro lowpass_8_1 r0, r1, d0, narrow=1
.if \narrow
t0 .req q0
.else
t0 .req \d0
.endif
        vext.8 d2, \r0, \r1, #2
        vext.8 d3, \r0, \r1, #3
        vaddl.u8 q1, d2, d3
        vext.8 d4, \r0, \r1, #1
        vext.8 d5, \r0, \r1, #4
        vaddl.u8 q2, d4, d5
        vext.8 d30, \r0, \r1, #5
        vaddl.u8 t0, \r0, d30
        vmla.i16 t0, q1, d6[1]
        vmls.i16 t0, q2, d6[0]
.if \narrow
        vqrshrun.s16 \d0, t0, #5
.endif
.unreq t0
.endm

.macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
        vext.16 q1, \r0, \r1, #2
        vext.16 q0, \r0, \r1, #3
        vaddl.s16 q9, d2, d0
        vext.16 q2, \r0, \r1, #1
        vaddl.s16 q1, d3, d1
        vext.16 q3, \r0, \r1, #4
        vaddl.s16 q10, d4, d6
        vext.16 \r1, \r0, \r1, #5
        vaddl.s16 q2, d5, d7
        vaddl.s16 q0, \h0, \h1
        vaddl.s16 q8, \l0, \l1

        vshl.i32 q3, q9, #4
        vshl.i32 q9, q9, #2
        vshl.i32 q15, q10, #2
        vadd.i32 q9, q9, q3
        vadd.i32 q10, q10, q15

        vshl.i32 q3, q1, #4
        vshl.i32 q1, q1, #2
        vshl.i32 q15, q2, #2
        vadd.i32 q1, q1, q3
        vadd.i32 q2, q2, q15

        vadd.i32 q9, q9, q8
        vsub.i32 q9, q9, q10

        vadd.i32 q1, q1, q0
        vsub.i32 q1, q1, q2

        vrshrn.s32 d18, q9, #10
        vrshrn.s32 d19, q1, #10

        vqmovun.s16 \d, q9
.endm
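
/* lowpass_8.16 is the second pass of the 2D (hv) filter: it runs the same
 * 6-tap filter over the 16-bit intermediates of the first pass, widening to
 * 32 bits. The tap multiplies are built from shifts, as in the code above
 * (20*x = (x<<4) + (x<<2), 5*x = (x<<2) + x), and vrshrn #10 folds the
 * rounding of both passes into one (+512) >> 10. A scalar sketch, where
 * m[0..5] are six vertically adjacent first-pass intermediates:
 *
 *     static uint8_t lowpass_pixel_2d(const int16_t m[6])
 *     {
 *         int a = m[0] + m[5];
 *         int b = m[1] + m[4];
 *         int c = m[2] + m[3];
 *         int v = a + ((c << 4) + (c << 2)) - ((b << 2) + b);
 *         v = (v + 512) >> 10;
 *         return v < 0 ? 0 : v > 255 ? 255 : v;
 *     }
 */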

function put_h264_qpel16_h_lowpass_neon_packed
        mov r4, lr
        mov ip, #16
        mov r3, #8
        bl put_h264_qpel8_h_lowpass_neon
        sub r1, r1, r2, lsl #4
        add r1, r1, #8
        mov ip, #16
        mov lr, r4
        b put_h264_qpel8_h_lowpass_neon
.endfunc

function put_h264_qpel16_h_lowpass_neon
        push {lr}
        mov ip, #16
        bl put_h264_qpel8_h_lowpass_neon
        sub r0, r0, r3, lsl #4
        sub r1, r1, r2, lsl #4
        add r0, r0, #8
        add r1, r1, #8
        mov ip, #16
        pop {lr}
.endfunc

function put_h264_qpel8_h_lowpass_neon
1:      vld1.64 {d0, d1}, [r1], r2
        vld1.64 {d16,d17}, [r1], r2
        subs ip, ip, #2
        lowpass_8 d0, d1, d16, d17, d0, d16
        vst1.64 {d0}, [r0,:64], r3
        vst1.64 {d16}, [r0,:64], r3
        bne 1b
        bx lr
.endfunc

function put_h264_qpel16_h_lowpass_l2_neon
        push {lr}
        mov ip, #16
        bl put_h264_qpel8_h_lowpass_l2_neon
        sub r0, r0, r2, lsl #4
        sub r1, r1, r2, lsl #4
        sub r3, r3, r2, lsl #4
        add r0, r0, #8
        add r1, r1, #8
        add r3, r3, #8
        mov ip, #16
        pop {lr}
.endfunc

function put_h264_qpel8_h_lowpass_l2_neon
1:      vld1.64 {d0, d1}, [r1], r2
        vld1.64 {d16,d17}, [r1], r2
        vld1.64 {d28}, [r3], r2
        vld1.64 {d29}, [r3], r2
        subs ip, ip, #2
        lowpass_8 d0, d1, d16, d17, d0, d1
        vrhadd.u8 q0, q0, q14
        vst1.64 {d0}, [r0,:64], r2
        vst1.64 {d1}, [r0,:64], r2
        bne 1b
        bx lr
.endfunc

function put_h264_qpel16_v_lowpass_neon_packed
        mov r4, lr
        mov r2, #8
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        mov lr, r4
        b put_h264_qpel8_v_lowpass_neon
.endfunc

function put_h264_qpel16_v_lowpass_neon
        mov r4, lr
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        bl put_h264_qpel8_v_lowpass_neon
        sub r0, r0, r2, lsl #4
        add r0, r0, #8
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        mov lr, r4
.endfunc

function put_h264_qpel8_v_lowpass_neon
        vld1.64 {d8}, [r1], r3
        vld1.64 {d10}, [r1], r3
        vld1.64 {d12}, [r1], r3
        vld1.64 {d14}, [r1], r3
        vld1.64 {d22}, [r1], r3
        vld1.64 {d24}, [r1], r3
        vld1.64 {d26}, [r1], r3
        vld1.64 {d28}, [r1], r3
        vld1.64 {d9}, [r1], r3
        vld1.64 {d11}, [r1], r3
        vld1.64 {d13}, [r1], r3
        vld1.64 {d15}, [r1], r3
        vld1.64 {d23}, [r1]

        transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
        lowpass_8 d8, d9, d10, d11, d8, d10
        lowpass_8 d12, d13, d14, d15, d12, d14
        lowpass_8 d22, d23, d24, d25, d22, d24
        lowpass_8 d26, d27, d28, d29, d26, d28
        transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28

        vst1.64 {d8}, [r0,:64], r2
        vst1.64 {d10}, [r0,:64], r2
        vst1.64 {d12}, [r0,:64], r2
        vst1.64 {d14}, [r0,:64], r2
        vst1.64 {d22}, [r0,:64], r2
        vst1.64 {d24}, [r0,:64], r2
        vst1.64 {d26}, [r0,:64], r2
        vst1.64 {d28}, [r0,:64], r2

        bx lr
.endfunc

function put_h264_qpel16_v_lowpass_l2_neon
        mov r4, lr
        bl put_h264_qpel8_v_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        bl put_h264_qpel8_v_lowpass_l2_neon
        sub r0, r0, r3, lsl #4
        sub ip, ip, r2, lsl #4
        add r0, r0, #8
        add ip, ip, #8
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        bl put_h264_qpel8_v_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        mov lr, r4
.endfunc

function put_h264_qpel8_v_lowpass_l2_neon
        vld1.64 {d8}, [r1], r3
        vld1.64 {d10}, [r1], r3
        vld1.64 {d12}, [r1], r3
        vld1.64 {d14}, [r1], r3
        vld1.64 {d22}, [r1], r3
        vld1.64 {d24}, [r1], r3
        vld1.64 {d26}, [r1], r3
        vld1.64 {d28}, [r1], r3
        vld1.64 {d9}, [r1], r3
        vld1.64 {d11}, [r1], r3
        vld1.64 {d13}, [r1], r3
        vld1.64 {d15}, [r1], r3
        vld1.64 {d23}, [r1]

        transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
        lowpass_8 d8, d9, d10, d11, d8, d9
        lowpass_8 d12, d13, d14, d15, d12, d13
        lowpass_8 d22, d23, d24, d25, d22, d23
        lowpass_8 d26, d27, d28, d29, d26, d27
        transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27

        vld1.64 {d0}, [ip], r2
        vld1.64 {d1}, [ip], r2
        vld1.64 {d2}, [ip], r2
        vld1.64 {d3}, [ip], r2
        vld1.64 {d4}, [ip], r2
        vrhadd.u8 q0, q0, q4
        vld1.64 {d5}, [ip], r2
        vrhadd.u8 q1, q1, q6
        vld1.64 {d10}, [ip], r2
        vrhadd.u8 q2, q2, q11
        vld1.64 {d11}, [ip], r2

        vst1.64 {d0}, [r0,:64], r3
        vst1.64 {d1}, [r0,:64], r3
        vrhadd.u8 q5, q5, q13
        vst1.64 {d2}, [r0,:64], r3
        vst1.64 {d3}, [r0,:64], r3
        vst1.64 {d4}, [r0,:64], r3
        vst1.64 {d5}, [r0,:64], r3
        vst1.64 {d10}, [r0,:64], r3
        vst1.64 {d11}, [r0,:64], r3

        bx lr
.endfunc

function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const ip
        mov ip, #12
1:      vld1.64 {d0, d1}, [r1], r3
        vld1.64 {d16,d17}, [r1], r3
        subs ip, ip, #2
        lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
        vst1.64 {d22-d25}, [r4,:128]!
        bne 1b

        vld1.64 {d0, d1}, [r1]
        lowpass_8_1 d0, d1, q12, narrow=0

        mov ip, #-16
        add r4, r4, ip
        vld1.64 {d30,d31}, [r4,:128], ip
        vld1.64 {d20,d21}, [r4,:128], ip
        vld1.64 {d18,d19}, [r4,:128], ip
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d14,d15}, [r4,:128], ip
        vld1.64 {d12,d13}, [r4,:128], ip
        vld1.64 {d10,d11}, [r4,:128], ip
        vld1.64 {d8, d9}, [r4,:128], ip
        vld1.64 {d6, d7}, [r4,:128], ip
        vld1.64 {d4, d5}, [r4,:128], ip
        vld1.64 {d2, d3}, [r4,:128], ip
        vld1.64 {d0, d1}, [r4,:128]

        swap4 d1, d3, d5, d7, d8, d10, d12, d14
        transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7

        swap4 d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11

        vst1.64 {d30,d31}, [r4,:128]!
        vst1.64 {d6, d7}, [r4,:128]!
        vst1.64 {d20,d21}, [r4,:128]!
        vst1.64 {d4, d5}, [r4,:128]!
        vst1.64 {d18,d19}, [r4,:128]!
        vst1.64 {d2, d3}, [r4,:128]!
        vst1.64 {d16,d17}, [r4,:128]!
        vst1.64 {d0, d1}, [r4,:128]

        lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
        lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
        lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
        lowpass_8.16 q7, q11, d14, d15, d22, d23, d11

        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128], ip
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128], ip
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128], ip
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128]
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d15

        transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11

        bx lr
.endfunc

function put_h264_qpel8_hv_lowpass_neon
        mov r10, lr
        bl put_h264_qpel8_hv_lowpass_neon_top
        vst1.64 {d12}, [r0,:64], r2
        vst1.64 {d13}, [r0,:64], r2
        vst1.64 {d14}, [r0,:64], r2
        vst1.64 {d15}, [r0,:64], r2
        vst1.64 {d8}, [r0,:64], r2
        vst1.64 {d9}, [r0,:64], r2
        vst1.64 {d10}, [r0,:64], r2
        vst1.64 {d11}, [r0,:64], r2

        mov lr, r10
        bx lr
.endfunc

function put_h264_qpel8_hv_lowpass_l2_neon
        mov r10, lr
        bl put_h264_qpel8_hv_lowpass_neon_top

        vld1.64 {d0, d1}, [r2,:128]!
        vld1.64 {d2, d3}, [r2,:128]!
        vrhadd.u8 q0, q0, q6
        vld1.64 {d4, d5}, [r2,:128]!
        vrhadd.u8 q1, q1, q7
        vld1.64 {d6, d7}, [r2,:128]!
        vrhadd.u8 q2, q2, q4

        vst1.64 {d0}, [r0,:64], r3
        vrhadd.u8 q3, q3, q5
        vst1.64 {d1}, [r0,:64], r3
        vst1.64 {d2}, [r0,:64], r3
        vst1.64 {d3}, [r0,:64], r3
        vst1.64 {d4}, [r0,:64], r3
        vst1.64 {d5}, [r0,:64], r3
        vst1.64 {d6}, [r0,:64], r3
        vst1.64 {d7}, [r0,:64], r3

        mov lr, r10
        bx lr
.endfunc

function put_h264_qpel16_hv_lowpass_neon
        mov r9, lr
        bl put_h264_qpel8_hv_lowpass_neon
        sub r1, r1, r3, lsl #2
        bl put_h264_qpel8_hv_lowpass_neon
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        sub r0, r0, r2, lsl #4
        add r0, r0, #8
        bl put_h264_qpel8_hv_lowpass_neon
        sub r1, r1, r3, lsl #2
        mov lr, r9
        b put_h264_qpel8_hv_lowpass_neon
.endfunc

function put_h264_qpel16_hv_lowpass_l2_neon
        mov r9, lr
        sub r2, r4, #256
        bl put_h264_qpel8_hv_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        bl put_h264_qpel8_hv_lowpass_l2_neon
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        sub r0, r0, r3, lsl #4
        add r0, r0, #8
        bl put_h264_qpel8_hv_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        mov lr, r9
        b put_h264_qpel8_hv_lowpass_l2_neon
.endfunc
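
/* The exported ff_put_h264_qpel{8,16}_mcXY entry points below map each
 * quarter-pel position (X = horizontal quarter offset, Y = vertical) onto
 * the lowpass helpers above; positions with a quarter offset average a
 * half-pel filtered block with a neighbouring prediction, which the *_l2
 * helpers fold into their store loop via vrhadd. A scalar sketch of the
 * simplest such case, mc10 (x = 1/4, y = 0), which averages the horizontal
 * half-pel block with the aligned full-pel pixels (mc30 averages with the
 * pixel one to the right instead); lowpass_pixel() is the sketch given
 * earlier and put_qpel8_mc10_c is hypothetical, for illustration only:
 *
 *     static void put_qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
 *     {
 *         for (int i = 0; i < 8; i++) {
 *             for (int j = 0; j < 8; j++) {
 *                 int half = lowpass_pixel(src + j);  // 6-tap at (j+1/2, i)
 *                 dst[j] = (src[j] + half + 1) >> 1;  // round-average
 *             }
 *             src += stride;
 *             dst += stride;
 *         }
 *     }
 */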

function ff_put_h264_qpel8_mc10_neon, export=1
        lowpass_const r3
        mov r3, r1
        sub r1, r1, #2
        mov ip, #8
        b put_h264_qpel8_h_lowpass_l2_neon
.endfunc

function ff_put_h264_qpel8_mc20_neon, export=1
        lowpass_const r3
        sub r1, r1, #2
        mov r3, r2
        mov ip, #8
        b put_h264_qpel8_h_lowpass_neon
.endfunc

function ff_put_h264_qpel8_mc30_neon, export=1
        lowpass_const r3
        add r3, r1, #1
        sub r1, r1, #2
        mov ip, #8
        b put_h264_qpel8_h_lowpass_l2_neon
.endfunc

function ff_put_h264_qpel8_mc01_neon, export=1
        push {lr}
        mov ip, r1
put_h264_qpel8_mc01:
        lowpass_const r3
        mov r3, r2
        sub r1, r1, r2, lsl #1
        vpush {d8-d15}
        bl put_h264_qpel8_v_lowpass_l2_neon
        vpop {d8-d15}
        pop {pc}
.endfunc

function ff_put_h264_qpel8_mc11_neon, export=1
        push {r0, r1, r2, lr}
put_h264_qpel8_mc11:
        lowpass_const r3
        sub sp, sp, #64
        mov r0, sp
        sub r1, r1, #2
        mov r3, #8
        mov ip, #8
        vpush {d8-d15}
        bl put_h264_qpel8_h_lowpass_neon
        ldrd r0, [sp, #128]
        mov r3, r2
        add ip, sp, #64
        sub r1, r1, r2, lsl #1
        mov r2, #8
        bl put_h264_qpel8_v_lowpass_l2_neon
        vpop {d8-d15}
        add sp, sp, #76
        pop {pc}
.endfunc

function ff_put_h264_qpel8_mc21_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc21:
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub sp, sp, #(8*8+16*12)
        sub r1, r1, #2
        mov r3, #8
        mov r0, sp
        mov ip, #8
        vpush {d8-d15}
        bl put_h264_qpel8_h_lowpass_neon
        mov r4, r0
        ldrd r0, [r11]
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        sub r2, r4, #64
        bl put_h264_qpel8_hv_lowpass_l2_neon
        vpop {d8-d15}
        add sp, r11, #8
        pop {r4, r10, r11, pc}
.endfunc

function ff_put_h264_qpel8_mc31_neon, export=1
        add r1, r1, #1
        push {r0, r1, r2, lr}
        sub r1, r1, #1
        b put_h264_qpel8_mc11
.endfunc

function ff_put_h264_qpel8_mc02_neon, export=1
        push {lr}
        lowpass_const r3
        sub r1, r1, r2, lsl #1
        mov r3, r2
        vpush {d8-d15}
        bl put_h264_qpel8_v_lowpass_neon
        vpop {d8-d15}
        pop {pc}
.endfunc

function ff_put_h264_qpel8_mc12_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc12:
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub sp, sp, #(8*8+16*12)
        sub r1, r1, r2, lsl #1
        mov r3, r2
        mov r2, #8
        mov r0, sp
        vpush {d8-d15}
        bl put_h264_qpel8_v_lowpass_neon
        mov r4, r0
        ldrd r0, [r11]
        sub r1, r1, r3, lsl #1
        sub r1, r1, #2
        sub r2, r4, #64
        bl put_h264_qpel8_hv_lowpass_l2_neon
        vpop {d8-d15}
        add sp, r11, #8
        pop {r4, r10, r11, pc}
.endfunc

function ff_put_h264_qpel8_mc22_neon, export=1
        push {r4, r10, r11, lr}
        mov r11, sp
        bic sp, sp, #15
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        sub sp, sp, #(16*12)
        mov r4, sp
        vpush {d8-d15}
        bl put_h264_qpel8_hv_lowpass_neon
        vpop {d8-d15}
        mov sp, r11
        pop {r4, r10, r11, pc}
.endfunc

function ff_put_h264_qpel8_mc32_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
        add r1, r1, #1
        b put_h264_qpel8_mc12
.endfunc

function ff_put_h264_qpel8_mc03_neon, export=1
        push {lr}
        add ip, r1, r2
        b put_h264_qpel8_mc01
.endfunc

function ff_put_h264_qpel8_mc13_neon, export=1
        push {r0, r1, r2, lr}
        add r1, r1, r2
        b put_h264_qpel8_mc11
.endfunc

function ff_put_h264_qpel8_mc23_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
        add r1, r1, r2
        b put_h264_qpel8_mc21
.endfunc

function ff_put_h264_qpel8_mc33_neon, export=1
        add r1, r1, #1
        push {r0, r1, r2, lr}
        add r1, r1, r2
        sub r1, r1, #1
        b put_h264_qpel8_mc11
.endfunc

function ff_put_h264_qpel16_mc10_neon, export=1
        lowpass_const r3
        mov r3, r1
        sub r1, r1, #2
        b put_h264_qpel16_h_lowpass_l2_neon
.endfunc

function ff_put_h264_qpel16_mc20_neon, export=1
        lowpass_const r3
        sub r1, r1, #2
        mov r3, r2
        b put_h264_qpel16_h_lowpass_neon
.endfunc

function ff_put_h264_qpel16_mc30_neon, export=1
        lowpass_const r3
        add r3, r1, #1
        sub r1, r1, #2
        b put_h264_qpel16_h_lowpass_l2_neon
.endfunc

function ff_put_h264_qpel16_mc01_neon, export=1
        push {r4, lr}
        mov ip, r1
put_h264_qpel16_mc01:
        lowpass_const r3
        mov r3, r2
        sub r1, r1, r2, lsl #1
        vpush {d8-d15}
        bl put_h264_qpel16_v_lowpass_l2_neon
        vpop {d8-d15}
        pop {r4, pc}
.endfunc

function ff_put_h264_qpel16_mc11_neon, export=1
        push {r0, r1, r4, lr}
put_h264_qpel16_mc11:
        lowpass_const r3
        sub sp, sp, #256
        mov r0, sp
        sub r1, r1, #2
        mov r3, #16
        vpush {d8-d15}
        bl put_h264_qpel16_h_lowpass_neon
        add r0, sp, #256
        ldrd r0, [r0, #64]
        mov r3, r2
        add ip, sp, #64
        sub r1, r1, r2, lsl #1
        mov r2, #16
        bl put_h264_qpel16_v_lowpass_l2_neon
        vpop {d8-d15}
        add sp, sp, #(256+8)
        pop {r4, pc}
.endfunc

function ff_put_h264_qpel16_mc21_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc21:
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub sp, sp, #(16*16+16*12)
        sub r1, r1, #2
        mov r0, sp
        vpush {d8-d15}
        bl put_h264_qpel16_h_lowpass_neon_packed
        mov r4, r0
        ldrd r0, [r11]
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        bl put_h264_qpel16_hv_lowpass_l2_neon
        vpop {d8-d15}
        add sp, r11, #8
        pop {r4-r5, r9-r11, pc}
.endfunc

function ff_put_h264_qpel16_mc31_neon, export=1
        add r1, r1, #1
        push {r0, r1, r4, lr}
        sub r1, r1, #1
        b put_h264_qpel16_mc11
.endfunc

function ff_put_h264_qpel16_mc02_neon, export=1
        push {r4, lr}
        lowpass_const r3
        sub r1, r1, r2, lsl #1
        mov r3, r2
        vpush {d8-d15}
        bl put_h264_qpel16_v_lowpass_neon
        vpop {d8-d15}
        pop {r4, pc}
.endfunc

function ff_put_h264_qpel16_mc12_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc12:
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub sp, sp, #(16*16+16*12)
        sub r1, r1, r2, lsl #1
        mov r0, sp
        mov r3, r2
        vpush {d8-d15}
        bl put_h264_qpel16_v_lowpass_neon_packed
        mov r4, r0
        ldrd r0, [r11]
        sub r1, r1, r3, lsl #1
        sub r1, r1, #2
        mov r2, r3
        bl put_h264_qpel16_hv_lowpass_l2_neon
        vpop {d8-d15}
        add sp, r11, #8
        pop {r4-r5, r9-r11, pc}
.endfunc

function ff_put_h264_qpel16_mc22_neon, export=1
        push {r4, r9-r11, lr}
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        sub sp, sp, #(16*12)
        mov r4, sp
        vpush {d8-d15}
        bl put_h264_qpel16_hv_lowpass_neon
        vpop {d8-d15}
        mov sp, r11
        pop {r4, r9-r11, pc}
.endfunc

function ff_put_h264_qpel16_mc32_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
        add r1, r1, #1
        b put_h264_qpel16_mc12
.endfunc

function ff_put_h264_qpel16_mc03_neon, export=1
        push {r4, lr}
        add ip, r1, r2
        b put_h264_qpel16_mc01
.endfunc

function ff_put_h264_qpel16_mc13_neon, export=1
        push {r0, r1, r4, lr}
        add r1, r1, r2
        b put_h264_qpel16_mc11
.endfunc

function ff_put_h264_qpel16_mc23_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
        add r1, r1, r2
        b put_h264_qpel16_mc21
.endfunc

function ff_put_h264_qpel16_mc33_neon, export=1
        add r1, r1, #1
        push {r0, r1, r4, lr}
        add r1, r1, r2
        sub r1, r1, #1
        b put_h264_qpel16_mc11
.endfunc