/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        .fpu neon

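/* Transpose an 8x8 matrix of elements spread over eight registers,
 * using interleave steps (vtrn) at 32-, 16- and 8-bit granularity.
 * Applied to q registers, the same sequence in effect transposes two
 * 8x8 byte blocks, one in each register half. */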
        .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r4
        vtrn.32         \r1, \r5
        vtrn.32         \r2, \r6
        vtrn.32         \r3, \r7
        vtrn.16         \r0, \r2
        vtrn.16         \r1, \r3
        vtrn.16         \r4, \r6
        vtrn.16         \r5, \r7
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
        vtrn.8          \r4, \r5
        vtrn.8          \r6, \r7
        .endm

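/* Swap four pairs of registers: the cross-diagonal exchange step of a
 * transpose larger than vtrn alone can handle. */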
        .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
        vswp            \r0, \r4
        vswp            \r1, \r5
        vswp            \r2, \r6
        vswp            \r3, \r7
        .endm

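/* Transpose the 4x4 sub-blocks of 16-bit elements held in eight
 * q registers; together with swap4 this yields a full transpose of
 * 16-bit data. */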
        .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r2
        vtrn.32         \r1, \r3
        vtrn.32         \r4, \r6
        vtrn.32         \r5, \r7
        vtrn.16         \r0, \r1
        vtrn.16         \r2, \r3
        vtrn.16         \r4, \r5
        vtrn.16         \r6, \r7
        .endm

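/*
 * H.264 chroma MC is a 2-D bilinear filter.  With x and y the 1/8-pel
 * fractional offsets (r4/r5 on entry, loaded from the stack), the
 * rsb/sub/add sequence below computes the four tap weights
 *   A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y
 * into r4, ip, r6 and r7.  In scalar terms each output pixel is
 *   dst[i] = (A*src[i]   + B*src[i+1] +
 *             C*src[i+s] + D*src[i+s+1] + 32) >> 6     @ s = stride
 * and the avg= variants round-average that with the old dst
 * (vrhadd.u8).  When x*y == 0 the filter degenerates to 1-D,
 * dispatched at 2: to cheaper vertical (3:) or horizontal (5:) loops.
 */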
/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
        .macro h264_chroma_mc8 avg=0
        push            {r4-r7, lr}
        ldrd            r4, [sp, #20]
.if \avg
        mov             lr, r0
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7, r4, r5
        rsb             r6, r7, r5, lsl #3
        rsb             ip, r7, r4, lsl #3
        sub             r4, r7, r4, lsl #3
        sub             r4, r4, r5, lsl #3
        add             r4, r4, #64

        beq             2f

        add             r5, r1, r2

        vdup.8          d0, r4
        lsl             r4, r2, #1
        vdup.8          d1, ip
        vld1.64         {d4, d5}, [r1], r4
        vdup.8          d2, r6
        vld1.64         {d6, d7}, [r5], r4
        vdup.8          d3, r7

        vext.8          d5, d4, d5, #1
        vext.8          d7, d6, d7, #1

1:      pld             [r5]
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d5, d1
        vld1.64         {d4, d5}, [r1], r4
        vmlal.u8        q8, d6, d2
        vext.8          d5, d4, d5, #1
        vmlal.u8        q8, d7, d3
        vmull.u8        q9, d6, d0
        subs            r3, r3, #2
        vmlal.u8        q9, d7, d1
        vmlal.u8        q9, d4, d2
        vmlal.u8        q9, d5, d3
        vrshrn.u16      d16, q8, #6
        vld1.64         {d6, d7}, [r5], r4
        pld             [r1]
        vrshrn.u16      d17, q9, #6
.if \avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8, q8, q10
.endif
        vext.8          d7, d6, d7, #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6, r6
        add             ip, ip, r6
        vdup.8          d0, r4
        vdup.8          d1, ip

        beq             4f

        add             r5, r1, r2
        lsl             r4, r2, #1
        vld1.64         {d4}, [r1], r4
        vld1.64         {d6}, [r5], r4

3:      pld             [r5]
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d6, d1
        vld1.64         {d4}, [r1], r4
        vmull.u8        q9, d6, d0
        vmlal.u8        q9, d4, d1
        vld1.64         {d6}, [r5], r4
        vrshrn.u16      d16, q8, #6
        vrshrn.u16      d17, q9, #6
.if \avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8, q8, q10
.endif
        subs            r3, r3, #2
        pld             [r1]
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.64         {d4, d5}, [r1], r2
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d5, d4, d5, #1
        vext.8          d7, d6, d7, #1

5:      pld             [r1]
        subs            r3, r3, #2
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d5, d1
        vld1.64         {d4, d5}, [r1], r2
        vmull.u8        q9, d6, d0
        vmlal.u8        q9, d7, d1
        pld             [r1]
        vext.8          d5, d4, d5, #1
        vrshrn.u16      d16, q8, #6
        vrshrn.u16      d17, q9, #6
.if \avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8, q8, q10
.endif
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d7, d6, d7, #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
        .endm

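/* 4-pixel-wide variant.  The two horizontal taps src[i] and src[i+1]
 * of a row are packed into the halves of one d register (vext +
 * vtrn.32), with the weights packed to match (A|B, C|D), so a single
 * vmull.u8 produces all products for a row; vadd.i16 then folds the
 * two halves before narrowing. */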
/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
        .macro h264_chroma_mc4 avg=0
        push            {r4-r7, lr}
        ldrd            r4, [sp, #20]
.if \avg
        mov             lr, r0
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7, r4, r5
        rsb             r6, r7, r5, lsl #3
        rsb             ip, r7, r4, lsl #3
        sub             r4, r7, r4, lsl #3
        sub             r4, r4, r5, lsl #3
        add             r4, r4, #64

        beq             2f

        add             r5, r1, r2

        vdup.8          d0, r4
        lsl             r4, r2, #1
        vdup.8          d1, ip
        vld1.64         {d4}, [r1], r4
        vdup.8          d2, r6
        vld1.64         {d6}, [r5], r4
        vdup.8          d3, r7

        vext.8          d5, d4, d5, #1
        vext.8          d7, d6, d7, #1
        vtrn.32         d4, d5
        vtrn.32         d6, d7

        vtrn.32         d0, d1
        vtrn.32         d2, d3

1:      pld             [r5]
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d6, d2
        vld1.64         {d4}, [r1], r4
        vext.8          d5, d4, d5, #1
        vtrn.32         d4, d5
        vmull.u8        q9, d6, d0
        vmlal.u8        q9, d4, d2
        vld1.64         {d6}, [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8, #6
        subs            r3, r3, #2
        pld             [r1]
.if \avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vext.8          d7, d6, d7, #1
        vtrn.32         d6, d7
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6, r6
        add             ip, ip, r6
        vdup.8          d0, r4
        vdup.8          d1, ip
        vtrn.32         d0, d1

        beq             4f

        vext.32         d1, d0, d1, #1
        add             r5, r1, r2
        lsl             r4, r2, #1
        vld1.32         {d4[0]}, [r1], r4
        vld1.32         {d4[1]}, [r5], r4

3:      pld             [r5]
        vmull.u8        q8, d4, d0
        vld1.32         {d4[0]}, [r1], r4
        vmull.u8        q9, d4, d1
        vld1.32         {d4[1]}, [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8, #6
.if \avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        subs            r3, r3, #2
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.64         {d4}, [r1], r2
        vld1.64         {d6}, [r1], r2
        vext.8          d5, d4, d5, #1
        vext.8          d7, d6, d7, #1
        vtrn.32         d4, d5
        vtrn.32         d6, d7

5:      vmull.u8        q8, d4, d0
        vmull.u8        q9, d6, d0
        subs            r3, r3, #2
        vld1.64         {d4}, [r1], r2
        vext.8          d5, d4, d5, #1
        vtrn.32         d4, d5
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
        vrshrn.u16      d16, q8, #6
.if \avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vld1.64         {d6}, [r1], r2
        vext.8          d7, d6, d7, #1
        vtrn.32         d6, d7
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
        .endm

        .text
        .align

function ff_put_h264_chroma_mc8_neon, export=1
        h264_chroma_mc8
        .endfunc

function ff_avg_h264_chroma_mc8_neon, export=1
        h264_chroma_mc8 avg=1
        .endfunc

function ff_put_h264_chroma_mc4_neon, export=1
        h264_chroma_mc4
        .endfunc

function ff_avg_h264_chroma_mc4_neon, export=1
        h264_chroma_mc4 avg=1
        .endfunc

/* H.264 loop filter */

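/*
 * Common prologue for the loop filter entry points below, with
 * r0 = pix, r1 = stride, r2 = alpha, r3 = beta and a pointer to the
 * four int8_t tc0 values on the stack.  Returns to the caller early
 * if alpha or beta is zero, or if all four tc0 values are negative
 * (nothing to filter); the and/ands pair folds the sign bits of the
 * four tc0 bytes into one testable flag.  d24[0] receives tc0.
 */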
        .macro h264_loop_filter_start
        ldr             ip, [sp]
        tst             r2, r2
        ldr             ip, [ip]
        tstne           r3, r3
        vmov.32         d24[0], ip
        and             ip, ip, ip, lsl #16
        bxeq            lr
        ands            ip, ip, ip, lsl #8
        bxlt            lr
        .endm

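/* Save the callee-saved NEON registers d8-d15 on a 16-byte-aligned
 * stack.  ip holds the realignment amount and must survive until the
 * matching align_pop_regs undoes it. */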
        .macro align_push_regs
        and             ip, sp, #15
        add             ip, ip, #32
        sub             sp, sp, ip
        vst1.64         {d12-d15}, [sp,:128]
        sub             sp, sp, #32
        vst1.64         {d8-d11},  [sp,:128]
        .endm

        .macro align_pop_regs
        vld1.64         {d8-d11},  [sp,:128]!
        vld1.64         {d12-d15}, [sp,:128], ip
        .endm

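/*
 * Filter 16 pixels along the edge, with p2,p1,p0 in q10,q9,q8 and
 * q0,q1,q2 in q0,q1,q2.  Per pixel this computes, in scalar terms:
 *   filter if |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta
 *             && tc0 >= 0
 *   tc    = tc0 + (|p2-p0| < beta) + (|q2-q0| < beta)
 *   delta = clip((((q0-p0) << 2) + (p1-q1) + 4) >> 3, -tc, tc)
 *   p0'   = clip_uint8(p0 + delta),  q0' = clip_uint8(q0 - delta)
 * and, where |p2-p0| (resp. |q2-q0|) is below beta, replaces p1 (q1)
 * by (p2 + avg(p0,q0)) >> 1 clamped to p1 +/- tc0 (q1 +/- tc0).
 * Results: p0'/q0' in q8/q0, p1'/q1' in q4/q5.
 */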
        .macro h264_loop_filter_luma
        vdup.8          q11, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
        vmovl.u16       q12, d24
        vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
        vsli.16         q12, q12, #8
        vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
        vsli.32         q12, q12, #16
        vclt.u8         q6,  q6,  q11   @ < alpha
        vdup.8          q11, r3         @ beta
        vclt.s8         q7,  q12, #0
        vclt.u8         q14, q14, q11   @ < beta
        vclt.u8         q15, q15, q11   @ < beta
        vbic            q6,  q6,  q7
        vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
        vand            q6,  q6,  q14
        vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
        vclt.u8         q4,  q4,  q11   @ < beta
        vand            q6,  q6,  q15
        vclt.u8         q5,  q5,  q11   @ < beta
        vand            q4,  q4,  q6
        vand            q5,  q5,  q6
        vand            q12, q12, q6
        vrhadd.u8       q14, q8,  q0
        vsub.i8         q6,  q12, q4
        vqadd.u8        q7,  q9,  q12
        vhadd.u8        q10, q10, q14
        vsub.i8         q6,  q6,  q5
        vhadd.u8        q14, q2,  q14
        vmin.u8         q7,  q7,  q10
        vqsub.u8        q11, q9,  q12
        vqadd.u8        q2,  q1,  q12
        vmax.u8         q7,  q7,  q11
        vqsub.u8        q11, q1,  q12
        vmin.u8         q14, q2,  q14
        vmovl.u8        q2,  d0
        vmax.u8         q14, q14, q11
        vmovl.u8        q10, d1
        vsubw.u8        q2,  q2,  d16
        vsubw.u8        q10, q10, d17
        vshl.i16        q2,  q2,  #2
        vshl.i16        q10, q10, #2
        vaddw.u8        q2,  q2,  d18
        vaddw.u8        q10, q10, d19
        vsubw.u8        q2,  q2,  d2
        vsubw.u8        q10, q10, d3
        vrshrn.i16      d4,  q2,  #3
        vrshrn.i16      d5,  q10, #3
        vbsl            q4,  q7,  q9
        vbsl            q5,  q14, q1
        vneg.s8         q7,  q6
        vmovl.u8        q14, d16
        vmin.s8         q2,  q2,  q6
        vmovl.u8        q6,  d17
        vmax.s8         q2,  q2,  q7
        vmovl.u8        q11, d0
        vmovl.u8        q12, d1
        vaddw.s8        q14, q14, d4
        vaddw.s8        q6,  q6,  d5
        vsubw.s8        q11, q11, d4
        vsubw.s8        q12, q12, d5
        vqmovun.s16     d16, q14
        vqmovun.s16     d17, q6
        vqmovun.s16     d0,  q11
        vqmovun.s16     d1,  q12
        .endm

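/* Horizontal edge: whole rows load straight into vectors.  The first
 * three loads fetch q0,q1,q2 below the edge, then r0 steps back six
 * rows to fetch p2,p1,p0; only the four modified rows p1,p0,q0,q1
 * are stored back. */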
function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        vld1.64         {d0, d1},  [r0,:128], r1
        vld1.64         {d2, d3},  [r0,:128], r1
        vld1.64         {d4, d5},  [r0,:128], r1
        sub             r0, r0, r1, lsl #2
        sub             r0, r0, r1, lsl #1
        vld1.64         {d20,d21}, [r0,:128], r1
        vld1.64         {d18,d19}, [r0,:128], r1
        vld1.64         {d16,d17}, [r0,:128], r1

        align_push_regs

        h264_loop_filter_luma

        sub             r0, r0, r1, lsl #1
        vst1.64         {d8, d9},  [r0,:128], r1
        vst1.64         {d16,d17}, [r0,:128], r1
        vst1.64         {d0, d1},  [r0,:128], r1
        vst1.64         {d10,d11}, [r0,:128]

        align_pop_regs
        bx              lr
        .endfunc

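/* Vertical edge: 16 rows of 8 bytes are loaded starting 4 bytes left
 * of the edge, transposed so that columns become vectors, filtered,
 * transposed back and stored.  The unfiltered outer columns (p2 in
 * q10, q2 in q2) are spilled to the stack around the filter call
 * because the macro clobbers those registers. */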
function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        sub             r0, r0, #4
        vld1.64         {d6},  [r0], r1
        vld1.64         {d20}, [r0], r1
        vld1.64         {d18}, [r0], r1
        vld1.64         {d16}, [r0], r1
        vld1.64         {d0},  [r0], r1
        vld1.64         {d2},  [r0], r1
        vld1.64         {d4},  [r0], r1
        vld1.64         {d26}, [r0], r1
        vld1.64         {d7},  [r0], r1
        vld1.64         {d21}, [r0], r1
        vld1.64         {d19}, [r0], r1
        vld1.64         {d17}, [r0], r1
        vld1.64         {d1},  [r0], r1
        vld1.64         {d3},  [r0], r1
        vld1.64         {d5},  [r0], r1
        vld1.64         {d27}, [r0], r1

        transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13

        align_push_regs
        sub             sp, sp, #16
        vst1.64         {d4, d5},  [sp,:128]
        sub             sp, sp, #16
        vst1.64         {d20,d21}, [sp,:128]

        h264_loop_filter_luma

        vld1.64         {d20,d21}, [sp,:128]!
        vld1.64         {d4, d5},  [sp,:128]!

        transpose_8x8   q3, q10, q4, q8, q0, q5, q2, q13

        sub             r0, r0, r1, lsl #4
        vst1.64         {d6},  [r0], r1
        vst1.64         {d20}, [r0], r1
        vst1.64         {d8},  [r0], r1
        vst1.64         {d16}, [r0], r1
        vst1.64         {d0},  [r0], r1
        vst1.64         {d10}, [r0], r1
        vst1.64         {d4},  [r0], r1
        vst1.64         {d26}, [r0], r1
        vst1.64         {d7},  [r0], r1
        vst1.64         {d21}, [r0], r1
        vst1.64         {d9},  [r0], r1
        vst1.64         {d17}, [r0], r1
        vst1.64         {d1},  [r0], r1
        vst1.64         {d11}, [r0], r1
        vst1.64         {d5},  [r0], r1
        vst1.64         {d27}, [r0], r1

        align_pop_regs
        bx              lr
        .endfunc

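/* Chroma filter: only p0/q0 are modified.  The delta is the same as
 * for luma, clipped to +/- the per-pixel threshold expanded from tc0,
 * under the same alpha/beta tests on p0, q0, p1 and q1. */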
        .macro h264_loop_filter_chroma
        vdup.8          d22, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         d26, d16, d0    @ abs(p0 - q0)
        vmovl.u8        q2,  d0
        vabd.u8         d28, d18, d16   @ abs(p1 - p0)
        vsubw.u8        q2,  q2,  d16
        vsli.16         d24, d24, #8
        vshl.i16        q2,  q2,  #2
        vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
        vaddw.u8        q2,  q2,  d18
        vclt.u8         d26, d26, d22   @ < alpha
        vsubw.u8        q2,  q2,  d2
        vdup.8          d22, r3         @ beta
        vclt.s8         d25, d24, #0
        vrshrn.i16      d4,  q2,  #3
        vclt.u8         d28, d28, d22   @ < beta
        vbic            d26, d26, d25
        vclt.u8         d30, d30, d22   @ < beta
        vand            d26, d26, d28
        vneg.s8         d25, d24
        vand            d26, d26, d30
        vmin.s8         d4,  d4,  d24
        vmovl.u8        q14, d16
        vand            d4,  d4,  d26
        vmax.s8         d4,  d4,  d25
        vmovl.u8        q11, d0
        vaddw.s8        q14, q14, d4
        vsubw.s8        q11, q11, d4
        vqmovun.s16     d16, q14
        vqmovun.s16     d0,  q11
        .endm

function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0, r0, r1, lsl #1
        vld1.64         {d18}, [r0,:64], r1
        vld1.64         {d16}, [r0,:64], r1
        vld1.64         {d0},  [r0,:64], r1
        vld1.64         {d2},  [r0,:64]

        h264_loop_filter_chroma

        sub             r0, r0, r1, lsl #1
        vst1.64         {d16}, [r0,:64], r1
        vst1.64         {d0},  [r0,:64], r1

        bx              lr
        .endfunc

function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0, r0, #2
        vld1.32         {d18[0]}, [r0], r1
        vld1.32         {d16[0]}, [r0], r1
        vld1.32         {d0[0]},  [r0], r1
        vld1.32         {d2[0]},  [r0], r1
        vld1.32         {d18[1]}, [r0], r1
        vld1.32         {d16[1]}, [r0], r1
        vld1.32         {d0[1]},  [r0], r1
        vld1.32         {d2[1]},  [r0], r1

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        h264_loop_filter_chroma

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        sub             r0, r0, r1, lsl #3
        vst1.32         {d18[0]}, [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]},  [r0], r1
        vst1.32         {d2[0]},  [r0], r1
        vst1.32         {d18[1]}, [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]},  [r0], r1
        vst1.32         {d2[1]},  [r0], r1

        bx              lr
        .endfunc

/* H.264 qpel MC */

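/*
 * The half-pel interpolation is the 6-tap FIR (1,-5,20,20,-5,1) from
 * the H.264 spec.  With the source pointer already moved two samples
 * back, one output pixel is, in scalar terms:
 *   t      = (src[0] + src[5]) - 5*(src[1] + src[4])
 *                              + 20*(src[2] + src[3])
 *   dst[0] = clip_uint8((t + 16) >> 5)
 * lowpass_const packs the two multipliers into d6 so that vmla/vmls
 * can use them as scalar operands: d6[0] = 5, d6[1] = 20.
 */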
        .macro lowpass_const r
        movw            \r, #5
        movt            \r, #20
        vmov.32         d6[0], \r
        .endm

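/* Apply the 6-tap filter horizontally to two 8-pixel rows; \r0-\r1
 * and \r2-\r3 each hold 16 source bytes.  With narrow=0 the 16-bit
 * intermediates are left in \d0/\d1 (q registers) for the second pass
 * of the 2-D filter instead of being narrowed with vqrshrun. */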
        .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
.if \narrow
t0      .req q0
t1      .req q8
.else
t0      .req \d0
t1      .req \d1
.endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vext.8          d18, \r2, \r3, #2
        vmla.i16        t0,  q1,  d6[1]
        vext.8          d19, \r2, \r3, #3
        vaddl.u8        q9,  d18, d19
        vext.8          d20, \r2, \r3, #1
        vmls.i16        t0,  q2,  d6[0]
        vext.8          d21, \r2, \r3, #4
        vaddl.u8        q10, d20, d21
        vext.8          d31, \r2, \r3, #5
        vaddl.u8        t1,  \r2, d31
        vmla.i16        t1,  q9,  d6[1]
        vmls.i16        t1,  q10, d6[0]
.if \narrow
        vqrshrun.s16    \d0, t0,  #5
        vqrshrun.s16    \d1, t1,  #5
.endif
        .unreq t0
        .unreq t1
        .endm

        .macro lowpass_8_1 r0, r1, d0, narrow=1
.if \narrow
t0      .req q0
.else
t0      .req \d0
.endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vmla.i16        t0,  q1,  d6[1]
        vmls.i16        t0,  q2,  d6[0]
.if \narrow
        vqrshrun.s16    \d0, t0,  #5
.endif
        .unreq t0
        .endm

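/* Second (vertical) pass of the 2-D filter, applied to the 16-bit
 * intermediates: the same taps widened to 32 bits, with the x20 and
 * x5 multiplies done as shift-adds (x16+x4 and x4+x1), then a single
 * rounding shift by 10 and narrowing back to bytes. */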
        .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
        vext.16         q1,  \r0, \r1, #2
        vext.16         q0,  \r0, \r1, #3
        vaddl.s16       q9,  d2,  d0
        vext.16         q2,  \r0, \r1, #1
        vaddl.s16       q1,  d3,  d1
        vext.16         q3,  \r0, \r1, #4
        vaddl.s16       q10, d4,  d6
        vext.16         \r1, \r0, \r1, #5
        vaddl.s16       q2,  d5,  d7
        vaddl.s16       q0,  \h0, \h1
        vaddl.s16       q8,  \l0, \l1

        vshl.i32        q3,  q9,  #4
        vshl.i32        q9,  q9,  #2
        vshl.i32        q15, q10, #2
        vadd.i32        q9,  q9,  q3
        vadd.i32        q10, q10, q15

        vshl.i32        q3,  q1,  #4
        vshl.i32        q1,  q1,  #2
        vshl.i32        q15, q2,  #2
        vadd.i32        q1,  q1,  q3
        vadd.i32        q2,  q2,  q15

        vadd.i32        q9,  q9,  q8
        vsub.i32        q9,  q9,  q10

        vadd.i32        q1,  q1,  q0
        vsub.i32        q1,  q1,  q2

        vrshrn.s32      d18, q9,  #10
        vrshrn.s32      d19, q1,  #10

        vqmovun.s16     \d,  q9
        .endm

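/*
 * Helper naming: put_h264_qpelN_{h,v,hv}_lowpass filters an N-wide
 * block in the given direction(s).  The _l2 variants round-average
 * the result with a second reference (vrhadd.u8); the _packed
 * variants write their 8-wide halves back to back into a scratch
 * buffer.  The 16-wide functions run the 8-wide ones over each half.
 */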
function put_h264_qpel16_h_lowpass_neon_packed
        mov             r4, lr
        mov             ip, #16
        mov             r3, #8
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r1, r1, r2, lsl #4
        add             r1, r1, #8
        mov             ip, #16
        mov             lr, r4
        b               put_h264_qpel8_h_lowpass_neon
        .endfunc

function put_h264_qpel16_h_lowpass_neon
        push            {lr}
        mov             ip, #16
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r0, r0, r3, lsl #4
        sub             r1, r1, r2, lsl #4
        add             r0, r0, #8
        add             r1, r1, #8
        mov             ip, #16
        pop             {lr}
        .endfunc

function put_h264_qpel8_h_lowpass_neon
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d16,d17}, [r1], r2
        subs            ip, ip, #2
        lowpass_8       d0, d1, d16, d17, d0, d16
        vst1.64         {d0},  [r0,:64], r3
        vst1.64         {d16}, [r0,:64], r3
        bne             1b
        bx              lr
        .endfunc

function put_h264_qpel16_h_lowpass_l2_neon
        push            {lr}
        mov             ip, #16
        bl              put_h264_qpel8_h_lowpass_l2_neon
        sub             r0, r0, r2, lsl #4
        sub             r1, r1, r2, lsl #4
        sub             r3, r3, r2, lsl #4
        add             r0, r0, #8
        add             r1, r1, #8
        add             r3, r3, #8
        mov             ip, #16
        pop             {lr}
        .endfunc

function put_h264_qpel8_h_lowpass_l2_neon
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d16,d17}, [r1], r2
        vld1.64         {d28},     [r3], r2
        vld1.64         {d29},     [r3], r2
        subs            ip, ip, #2
        lowpass_8       d0, d1, d16, d17, d0, d1
        vrhadd.u8       q0, q0, q14
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        bne             1b
        bx              lr
        .endfunc

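/* Vertical filtering loads 8+5 rows (a 6-tap filter needs 5 extra
 * samples), transposes them so each column becomes a register, reuses
 * the horizontal lowpass_8, and transposes back before storing. */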
function put_h264_qpel16_v_lowpass_neon_packed
        mov             r4, lr
        mov             r2, #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1, r1, r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1, r1, r3, lsl #4
        sub             r1, r1, r3, lsl #2
        add             r1, r1, #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1, r1, r3, lsl #2
        mov             lr, r4
        b               put_h264_qpel8_v_lowpass_neon
        .endfunc

function put_h264_qpel16_v_lowpass_neon
        mov             r4, lr
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1, r1, r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r0, r0, r2, lsl #4
        add             r0, r0, #8
        sub             r1, r1, r3, lsl #4
        sub             r1, r1, r3, lsl #2
        add             r1, r1, #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1, r1, r3, lsl #2
        mov             lr, r4
        .endfunc

function put_h264_qpel8_v_lowpass_neon
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        transpose_8x8   q4, q5, q6, q7, q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d10
        lowpass_8       d12, d13, d14, d15, d12, d14
        lowpass_8       d22, d23, d24, d25, d22, d24
        lowpass_8       d26, d27, d28, d29, d26, d28
        transpose_8x8   d8, d10, d12, d14, d22, d24, d26, d28

        vst1.64         {d8},  [r0,:64], r2
        vst1.64         {d10}, [r0,:64], r2
        vst1.64         {d12}, [r0,:64], r2
        vst1.64         {d14}, [r0,:64], r2
        vst1.64         {d22}, [r0,:64], r2
        vst1.64         {d24}, [r0,:64], r2
        vst1.64         {d26}, [r0,:64], r2
        vst1.64         {d28}, [r0,:64], r2

        bx              lr
        .endfunc

function put_h264_qpel16_v_lowpass_l2_neon
        mov             r4, lr
        bl              put_h264_qpel8_v_lowpass_l2_neon
        sub             r1, r1, r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_l2_neon
        sub             r0, r0, r3, lsl #4
        sub             ip, ip, r2, lsl #4
        add             r0, r0, #8
        add             ip, ip, #8
        sub             r1, r1, r3, lsl #4
        sub             r1, r1, r3, lsl #2
        add             r1, r1, #8
        bl              put_h264_qpel8_v_lowpass_l2_neon
        sub             r1, r1, r3, lsl #2
        mov             lr, r4
        .endfunc

function put_h264_qpel8_v_lowpass_l2_neon
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        transpose_8x8   q4, q5, q6, q7, q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d9
        lowpass_8       d12, d13, d14, d15, d12, d13
        lowpass_8       d22, d23, d24, d25, d22, d23
        lowpass_8       d26, d27, d28, d29, d26, d27
        transpose_8x8   d8, d9, d12, d13, d22, d23, d26, d27

        vld1.64         {d0},  [ip], r2
        vld1.64         {d1},  [ip], r2
        vld1.64         {d2},  [ip], r2
        vld1.64         {d3},  [ip], r2
        vld1.64         {d4},  [ip], r2
        vrhadd.u8       q0, q0, q4
        vld1.64         {d5},  [ip], r2
        vrhadd.u8       q1, q1, q6
        vld1.64         {d10}, [ip], r2
        vrhadd.u8       q2, q2, q11
        vld1.64         {d11}, [ip], r2

        vst1.64         {d0},  [r0,:64], r3
        vst1.64         {d1},  [r0,:64], r3
        vrhadd.u8       q5, q5, q13
        vst1.64         {d2},  [r0,:64], r3
        vst1.64         {d3},  [r0,:64], r3
        vst1.64         {d4},  [r0,:64], r3
        vst1.64         {d5},  [r0,:64], r3
        vst1.64         {d10}, [r0,:64], r3
        vst1.64         {d11}, [r0,:64], r3

        bx              lr
        .endfunc

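/* 2-D (hv) filtering: the _top helper first filters 8+5 rows
 * horizontally into 16-bit intermediates in a scratch buffer at r4,
 * then transposes them in 16-bit units (swap4 + transpose16_4x4) and
 * runs lowpass_8.16 down the columns, leaving the eight output rows
 * in d12-d15 and d8-d11. */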
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   ip
        mov             ip, #12
1:      vld1.64         {d0, d1},  [r1], r3
        vld1.64         {d16,d17}, [r1], r3
        subs            ip, ip, #2
        lowpass_8       d0, d1, d16, d17, q11, q12, narrow=0
        vst1.64         {d22-d25}, [r4,:128]!
        bne             1b

        vld1.64         {d0, d1}, [r1]
        lowpass_8_1     d0, d1, q12, narrow=0

        mov             ip, #-16
        add             r4, r4, ip
        vld1.64         {d30,d31}, [r4,:128], ip
        vld1.64         {d20,d21}, [r4,:128], ip
        vld1.64         {d18,d19}, [r4,:128], ip
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d14,d15}, [r4,:128], ip
        vld1.64         {d12,d13}, [r4,:128], ip
        vld1.64         {d10,d11}, [r4,:128], ip
        vld1.64         {d8, d9},  [r4,:128], ip
        vld1.64         {d6, d7},  [r4,:128], ip
        vld1.64         {d4, d5},  [r4,:128], ip
        vld1.64         {d2, d3},  [r4,:128], ip
        vld1.64         {d0, d1},  [r4,:128]

        swap4           d1, d3, d5, d7, d8, d10, d12, d14
        transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7

        swap4           d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11

        vst1.64         {d30,d31}, [r4,:128]!
        vst1.64         {d6, d7},  [r4,:128]!
        vst1.64         {d20,d21}, [r4,:128]!
        vst1.64         {d4, d5},  [r4,:128]!
        vst1.64         {d18,d19}, [r4,:128]!
        vst1.64         {d2, d3},  [r4,:128]!
        vst1.64         {d16,d17}, [r4,:128]!
        vst1.64         {d0, d1},  [r4,:128]

        lowpass_8.16    q4, q12, d8,  d9,  d24, d25, d8
        lowpass_8.16    q5, q13, d10, d11, d26, d27, d9
        lowpass_8.16    q6, q14, d12, d13, d28, d29, d10
        lowpass_8.16    q7, q11, d14, d15, d22, d23, d11

        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8, q15, d16, d17, d30, d31, d12
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8, q15, d16, d17, d30, d31, d13
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8, q15, d16, d17, d30, d31, d14
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128]
        lowpass_8.16    q8, q15, d16, d17, d30, d31, d15

        transpose_8x8   d12, d13, d14, d15, d8, d9, d10, d11

        bx              lr
        .endfunc

function put_h264_qpel8_hv_lowpass_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top
        vst1.64         {d12}, [r0,:64], r2
        vst1.64         {d13}, [r0,:64], r2
        vst1.64         {d14}, [r0,:64], r2
        vst1.64         {d15}, [r0,:64], r2
        vst1.64         {d8},  [r0,:64], r2
        vst1.64         {d9},  [r0,:64], r2
        vst1.64         {d10}, [r0,:64], r2
        vst1.64         {d11}, [r0,:64], r2

        mov             lr, r10
        bx              lr
        .endfunc

function put_h264_qpel8_hv_lowpass_l2_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top

        vld1.64         {d0, d1}, [r2,:128]!
        vld1.64         {d2, d3}, [r2,:128]!
        vrhadd.u8       q0, q0, q6
        vld1.64         {d4, d5}, [r2,:128]!
        vrhadd.u8       q1, q1, q7
        vld1.64         {d6, d7}, [r2,:128]!
        vrhadd.u8       q2, q2, q4

        vst1.64         {d0}, [r0,:64], r3
        vrhadd.u8       q3, q3, q5
        vst1.64         {d1}, [r0,:64], r3
        vst1.64         {d2}, [r0,:64], r3
        vst1.64         {d3}, [r0,:64], r3
        vst1.64         {d4}, [r0,:64], r3
        vst1.64         {d5}, [r0,:64], r3
        vst1.64         {d6}, [r0,:64], r3
        vst1.64         {d7}, [r0,:64], r3

        mov             lr, r10
        bx              lr
        .endfunc

function put_h264_qpel16_hv_lowpass_neon
        mov             r9, lr
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1, r1, r3, lsl #2
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1, r1, r3, lsl #4
        sub             r1, r1, r3, lsl #2
        add             r1, r1, #8
        sub             r0, r0, r2, lsl #4
        add             r0, r0, #8
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1, r1, r3, lsl #2
        mov             lr, r9
        b               put_h264_qpel8_hv_lowpass_neon
        .endfunc

function put_h264_qpel16_hv_lowpass_l2_neon
        mov             r9, lr
        sub             r2, r4, #256
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1, r1, r3, lsl #2
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1, r1, r3, lsl #4
        sub             r1, r1, r3, lsl #2
        add             r1, r1, #8
        sub             r0, r0, r3, lsl #4
        add             r0, r0, #8
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1, r1, r3, lsl #2
        mov             lr, r9
        b               put_h264_qpel8_hv_lowpass_l2_neon
        .endfunc

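/*
 * Public mcXY entry points: X and Y are the quarter-pel fractional
 * offsets of the motion vector.  Half-pel positions use the lowpass
 * filters directly (mc20, mc02, mc22); quarter-pel positions average
 * two neighbouring full- or half-pel predictions via the _l2 helpers,
 * e.g. mc10 averages the horizontal half-pel with the unfiltered
 * pixels and mc30 with the pixels one sample to the right.
 */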
function ff_put_h264_qpel8_mc10_neon, export=1
        lowpass_const   r3
        mov             r3, r1
        sub             r1, r1, #2
        mov             ip, #8
        b               put_h264_qpel8_h_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel8_mc20_neon, export=1
        lowpass_const   r3
        sub             r1, r1, #2
        mov             r3, r2
        mov             ip, #8
        b               put_h264_qpel8_h_lowpass_neon
        .endfunc

function ff_put_h264_qpel8_mc30_neon, export=1
        lowpass_const   r3
        add             r3, r1, #1
        sub             r1, r1, #2
        mov             ip, #8
        b               put_h264_qpel8_h_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel8_mc01_neon, export=1
        push            {lr}
        mov             ip, r1
put_h264_qpel8_mc01:
        lowpass_const   r3
        mov             r3, r2
        sub             r1, r1, r2, lsl #1
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {pc}
        .endfunc

function ff_put_h264_qpel8_mc11_neon, export=1
        push            {r0, r1, r2, lr}
put_h264_qpel8_mc11:
        lowpass_const   r3
        sub             sp, sp, #64
        mov             r0, sp
        sub             r1, r1, #2
        mov             r3, #8
        mov             ip, #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        ldrd            r0, [sp, #128]
        mov             r3, r2
        add             ip, sp, #64
        sub             r1, r1, r2, lsl #1
        mov             r2, #8
        bl              put_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp, sp, #76
        pop             {pc}
        .endfunc

function ff_put_h264_qpel8_mc21_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc21:
        lowpass_const   r3
        mov             r11, sp
        bic             sp, sp, #15
        sub             sp, sp, #(8*8+16*12)
        sub             r1, r1, #2
        mov             r3, #8
        mov             r0, sp
        mov             ip, #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        mov             r4, r0
        ldrd            r0, [r11]
        sub             r1, r1, r2, lsl #1
        sub             r1, r1, #2
        mov             r3, r2
        sub             r2, r4, #64
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp, r11, #8
        pop             {r4, r10, r11, pc}
        .endfunc

function ff_put_h264_qpel8_mc31_neon, export=1
        add             r1, r1, #1
        push            {r0, r1, r2, lr}
        sub             r1, r1, #1
        b               put_h264_qpel8_mc11
        .endfunc

function ff_put_h264_qpel8_mc02_neon, export=1
        push            {lr}
        lowpass_const   r3
        sub             r1, r1, r2, lsl #1
        mov             r3, r2
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {pc}
        .endfunc

function ff_put_h264_qpel8_mc12_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc12:
        lowpass_const   r3
        mov             r11, sp
        bic             sp, sp, #15
        sub             sp, sp, #(8*8+16*12)
        sub             r1, r1, r2, lsl #1
        mov             r3, r2
        mov             r2, #8
        mov             r0, sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        mov             r4, r0
        ldrd            r0, [r11]
        sub             r1, r1, r3, lsl #1
        sub             r1, r1, #2
        sub             r2, r4, #64
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp, r11, #8
        pop             {r4, r10, r11, pc}
        .endfunc

function ff_put_h264_qpel8_mc22_neon, export=1
        push            {r4, r10, r11, lr}
        mov             r11, sp
        bic             sp, sp, #15
        sub             r1, r1, r2, lsl #1
        sub             r1, r1, #2
        mov             r3, r2
        sub             sp, sp, #(16*12)
        mov             r4, sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp, r11
        pop             {r4, r10, r11, pc}
        .endfunc

function ff_put_h264_qpel8_mc32_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1, r1, #1
        b               put_h264_qpel8_mc12
        .endfunc

function ff_put_h264_qpel8_mc03_neon, export=1
        push            {lr}
        add             ip, r1, r2
        b               put_h264_qpel8_mc01
        .endfunc

function ff_put_h264_qpel8_mc13_neon, export=1
        push            {r0, r1, r2, lr}
        add             r1, r1, r2
        b               put_h264_qpel8_mc11
        .endfunc

function ff_put_h264_qpel8_mc23_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1, r1, r2
        b               put_h264_qpel8_mc21
        .endfunc

function ff_put_h264_qpel8_mc33_neon, export=1
        add             r1, r1, #1
        push            {r0, r1, r2, lr}
        add             r1, r1, r2
        sub             r1, r1, #1
        b               put_h264_qpel8_mc11
        .endfunc

function ff_put_h264_qpel16_mc10_neon, export=1
        lowpass_const   r3
        mov             r3, r1
        sub             r1, r1, #2
        b               put_h264_qpel16_h_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel16_mc20_neon, export=1
        lowpass_const   r3
        sub             r1, r1, #2
        mov             r3, r2
        b               put_h264_qpel16_h_lowpass_neon
        .endfunc

function ff_put_h264_qpel16_mc30_neon, export=1
        lowpass_const   r3
        add             r3, r1, #1
        sub             r1, r1, #2
        b               put_h264_qpel16_h_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel16_mc01_neon, export=1
        push            {r4, lr}
        mov             ip, r1
put_h264_qpel16_mc01:
        lowpass_const   r3
        mov             r3, r2
        sub             r1, r1, r2, lsl #1
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {r4, pc}
        .endfunc

function ff_put_h264_qpel16_mc11_neon, export=1
        push            {r0, r1, r4, lr}
put_h264_qpel16_mc11:
        lowpass_const   r3
        sub             sp, sp, #256
        mov             r0, sp
        sub             r1, r1, #2
        mov             r3, #16
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon
        add             r0, sp, #256
        ldrd            r0, [r0, #64]
        mov             r3, r2
        add             ip, sp, #64
        sub             r1, r1, r2, lsl #1
        mov             r2, #16
        bl              put_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp, sp, #(256+8)
        pop             {r4, pc}
        .endfunc

function ff_put_h264_qpel16_mc21_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc21:
        lowpass_const   r3
        mov             r11, sp
        bic             sp, sp, #15
        sub             sp, sp, #(16*16+16*12)
        sub             r1, r1, #2
        mov             r0, sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             r4, r0
        ldrd            r0, [r11]
        sub             r1, r1, r2, lsl #1
        sub             r1, r1, #2
        mov             r3, r2
        bl              put_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp, r11, #8
        pop             {r4-r5, r9-r11, pc}
        .endfunc

function ff_put_h264_qpel16_mc31_neon, export=1
        add             r1, r1, #1
        push            {r0, r1, r4, lr}
        sub             r1, r1, #1
        b               put_h264_qpel16_mc11
        .endfunc

function ff_put_h264_qpel16_mc02_neon, export=1
        push            {r4, lr}
        lowpass_const   r3
        sub             r1, r1, r2, lsl #1
        mov             r3, r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
        .endfunc

function ff_put_h264_qpel16_mc12_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc12:
        lowpass_const   r3
        mov             r11, sp
        bic             sp, sp, #15
        sub             sp, sp, #(16*16+16*12)
        sub             r1, r1, r2, lsl #1
        mov             r0, sp
        mov             r3, r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             r4, r0
        ldrd            r0, [r11]
        sub             r1, r1, r3, lsl #1
        sub             r1, r1, #2
        mov             r2, r3
        bl              put_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp, r11, #8
        pop             {r4-r5, r9-r11, pc}
        .endfunc

function ff_put_h264_qpel16_mc22_neon, export=1
        push            {r4, r9-r11, lr}
        lowpass_const   r3
        mov             r11, sp
        bic             sp, sp, #15
        sub             r1, r1, r2, lsl #1
        sub             r1, r1, #2
        mov             r3, r2
        sub             sp, sp, #(16*12)
        mov             r4, sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp, r11
        pop             {r4, r9-r11, pc}
        .endfunc

function ff_put_h264_qpel16_mc32_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1, r1, #1
        b               put_h264_qpel16_mc12
        .endfunc

function ff_put_h264_qpel16_mc03_neon, export=1
        push            {r4, lr}
        add             ip, r1, r2
        b               put_h264_qpel16_mc01
        .endfunc

function ff_put_h264_qpel16_mc13_neon, export=1
        push            {r0, r1, r4, lr}
        add             r1, r1, r2
        b               put_h264_qpel16_mc11
        .endfunc

function ff_put_h264_qpel16_mc23_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1, r1, r2
        b               put_h264_qpel16_mc21
        .endfunc

function ff_put_h264_qpel16_mc33_neon, export=1
        add             r1, r1, #1
        push            {r0, r1, r4, lr}
        add             r1, r1, r2
        sub             r1, r1, #1
        b               put_h264_qpel16_mc11
        .endfunc