Mercurial > libavcodec.hg
annotate arm/h264dsp_neon.S @ 12483:0159a19bfff7 libavcodec
aacdec: Rework channel mapping compatibility hacks.
For a PCE based configuration map the channels solely based on tags.
For an indexed configuration map the channels solely based on position.
This works with all known exotic samples including al17, elem_id0, bad_concat,
and lfe_is_sce.
author | alexc |
---|---|
date | Fri, 10 Sep 2010 18:01:48 +0000 |
parents | 69bbfd8f2ba5 |
children |
rev | line source |
---|---|
8336 | 1 /* |
2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> | |
3 * | |
4 * This file is part of FFmpeg. | |
5 * | |
6 * FFmpeg is free software; you can redistribute it and/or | |
7 * modify it under the terms of the GNU Lesser General Public | |
8 * License as published by the Free Software Foundation; either | |
9 * version 2.1 of the License, or (at your option) any later version. | |
10 * | |
11 * FFmpeg is distributed in the hope that it will be useful, | |
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 * Lesser General Public License for more details. | |
15 * | |
16 * You should have received a copy of the GNU Lesser General Public | |
17 * License along with FFmpeg; if not, write to the Free Software | |
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
19 */ | |
20 | |
21 #include "asm.S" | |
22 | |
8338 | 23 .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7 |
24 vtrn.32 \r0, \r4 | |
25 vtrn.32 \r1, \r5 | |
26 vtrn.32 \r2, \r6 | |
27 vtrn.32 \r3, \r7 | |
28 vtrn.16 \r0, \r2 | |
29 vtrn.16 \r1, \r3 | |
30 vtrn.16 \r4, \r6 | |
31 vtrn.16 \r5, \r7 | |
32 vtrn.8 \r0, \r1 | |
33 vtrn.8 \r2, \r3 | |
34 vtrn.8 \r4, \r5 | |
35 vtrn.8 \r6, \r7 | |
36 .endm | |
37 | |
9864 | 38 .macro transpose_4x4 r0 r1 r2 r3 |
39 vtrn.16 \r0, \r2 | |
40 vtrn.16 \r1, \r3 | |
41 vtrn.8 \r0, \r1 | |
42 vtrn.8 \r2, \r3 | |
43 .endm | |
44 | |
8338 | 45 .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7 |
46 vswp \r0, \r4 | |
47 vswp \r1, \r5 | |
48 vswp \r2, \r6 | |
49 vswp \r3, \r7 | |
50 .endm | |
51 | |
52 .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7 | |
53 vtrn.32 \r0, \r2 | |
54 vtrn.32 \r1, \r3 | |
55 vtrn.32 \r4, \r6 | |
56 vtrn.32 \r5, \r7 | |
57 vtrn.16 \r0, \r1 | |
58 vtrn.16 \r2, \r3 | |
59 vtrn.16 \r4, \r5 | |
60 vtrn.16 \r6, \r7 | |
61 .endm | |
62 | |
8336 | 63 /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
64 .macro h264_chroma_mc8 type |
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
65 function ff_\type\()_h264_chroma_mc8_neon, export=1 |
8336 | 66 push {r4-r7, lr} |
67 ldrd r4, [sp, #20] | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
68 .ifc \type,avg |
8336 | 69 mov lr, r0 |
70 .endif | |
71 pld [r1] | |
72 pld [r1, r2] | |
73 | |
74 muls r7, r4, r5 | |
75 rsb r6, r7, r5, lsl #3 | |
76 rsb ip, r7, r4, lsl #3 | |
77 sub r4, r7, r4, lsl #3 | |
78 sub r4, r4, r5, lsl #3 | |
79 add r4, r4, #64 | |
80 | |
81 beq 2f | |
82 | |
83 add r5, r1, r2 | |
84 | |
85 vdup.8 d0, r4 | |
86 lsl r4, r2, #1 | |
87 vdup.8 d1, ip | |
88 vld1.64 {d4, d5}, [r1], r4 | |
89 vdup.8 d2, r6 | |
90 vld1.64 {d6, d7}, [r5], r4 | |
91 vdup.8 d3, r7 | |
92 | |
93 vext.8 d5, d4, d5, #1 | |
94 vext.8 d7, d6, d7, #1 | |
95 | |
96 1: pld [r5] | |
97 vmull.u8 q8, d4, d0 | |
98 vmlal.u8 q8, d5, d1 | |
99 vld1.64 {d4, d5}, [r1], r4 | |
100 vmlal.u8 q8, d6, d2 | |
101 vext.8 d5, d4, d5, #1 | |
102 vmlal.u8 q8, d7, d3 | |
103 vmull.u8 q9, d6, d0 | |
104 subs r3, r3, #2 | |
105 vmlal.u8 q9, d7, d1 | |
106 vmlal.u8 q9, d4, d2 | |
107 vmlal.u8 q9, d5, d3 | |
108 vrshrn.u16 d16, q8, #6 | |
109 vld1.64 {d6, d7}, [r5], r4 | |
110 pld [r1] | |
111 vrshrn.u16 d17, q9, #6 | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
112 .ifc \type,avg |
8336 | 113 vld1.64 {d20}, [lr,:64], r2 |
114 vld1.64 {d21}, [lr,:64], r2 | |
115 vrhadd.u8 q8, q8, q10 | |
116 .endif | |
117 vext.8 d7, d6, d7, #1 | |
118 vst1.64 {d16}, [r0,:64], r2 | |
119 vst1.64 {d17}, [r0,:64], r2 | |
120 bgt 1b | |
121 | |
122 pop {r4-r7, pc} | |
123 | |
124 2: tst r6, r6 | |
125 add ip, ip, r6 | |
126 vdup.8 d0, r4 | |
127 vdup.8 d1, ip | |
128 | |
129 beq 4f | |
130 | |
131 add r5, r1, r2 | |
132 lsl r4, r2, #1 | |
133 vld1.64 {d4}, [r1], r4 | |
134 vld1.64 {d6}, [r5], r4 | |
135 | |
136 3: pld [r5] | |
137 vmull.u8 q8, d4, d0 | |
138 vmlal.u8 q8, d6, d1 | |
139 vld1.64 {d4}, [r1], r4 | |
140 vmull.u8 q9, d6, d0 | |
141 vmlal.u8 q9, d4, d1 | |
142 vld1.64 {d6}, [r5], r4 | |
143 vrshrn.u16 d16, q8, #6 | |
144 vrshrn.u16 d17, q9, #6 | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
145 .ifc \type,avg |
8336 | 146 vld1.64 {d20}, [lr,:64], r2 |
147 vld1.64 {d21}, [lr,:64], r2 | |
148 vrhadd.u8 q8, q8, q10 | |
149 .endif | |
150 subs r3, r3, #2 | |
151 pld [r1] | |
152 vst1.64 {d16}, [r0,:64], r2 | |
153 vst1.64 {d17}, [r0,:64], r2 | |
154 bgt 3b | |
155 | |
156 pop {r4-r7, pc} | |
157 | |
158 4: vld1.64 {d4, d5}, [r1], r2 | |
159 vld1.64 {d6, d7}, [r1], r2 | |
160 vext.8 d5, d4, d5, #1 | |
161 vext.8 d7, d6, d7, #1 | |
162 | |
163 5: pld [r1] | |
164 subs r3, r3, #2 | |
165 vmull.u8 q8, d4, d0 | |
166 vmlal.u8 q8, d5, d1 | |
167 vld1.64 {d4, d5}, [r1], r2 | |
168 vmull.u8 q9, d6, d0 | |
169 vmlal.u8 q9, d7, d1 | |
170 pld [r1] | |
171 vext.8 d5, d4, d5, #1 | |
172 vrshrn.u16 d16, q8, #6 | |
173 vrshrn.u16 d17, q9, #6 | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
174 .ifc \type,avg |
8336 | 175 vld1.64 {d20}, [lr,:64], r2 |
176 vld1.64 {d21}, [lr,:64], r2 | |
177 vrhadd.u8 q8, q8, q10 | |
178 .endif | |
179 vld1.64 {d6, d7}, [r1], r2 | |
180 vext.8 d7, d6, d7, #1 | |
181 vst1.64 {d16}, [r0,:64], r2 | |
182 vst1.64 {d17}, [r0,:64], r2 | |
183 bgt 5b | |
184 | |
185 pop {r4-r7, pc} | |
11443 | 186 endfunc |
8336 | 187 .endm |
188 | |
189 /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
190 .macro h264_chroma_mc4 type |
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
191 function ff_\type\()_h264_chroma_mc4_neon, export=1 |
8336 | 192 push {r4-r7, lr} |
193 ldrd r4, [sp, #20] | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
194 .ifc \type,avg |
8336 | 195 mov lr, r0 |
196 .endif | |
197 pld [r1] | |
198 pld [r1, r2] | |
199 | |
200 muls r7, r4, r5 | |
201 rsb r6, r7, r5, lsl #3 | |
202 rsb ip, r7, r4, lsl #3 | |
203 sub r4, r7, r4, lsl #3 | |
204 sub r4, r4, r5, lsl #3 | |
205 add r4, r4, #64 | |
206 | |
207 beq 2f | |
208 | |
209 add r5, r1, r2 | |
210 | |
211 vdup.8 d0, r4 | |
212 lsl r4, r2, #1 | |
213 vdup.8 d1, ip | |
214 vld1.64 {d4}, [r1], r4 | |
215 vdup.8 d2, r6 | |
216 vld1.64 {d6}, [r5], r4 | |
217 vdup.8 d3, r7 | |
218 | |
219 vext.8 d5, d4, d5, #1 | |
220 vext.8 d7, d6, d7, #1 | |
221 vtrn.32 d4, d5 | |
222 vtrn.32 d6, d7 | |
223 | |
224 vtrn.32 d0, d1 | |
225 vtrn.32 d2, d3 | |
226 | |
227 1: pld [r5] | |
228 vmull.u8 q8, d4, d0 | |
229 vmlal.u8 q8, d6, d2 | |
230 vld1.64 {d4}, [r1], r4 | |
231 vext.8 d5, d4, d5, #1 | |
232 vtrn.32 d4, d5 | |
233 vmull.u8 q9, d6, d0 | |
234 vmlal.u8 q9, d4, d2 | |
235 vld1.64 {d6}, [r5], r4 | |
236 vadd.i16 d16, d16, d17 | |
237 vadd.i16 d17, d18, d19 | |
238 vrshrn.u16 d16, q8, #6 | |
239 subs r3, r3, #2 | |
240 pld [r1] | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
241 .ifc \type,avg |
8336 | 242 vld1.32 {d20[0]}, [lr,:32], r2 |
243 vld1.32 {d20[1]}, [lr,:32], r2 | |
244 vrhadd.u8 d16, d16, d20 | |
245 .endif | |
246 vext.8 d7, d6, d7, #1 | |
247 vtrn.32 d6, d7 | |
248 vst1.32 {d16[0]}, [r0,:32], r2 | |
249 vst1.32 {d16[1]}, [r0,:32], r2 | |
250 bgt 1b | |
251 | |
252 pop {r4-r7, pc} | |
253 | |
254 2: tst r6, r6 | |
255 add ip, ip, r6 | |
256 vdup.8 d0, r4 | |
257 vdup.8 d1, ip | |
258 vtrn.32 d0, d1 | |
259 | |
260 beq 4f | |
261 | |
262 vext.32 d1, d0, d1, #1 | |
263 add r5, r1, r2 | |
264 lsl r4, r2, #1 | |
265 vld1.32 {d4[0]}, [r1], r4 | |
266 vld1.32 {d4[1]}, [r5], r4 | |
267 | |
268 3: pld [r5] | |
269 vmull.u8 q8, d4, d0 | |
270 vld1.32 {d4[0]}, [r1], r4 | |
271 vmull.u8 q9, d4, d1 | |
272 vld1.32 {d4[1]}, [r5], r4 | |
273 vadd.i16 d16, d16, d17 | |
274 vadd.i16 d17, d18, d19 | |
275 vrshrn.u16 d16, q8, #6 | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
276 .ifc \type,avg |
8336 | 277 vld1.32 {d20[0]}, [lr,:32], r2 |
278 vld1.32 {d20[1]}, [lr,:32], r2 | |
279 vrhadd.u8 d16, d16, d20 | |
280 .endif | |
281 subs r3, r3, #2 | |
282 pld [r1] | |
283 vst1.32 {d16[0]}, [r0,:32], r2 | |
284 vst1.32 {d16[1]}, [r0,:32], r2 | |
285 bgt 3b | |
286 | |
287 pop {r4-r7, pc} | |
288 | |
289 4: vld1.64 {d4}, [r1], r2 | |
290 vld1.64 {d6}, [r1], r2 | |
291 vext.8 d5, d4, d5, #1 | |
292 vext.8 d7, d6, d7, #1 | |
293 vtrn.32 d4, d5 | |
294 vtrn.32 d6, d7 | |
295 | |
296 5: vmull.u8 q8, d4, d0 | |
297 vmull.u8 q9, d6, d0 | |
298 subs r3, r3, #2 | |
299 vld1.64 {d4}, [r1], r2 | |
300 vext.8 d5, d4, d5, #1 | |
301 vtrn.32 d4, d5 | |
302 vadd.i16 d16, d16, d17 | |
303 vadd.i16 d17, d18, d19 | |
304 pld [r1] | |
305 vrshrn.u16 d16, q8, #6 | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
306 .ifc \type,avg |
8336 | 307 vld1.32 {d20[0]}, [lr,:32], r2 |
308 vld1.32 {d20[1]}, [lr,:32], r2 | |
309 vrhadd.u8 d16, d16, d20 | |
310 .endif | |
311 vld1.64 {d6}, [r1], r2 | |
312 vext.8 d7, d6, d7, #1 | |
313 vtrn.32 d6, d7 | |
314 pld [r1] | |
315 vst1.32 {d16[0]}, [r0,:32], r2 | |
316 vst1.32 {d16[1]}, [r0,:32], r2 | |
317 bgt 5b | |
318 | |
319 pop {r4-r7, pc} | |
11443 | 320 endfunc |
8336 | 321 .endm |
322 | |
10617 | 323 .macro h264_chroma_mc2 type |
324 function ff_\type\()_h264_chroma_mc2_neon, export=1 | |
325 push {r4-r6, lr} | |
326 ldr r4, [sp, #16] | |
327 ldr lr, [sp, #20] | |
328 pld [r1] | |
329 pld [r1, r2] | |
330 orrs r5, r4, lr | |
331 beq 2f | |
332 | |
333 mul r5, r4, lr | |
334 rsb r6, r5, lr, lsl #3 | |
335 rsb r12, r5, r4, lsl #3 | |
336 sub r4, r5, r4, lsl #3 | |
337 sub r4, r4, lr, lsl #3 | |
338 add r4, r4, #64 | |
339 vdup.8 d0, r4 | |
340 vdup.8 d2, r12 | |
341 vdup.8 d1, r6 | |
342 vdup.8 d3, r5 | |
343 vtrn.16 q0, q1 | |
344 1: | |
345 vld1.32 {d4[0]}, [r1], r2 | |
346 vld1.32 {d4[1]}, [r1], r2 | |
347 vrev64.32 d5, d4 | |
348 vld1.32 {d5[1]}, [r1] | |
349 vext.8 q3, q2, q2, #1 | |
350 vtrn.16 q2, q3 | |
351 vmull.u8 q8, d4, d0 | |
352 vmlal.u8 q8, d5, d1 | |
353 .ifc \type,avg | |
354 vld1.16 {d18[0]}, [r0,:16], r2 | |
355 vld1.16 {d18[1]}, [r0,:16] | |
356 sub r0, r0, r2 | |
357 .endif | |
358 vtrn.32 d16, d17 | |
359 vadd.i16 d16, d16, d17 | |
360 vrshrn.u16 d16, q8, #6 | |
361 .ifc \type,avg | |
362 vrhadd.u8 d16, d16, d18 | |
363 .endif | |
364 vst1.16 {d16[0]}, [r0,:16], r2 | |
365 vst1.16 {d16[1]}, [r0,:16], r2 | |
366 subs r3, r3, #2 | |
367 bgt 1b | |
368 pop {r4-r6, pc} | |
369 2: | |
370 .ifc \type,put | |
371 ldrh r5, [r1], r2 | |
372 strh r5, [r0], r2 | |
373 ldrh r6, [r1], r2 | |
374 strh r6, [r0], r2 | |
375 .else | |
376 vld1.16 {d16[0]}, [r1], r2 | |
377 vld1.16 {d16[1]}, [r1], r2 | |
378 vld1.16 {d18[0]}, [r0,:16], r2 | |
379 vld1.16 {d18[1]}, [r0,:16] | |
380 sub r0, r0, r2 | |
381 vrhadd.u8 d16, d16, d18 | |
382 vst1.16 {d16[0]}, [r0,:16], r2 | |
383 vst1.16 {d16[1]}, [r0,:16], r2 | |
384 .endif | |
385 subs r3, r3, #2 | |
386 bgt 2b | |
387 pop {r4-r6, pc} | |
11443 | 388 endfunc |
10617 | 389 .endm |
390 | |
8336 | 391 .text |
392 .align | |
393 | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
394 h264_chroma_mc8 put |
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
395 h264_chroma_mc8 avg |
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
396 h264_chroma_mc4 put |
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
397 h264_chroma_mc4 avg |
10617 | 398 h264_chroma_mc2 put |
399 h264_chroma_mc2 avg | |
8337 | 400 |
401 /* H.264 loop filter */ | |
402 | |
403 .macro h264_loop_filter_start | |
404 ldr ip, [sp] | |
405 tst r2, r2 | |
406 ldr ip, [ip] | |
407 tstne r3, r3 | |
408 vmov.32 d24[0], ip | |
409 and ip, ip, ip, lsl #16 | |
410 bxeq lr | |
411 ands ip, ip, ip, lsl #8 | |
412 bxlt lr | |
413 .endm | |
414 | |
415 .macro align_push_regs | |
416 and ip, sp, #15 | |
417 add ip, ip, #32 | |
418 sub sp, sp, ip | |
419 vst1.64 {d12-d15}, [sp,:128] | |
420 sub sp, sp, #32 | |
421 vst1.64 {d8-d11}, [sp,:128] | |
422 .endm | |
423 | |
424 .macro align_pop_regs | |
425 vld1.64 {d8-d11}, [sp,:128]! | |
426 vld1.64 {d12-d15}, [sp,:128], ip | |
427 .endm | |
428 | |
429 .macro h264_loop_filter_luma | |
430 vdup.8 q11, r2 @ alpha | |
431 vmovl.u8 q12, d24 | |
432 vabd.u8 q6, q8, q0 @ abs(p0 - q0) | |
433 vmovl.u16 q12, d24 | |
434 vabd.u8 q14, q9, q8 @ abs(p1 - p0) | |
435 vsli.16 q12, q12, #8 | |
436 vabd.u8 q15, q1, q0 @ abs(q1 - q0) | |
437 vsli.32 q12, q12, #16 | |
438 vclt.u8 q6, q6, q11 @ < alpha | |
439 vdup.8 q11, r3 @ beta | |
440 vclt.s8 q7, q12, #0 | |
441 vclt.u8 q14, q14, q11 @ < beta | |
442 vclt.u8 q15, q15, q11 @ < beta | |
443 vbic q6, q6, q7 | |
444 vabd.u8 q4, q10, q8 @ abs(p2 - p0) | |
445 vand q6, q6, q14 | |
446 vabd.u8 q5, q2, q0 @ abs(q2 - q0) | |
447 vclt.u8 q4, q4, q11 @ < beta | |
448 vand q6, q6, q15 | |
449 vclt.u8 q5, q5, q11 @ < beta | |
450 vand q4, q4, q6 | |
451 vand q5, q5, q6 | |
452 vand q12, q12, q6 | |
453 vrhadd.u8 q14, q8, q0 | |
454 vsub.i8 q6, q12, q4 | |
455 vqadd.u8 q7, q9, q12 | |
456 vhadd.u8 q10, q10, q14 | |
457 vsub.i8 q6, q6, q5 | |
458 vhadd.u8 q14, q2, q14 | |
459 vmin.u8 q7, q7, q10 | |
460 vqsub.u8 q11, q9, q12 | |
461 vqadd.u8 q2, q1, q12 | |
462 vmax.u8 q7, q7, q11 | |
463 vqsub.u8 q11, q1, q12 | |
464 vmin.u8 q14, q2, q14 | |
465 vmovl.u8 q2, d0 | |
466 vmax.u8 q14, q14, q11 | |
467 vmovl.u8 q10, d1 | |
468 vsubw.u8 q2, q2, d16 | |
469 vsubw.u8 q10, q10, d17 | |
470 vshl.i16 q2, q2, #2 | |
471 vshl.i16 q10, q10, #2 | |
472 vaddw.u8 q2, q2, d18 | |
473 vaddw.u8 q10, q10, d19 | |
474 vsubw.u8 q2, q2, d2 | |
475 vsubw.u8 q10, q10, d3 | |
476 vrshrn.i16 d4, q2, #3 | |
477 vrshrn.i16 d5, q10, #3 | |
478 vbsl q4, q7, q9 | |
479 vbsl q5, q14, q1 | |
480 vneg.s8 q7, q6 | |
481 vmovl.u8 q14, d16 | |
482 vmin.s8 q2, q2, q6 | |
483 vmovl.u8 q6, d17 | |
484 vmax.s8 q2, q2, q7 | |
485 vmovl.u8 q11, d0 | |
486 vmovl.u8 q12, d1 | |
487 vaddw.s8 q14, q14, d4 | |
488 vaddw.s8 q6, q6, d5 | |
489 vsubw.s8 q11, q11, d4 | |
490 vsubw.s8 q12, q12, d5 | |
491 vqmovun.s16 d16, q14 | |
492 vqmovun.s16 d17, q6 | |
493 vqmovun.s16 d0, q11 | |
494 vqmovun.s16 d1, q12 | |
495 .endm | |
496 | |
497 function ff_h264_v_loop_filter_luma_neon, export=1 | |
498 h264_loop_filter_start | |
499 | |
500 vld1.64 {d0, d1}, [r0,:128], r1 | |
501 vld1.64 {d2, d3}, [r0,:128], r1 | |
502 vld1.64 {d4, d5}, [r0,:128], r1 | |
503 sub r0, r0, r1, lsl #2 | |
504 sub r0, r0, r1, lsl #1 | |
505 vld1.64 {d20,d21}, [r0,:128], r1 | |
506 vld1.64 {d18,d19}, [r0,:128], r1 | |
507 vld1.64 {d16,d17}, [r0,:128], r1 | |
508 | |
509 align_push_regs | |
510 | |
511 h264_loop_filter_luma | |
512 | |
513 sub r0, r0, r1, lsl #1 | |
514 vst1.64 {d8, d9}, [r0,:128], r1 | |
515 vst1.64 {d16,d17}, [r0,:128], r1 | |
516 vst1.64 {d0, d1}, [r0,:128], r1 | |
517 vst1.64 {d10,d11}, [r0,:128] | |
518 | |
519 align_pop_regs | |
520 bx lr | |
11443 | 521 endfunc |
8337 | 522 |
523 function ff_h264_h_loop_filter_luma_neon, export=1 | |
524 h264_loop_filter_start | |
525 | |
526 sub r0, r0, #4 | |
527 vld1.64 {d6}, [r0], r1 | |
528 vld1.64 {d20}, [r0], r1 | |
529 vld1.64 {d18}, [r0], r1 | |
530 vld1.64 {d16}, [r0], r1 | |
531 vld1.64 {d0}, [r0], r1 | |
532 vld1.64 {d2}, [r0], r1 | |
533 vld1.64 {d4}, [r0], r1 | |
534 vld1.64 {d26}, [r0], r1 | |
535 vld1.64 {d7}, [r0], r1 | |
536 vld1.64 {d21}, [r0], r1 | |
537 vld1.64 {d19}, [r0], r1 | |
538 vld1.64 {d17}, [r0], r1 | |
539 vld1.64 {d1}, [r0], r1 | |
540 vld1.64 {d3}, [r0], r1 | |
541 vld1.64 {d5}, [r0], r1 | |
542 vld1.64 {d27}, [r0], r1 | |
543 | |
8338 | 544 transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13 |
8337 | 545 |
546 align_push_regs | |
547 | |
548 h264_loop_filter_luma | |
549 | |
9864 | 550 transpose_4x4 q4, q8, q0, q5 |
8337 | 551 |
552 sub r0, r0, r1, lsl #4 | |
9864 | 553 add r0, r0, #2 |
554 vst1.32 {d8[0]}, [r0], r1 | |
555 vst1.32 {d16[0]}, [r0], r1 | |
556 vst1.32 {d0[0]}, [r0], r1 | |
557 vst1.32 {d10[0]}, [r0], r1 | |
558 vst1.32 {d8[1]}, [r0], r1 | |
559 vst1.32 {d16[1]}, [r0], r1 | |
560 vst1.32 {d0[1]}, [r0], r1 | |
561 vst1.32 {d10[1]}, [r0], r1 | |
562 vst1.32 {d9[0]}, [r0], r1 | |
563 vst1.32 {d17[0]}, [r0], r1 | |
564 vst1.32 {d1[0]}, [r0], r1 | |
565 vst1.32 {d11[0]}, [r0], r1 | |
566 vst1.32 {d9[1]}, [r0], r1 | |
567 vst1.32 {d17[1]}, [r0], r1 | |
568 vst1.32 {d1[1]}, [r0], r1 | |
569 vst1.32 {d11[1]}, [r0], r1 | |
8337 | 570 |
571 align_pop_regs | |
572 bx lr | |
11443 | 573 endfunc |
8337 | 574 |
575 .macro h264_loop_filter_chroma | |
576 vdup.8 d22, r2 @ alpha | |
577 vmovl.u8 q12, d24 | |
578 vabd.u8 d26, d16, d0 @ abs(p0 - q0) | |
579 vmovl.u8 q2, d0 | |
580 vabd.u8 d28, d18, d16 @ abs(p1 - p0) | |
581 vsubw.u8 q2, q2, d16 | |
582 vsli.16 d24, d24, #8 | |
583 vshl.i16 q2, q2, #2 | |
584 vabd.u8 d30, d2, d0 @ abs(q1 - q0) | |
585 vaddw.u8 q2, q2, d18 | |
586 vclt.u8 d26, d26, d22 @ < alpha | |
587 vsubw.u8 q2, q2, d2 | |
588 vdup.8 d22, r3 @ beta | |
589 vrshrn.i16 d4, q2, #3 | |
590 vclt.u8 d28, d28, d22 @ < beta | |
591 vclt.u8 d30, d30, d22 @ < beta | |
12167 | 592 vmin.s8 d4, d4, d24 |
593 vneg.s8 d25, d24 | |
8337 | 594 vand d26, d26, d28 |
12167 | 595 vmax.s8 d4, d4, d25 |
8337 | 596 vand d26, d26, d30 |
12167 | 597 vmovl.u8 q11, d0 |
598 vand d4, d4, d26 | |
8337 | 599 vmovl.u8 q14, d16 |
600 vaddw.s8 q14, q14, d4 | |
601 vsubw.s8 q11, q11, d4 | |
602 vqmovun.s16 d16, q14 | |
603 vqmovun.s16 d0, q11 | |
604 .endm | |
605 | |
606 function ff_h264_v_loop_filter_chroma_neon, export=1 | |
607 h264_loop_filter_start | |
608 | |
609 sub r0, r0, r1, lsl #1 | |
610 vld1.64 {d18}, [r0,:64], r1 | |
611 vld1.64 {d16}, [r0,:64], r1 | |
612 vld1.64 {d0}, [r0,:64], r1 | |
613 vld1.64 {d2}, [r0,:64] | |
614 | |
615 h264_loop_filter_chroma | |
616 | |
617 sub r0, r0, r1, lsl #1 | |
618 vst1.64 {d16}, [r0,:64], r1 | |
619 vst1.64 {d0}, [r0,:64], r1 | |
620 | |
621 bx lr | |
11443 | 622 endfunc |
8337 | 623 |
624 function ff_h264_h_loop_filter_chroma_neon, export=1 | |
625 h264_loop_filter_start | |
626 | |
627 sub r0, r0, #2 | |
628 vld1.32 {d18[0]}, [r0], r1 | |
629 vld1.32 {d16[0]}, [r0], r1 | |
630 vld1.32 {d0[0]}, [r0], r1 | |
631 vld1.32 {d2[0]}, [r0], r1 | |
632 vld1.32 {d18[1]}, [r0], r1 | |
633 vld1.32 {d16[1]}, [r0], r1 | |
634 vld1.32 {d0[1]}, [r0], r1 | |
635 vld1.32 {d2[1]}, [r0], r1 | |
636 | |
637 vtrn.16 d18, d0 | |
638 vtrn.16 d16, d2 | |
639 vtrn.8 d18, d16 | |
640 vtrn.8 d0, d2 | |
641 | |
642 h264_loop_filter_chroma | |
643 | |
644 vtrn.16 d18, d0 | |
645 vtrn.16 d16, d2 | |
646 vtrn.8 d18, d16 | |
647 vtrn.8 d0, d2 | |
648 | |
649 sub r0, r0, r1, lsl #3 | |
650 vst1.32 {d18[0]}, [r0], r1 | |
651 vst1.32 {d16[0]}, [r0], r1 | |
652 vst1.32 {d0[0]}, [r0], r1 | |
653 vst1.32 {d2[0]}, [r0], r1 | |
654 vst1.32 {d18[1]}, [r0], r1 | |
655 vst1.32 {d16[1]}, [r0], r1 | |
656 vst1.32 {d0[1]}, [r0], r1 | |
657 vst1.32 {d2[1]}, [r0], r1 | |
658 | |
659 bx lr | |
11443 | 660 endfunc |
8338 | 661 |
662 /* H.264 qpel MC */ | |
663 | |
664 .macro lowpass_const r | |
665 movw \r, #5 | |
666 movt \r, #20 | |
667 vmov.32 d6[0], \r | |
668 .endm | |
669 | |
670 .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1 | |
671 .if \narrow | |
672 t0 .req q0 | |
673 t1 .req q8 | |
674 .else | |
675 t0 .req \d0 | |
676 t1 .req \d1 | |
677 .endif | |
678 vext.8 d2, \r0, \r1, #2 | |
679 vext.8 d3, \r0, \r1, #3 | |
680 vaddl.u8 q1, d2, d3 | |
681 vext.8 d4, \r0, \r1, #1 | |
682 vext.8 d5, \r0, \r1, #4 | |
683 vaddl.u8 q2, d4, d5 | |
684 vext.8 d30, \r0, \r1, #5 | |
685 vaddl.u8 t0, \r0, d30 | |
686 vext.8 d18, \r2, \r3, #2 | |
687 vmla.i16 t0, q1, d6[1] | |
688 vext.8 d19, \r2, \r3, #3 | |
689 vaddl.u8 q9, d18, d19 | |
690 vext.8 d20, \r2, \r3, #1 | |
691 vmls.i16 t0, q2, d6[0] | |
692 vext.8 d21, \r2, \r3, #4 | |
693 vaddl.u8 q10, d20, d21 | |
694 vext.8 d31, \r2, \r3, #5 | |
695 vaddl.u8 t1, \r2, d31 | |
696 vmla.i16 t1, q9, d6[1] | |
697 vmls.i16 t1, q10, d6[0] | |
698 .if \narrow | |
699 vqrshrun.s16 \d0, t0, #5 | |
700 vqrshrun.s16 \d1, t1, #5 | |
701 .endif | |
702 .unreq t0 | |
703 .unreq t1 | |
704 .endm | |
705 | |
706 .macro lowpass_8_1 r0, r1, d0, narrow=1 | |
707 .if \narrow | |
708 t0 .req q0 | |
709 .else | |
710 t0 .req \d0 | |
711 .endif | |
712 vext.8 d2, \r0, \r1, #2 | |
713 vext.8 d3, \r0, \r1, #3 | |
714 vaddl.u8 q1, d2, d3 | |
715 vext.8 d4, \r0, \r1, #1 | |
716 vext.8 d5, \r0, \r1, #4 | |
717 vaddl.u8 q2, d4, d5 | |
718 vext.8 d30, \r0, \r1, #5 | |
719 vaddl.u8 t0, \r0, d30 | |
720 vmla.i16 t0, q1, d6[1] | |
721 vmls.i16 t0, q2, d6[0] | |
722 .if \narrow | |
723 vqrshrun.s16 \d0, t0, #5 | |
724 .endif | |
725 .unreq t0 | |
726 .endm | |
727 | |
728 .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d | |
729 vext.16 q1, \r0, \r1, #2 | |
730 vext.16 q0, \r0, \r1, #3 | |
731 vaddl.s16 q9, d2, d0 | |
732 vext.16 q2, \r0, \r1, #1 | |
733 vaddl.s16 q1, d3, d1 | |
734 vext.16 q3, \r0, \r1, #4 | |
735 vaddl.s16 q10, d4, d6 | |
736 vext.16 \r1, \r0, \r1, #5 | |
737 vaddl.s16 q2, d5, d7 | |
738 vaddl.s16 q0, \h0, \h1 | |
739 vaddl.s16 q8, \l0, \l1 | |
740 | |
741 vshl.i32 q3, q9, #4 | |
742 vshl.i32 q9, q9, #2 | |
743 vshl.i32 q15, q10, #2 | |
744 vadd.i32 q9, q9, q3 | |
745 vadd.i32 q10, q10, q15 | |
746 | |
747 vshl.i32 q3, q1, #4 | |
748 vshl.i32 q1, q1, #2 | |
749 vshl.i32 q15, q2, #2 | |
750 vadd.i32 q1, q1, q3 | |
751 vadd.i32 q2, q2, q15 | |
752 | |
753 vadd.i32 q9, q9, q8 | |
754 vsub.i32 q9, q9, q10 | |
755 | |
756 vadd.i32 q1, q1, q0 | |
757 vsub.i32 q1, q1, q2 | |
758 | |
759 vrshrn.s32 d18, q9, #10 | |
760 vrshrn.s32 d19, q1, #10 | |
761 | |
762 vqmovun.s16 \d, q9 | |
763 .endm | |
764 | |
765 function put_h264_qpel16_h_lowpass_neon_packed | |
766 mov r4, lr | |
767 mov ip, #16 | |
768 mov r3, #8 | |
769 bl put_h264_qpel8_h_lowpass_neon | |
770 sub r1, r1, r2, lsl #4 | |
771 add r1, r1, #8 | |
772 mov ip, #16 | |
773 mov lr, r4 | |
774 b put_h264_qpel8_h_lowpass_neon | |
11443 | 775 endfunc |
8338 | 776 |
10616 | 777 .macro h264_qpel_h_lowpass type |
778 function \type\()_h264_qpel16_h_lowpass_neon | |
8338 | 779 push {lr} |
780 mov ip, #16 | |
10616 | 781 bl \type\()_h264_qpel8_h_lowpass_neon |
8338 | 782 sub r0, r0, r3, lsl #4 |
783 sub r1, r1, r2, lsl #4 | |
784 add r0, r0, #8 | |
785 add r1, r1, #8 | |
786 mov ip, #16 | |
787 pop {lr} | |
11443 | 788 endfunc |
8338 | 789 |
10616 | 790 function \type\()_h264_qpel8_h_lowpass_neon |
8338 | 791 1: vld1.64 {d0, d1}, [r1], r2 |
792 vld1.64 {d16,d17}, [r1], r2 | |
793 subs ip, ip, #2 | |
794 lowpass_8 d0, d1, d16, d17, d0, d16 | |
10616 | 795 .ifc \type,avg |
796 vld1.8 {d2}, [r0,:64], r3 | |
797 vrhadd.u8 d0, d0, d2 | |
798 vld1.8 {d3}, [r0,:64] | |
799 vrhadd.u8 d16, d16, d3 | |
800 sub r0, r0, r3 | |
801 .endif | |
8338 | 802 vst1.64 {d0}, [r0,:64], r3 |
803 vst1.64 {d16}, [r0,:64], r3 | |
804 bne 1b | |
805 bx lr | |
11443 | 806 endfunc |
10616 | 807 .endm |
8338 | 808 |
10616 | 809 h264_qpel_h_lowpass put |
810 h264_qpel_h_lowpass avg | |
811 | |
812 .macro h264_qpel_h_lowpass_l2 type | |
813 function \type\()_h264_qpel16_h_lowpass_l2_neon | |
8338 | 814 push {lr} |
815 mov ip, #16 | |
10616 | 816 bl \type\()_h264_qpel8_h_lowpass_l2_neon |
8338 | 817 sub r0, r0, r2, lsl #4 |
818 sub r1, r1, r2, lsl #4 | |
819 sub r3, r3, r2, lsl #4 | |
820 add r0, r0, #8 | |
821 add r1, r1, #8 | |
822 add r3, r3, #8 | |
823 mov ip, #16 | |
824 pop {lr} | |
11443 | 825 endfunc |
8338 | 826 |
10616 | 827 function \type\()_h264_qpel8_h_lowpass_l2_neon |
8338 | 828 1: vld1.64 {d0, d1}, [r1], r2 |
829 vld1.64 {d16,d17}, [r1], r2 | |
830 vld1.64 {d28}, [r3], r2 | |
831 vld1.64 {d29}, [r3], r2 | |
832 subs ip, ip, #2 | |
833 lowpass_8 d0, d1, d16, d17, d0, d1 | |
834 vrhadd.u8 q0, q0, q14 | |
10616 | 835 .ifc \type,avg |
836 vld1.8 {d2}, [r0,:64], r2 | |
837 vrhadd.u8 d0, d0, d2 | |
838 vld1.8 {d3}, [r0,:64] | |
839 vrhadd.u8 d1, d1, d3 | |
840 sub r0, r0, r2 | |
841 .endif | |
8338 | 842 vst1.64 {d0}, [r0,:64], r2 |
843 vst1.64 {d1}, [r0,:64], r2 | |
844 bne 1b | |
845 bx lr | |
11443 | 846 endfunc |
10616 | 847 .endm |
848 | |
849 h264_qpel_h_lowpass_l2 put | |
850 h264_qpel_h_lowpass_l2 avg | |
8338 | 851 |
852 function put_h264_qpel16_v_lowpass_neon_packed | |
853 mov r4, lr | |
854 mov r2, #8 | |
855 bl put_h264_qpel8_v_lowpass_neon | |
856 sub r1, r1, r3, lsl #2 | |
857 bl put_h264_qpel8_v_lowpass_neon | |
858 sub r1, r1, r3, lsl #4 | |
859 sub r1, r1, r3, lsl #2 | |
860 add r1, r1, #8 | |
861 bl put_h264_qpel8_v_lowpass_neon | |
862 sub r1, r1, r3, lsl #2 | |
863 mov lr, r4 | |
864 b put_h264_qpel8_v_lowpass_neon | |
11443 | 865 endfunc |
8338 | 866 |
10616 | 867 .macro h264_qpel_v_lowpass type |
868 function \type\()_h264_qpel16_v_lowpass_neon | |
8338 | 869 mov r4, lr |
10616 | 870 bl \type\()_h264_qpel8_v_lowpass_neon |
8338 | 871 sub r1, r1, r3, lsl #2 |
10616 | 872 bl \type\()_h264_qpel8_v_lowpass_neon |
8338 | 873 sub r0, r0, r2, lsl #4 |
874 add r0, r0, #8 | |
875 sub r1, r1, r3, lsl #4 | |
876 sub r1, r1, r3, lsl #2 | |
877 add r1, r1, #8 | |
10616 | 878 bl \type\()_h264_qpel8_v_lowpass_neon |
8338 | 879 sub r1, r1, r3, lsl #2 |
880 mov lr, r4 | |
11443 | 881 endfunc |
8338 | 882 |
10616 | 883 function \type\()_h264_qpel8_v_lowpass_neon |
8338 | 884 vld1.64 {d8}, [r1], r3 |
885 vld1.64 {d10}, [r1], r3 | |
886 vld1.64 {d12}, [r1], r3 | |
887 vld1.64 {d14}, [r1], r3 | |
888 vld1.64 {d22}, [r1], r3 | |
889 vld1.64 {d24}, [r1], r3 | |
890 vld1.64 {d26}, [r1], r3 | |
891 vld1.64 {d28}, [r1], r3 | |
892 vld1.64 {d9}, [r1], r3 | |
893 vld1.64 {d11}, [r1], r3 | |
894 vld1.64 {d13}, [r1], r3 | |
895 vld1.64 {d15}, [r1], r3 | |
896 vld1.64 {d23}, [r1] | |
897 | |
898 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 | |
899 lowpass_8 d8, d9, d10, d11, d8, d10 | |
900 lowpass_8 d12, d13, d14, d15, d12, d14 | |
901 lowpass_8 d22, d23, d24, d25, d22, d24 | |
902 lowpass_8 d26, d27, d28, d29, d26, d28 | |
903 transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28 | |
904 | |
10616 | 905 .ifc \type,avg |
906 vld1.8 {d9}, [r0,:64], r2 | |
907 vrhadd.u8 d8, d8, d9 | |
908 vld1.8 {d11}, [r0,:64], r2 | |
909 vrhadd.u8 d10, d10, d11 | |
910 vld1.8 {d13}, [r0,:64], r2 | |
911 vrhadd.u8 d12, d12, d13 | |
912 vld1.8 {d15}, [r0,:64], r2 | |
913 vrhadd.u8 d14, d14, d15 | |
914 vld1.8 {d23}, [r0,:64], r2 | |
915 vrhadd.u8 d22, d22, d23 | |
916 vld1.8 {d25}, [r0,:64], r2 | |
917 vrhadd.u8 d24, d24, d25 | |
918 vld1.8 {d27}, [r0,:64], r2 | |
919 vrhadd.u8 d26, d26, d27 | |
920 vld1.8 {d29}, [r0,:64], r2 | |
921 vrhadd.u8 d28, d28, d29 | |
922 sub r0, r0, r2, lsl #3 | |
923 .endif | |
924 | |
8338 | 925 vst1.64 {d8}, [r0,:64], r2 |
926 vst1.64 {d10}, [r0,:64], r2 | |
927 vst1.64 {d12}, [r0,:64], r2 | |
928 vst1.64 {d14}, [r0,:64], r2 | |
929 vst1.64 {d22}, [r0,:64], r2 | |
930 vst1.64 {d24}, [r0,:64], r2 | |
931 vst1.64 {d26}, [r0,:64], r2 | |
932 vst1.64 {d28}, [r0,:64], r2 | |
933 | |
934 bx lr | |
11443 | 935 endfunc |
10616 | 936 .endm |
8338 | 937 |
10616 | 938 h264_qpel_v_lowpass put |
939 h264_qpel_v_lowpass avg | |
940 | |
941 .macro h264_qpel_v_lowpass_l2 type | |
942 function \type\()_h264_qpel16_v_lowpass_l2_neon | |
8338 | 943 mov r4, lr |
10616 | 944 bl \type\()_h264_qpel8_v_lowpass_l2_neon |
8338 | 945 sub r1, r1, r3, lsl #2 |
10616 | 946 bl \type\()_h264_qpel8_v_lowpass_l2_neon |
8338 | 947 sub r0, r0, r3, lsl #4 |
948 sub ip, ip, r2, lsl #4 | |
949 add r0, r0, #8 | |
950 add ip, ip, #8 | |
951 sub r1, r1, r3, lsl #4 | |
952 sub r1, r1, r3, lsl #2 | |
953 add r1, r1, #8 | |
10616 | 954 bl \type\()_h264_qpel8_v_lowpass_l2_neon |
8338 | 955 sub r1, r1, r3, lsl #2 |
956 mov lr, r4 | |
11443 | 957 endfunc |
8338 | 958 |
10616 | 959 function \type\()_h264_qpel8_v_lowpass_l2_neon |
8338 | 960 vld1.64 {d8}, [r1], r3 |
961 vld1.64 {d10}, [r1], r3 | |
962 vld1.64 {d12}, [r1], r3 | |
963 vld1.64 {d14}, [r1], r3 | |
964 vld1.64 {d22}, [r1], r3 | |
965 vld1.64 {d24}, [r1], r3 | |
966 vld1.64 {d26}, [r1], r3 | |
967 vld1.64 {d28}, [r1], r3 | |
968 vld1.64 {d9}, [r1], r3 | |
969 vld1.64 {d11}, [r1], r3 | |
970 vld1.64 {d13}, [r1], r3 | |
971 vld1.64 {d15}, [r1], r3 | |
972 vld1.64 {d23}, [r1] | |
973 | |
974 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 | |
975 lowpass_8 d8, d9, d10, d11, d8, d9 | |
976 lowpass_8 d12, d13, d14, d15, d12, d13 | |
977 lowpass_8 d22, d23, d24, d25, d22, d23 | |
978 lowpass_8 d26, d27, d28, d29, d26, d27 | |
979 transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27 | |
980 | |
981 vld1.64 {d0}, [ip], r2 | |
982 vld1.64 {d1}, [ip], r2 | |
983 vld1.64 {d2}, [ip], r2 | |
984 vld1.64 {d3}, [ip], r2 | |
985 vld1.64 {d4}, [ip], r2 | |
986 vrhadd.u8 q0, q0, q4 | |
987 vld1.64 {d5}, [ip], r2 | |
988 vrhadd.u8 q1, q1, q6 | |
989 vld1.64 {d10}, [ip], r2 | |
990 vrhadd.u8 q2, q2, q11 | |
991 vld1.64 {d11}, [ip], r2 | |
10616 | 992 vrhadd.u8 q5, q5, q13 |
993 | |
994 .ifc \type,avg | |
995 vld1.8 {d16}, [r0,:64], r3 | |
996 vrhadd.u8 d0, d0, d16 | |
997 vld1.8 {d17}, [r0,:64], r3 | |
998 vrhadd.u8 d1, d1, d17 | |
999 vld1.8 {d16}, [r0,:64], r3 | |
1000 vrhadd.u8 d2, d2, d16 | |
1001 vld1.8 {d17}, [r0,:64], r3 | |
1002 vrhadd.u8 d3, d3, d17 | |
1003 vld1.8 {d16}, [r0,:64], r3 | |
1004 vrhadd.u8 d4, d4, d16 | |
1005 vld1.8 {d17}, [r0,:64], r3 | |
1006 vrhadd.u8 d5, d5, d17 | |
1007 vld1.8 {d16}, [r0,:64], r3 | |
1008 vrhadd.u8 d10, d10, d16 | |
1009 vld1.8 {d17}, [r0,:64], r3 | |
1010 vrhadd.u8 d11, d11, d17 | |
1011 sub r0, r0, r3, lsl #3 | |
1012 .endif | |
8338 | 1013 |
1014 vst1.64 {d0}, [r0,:64], r3 | |
1015 vst1.64 {d1}, [r0,:64], r3 | |
1016 vst1.64 {d2}, [r0,:64], r3 | |
1017 vst1.64 {d3}, [r0,:64], r3 | |
1018 vst1.64 {d4}, [r0,:64], r3 | |
1019 vst1.64 {d5}, [r0,:64], r3 | |
1020 vst1.64 {d10}, [r0,:64], r3 | |
1021 vst1.64 {d11}, [r0,:64], r3 | |
1022 | |
1023 bx lr | |
11443 | 1024 endfunc |
10616 | 1025 .endm |
1026 | |
1027 h264_qpel_v_lowpass_l2 put | |
1028 h264_qpel_v_lowpass_l2 avg | |
8338 | 1029 |
@ Shared core for the 8x8 "hv" (center, h+v lowpass) qpel cases.
@ In:  r1 = src (already offset to filter origin), r3 = src stride,
@      r4 = 16-byte-aligned scratch buffer (16-bit intermediates).
@ Out: the 8x8 filtered result, transposed back to row order, in
@      d12-d15 / d8-d11 (see final transpose_8x8 argument order).
@ Clobbers q0-q15, ip; r4 is advanced through the scratch buffer.
@ NOTE(review): called via bl; must be reached with d8-d15 saved by
@ the caller since it overwrites them — confirmed by the mcXY stubs
@ below which vpush {d8-d15} first.
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   ip
        mov             ip,  #12                @ 8+5-1 rows of horizontal filtering, 2 per iteration
1:      vld1.64         {d0, d1},  [r1], r3
        vld1.64         {d16,d17}, [r1], r3
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
        vst1.64         {d22-d25}, [r4,:128]!   @ store 16-bit intermediates
        bne             1b

        vld1.64         {d0, d1},  [r1]         @ 13th and last source row
        lowpass_8_1     d0,  d1,  q12, narrow=0

        mov             ip,  #-16               @ walk the scratch buffer backwards
        add             r4,  r4,  ip
        vld1.64         {d30,d31}, [r4,:128], ip
        vld1.64         {d20,d21}, [r4,:128], ip
        vld1.64         {d18,d19}, [r4,:128], ip
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d14,d15}, [r4,:128], ip
        vld1.64         {d12,d13}, [r4,:128], ip
        vld1.64         {d10,d11}, [r4,:128], ip
        vld1.64         {d8, d9},  [r4,:128], ip
        vld1.64         {d6, d7},  [r4,:128], ip
        vld1.64         {d4, d5},  [r4,:128], ip
        vld1.64         {d2, d3},  [r4,:128], ip
        vld1.64         {d0, d1},  [r4,:128]

        @ Transpose the 16-bit rows so the vertical filter can run
        @ horizontally over registers.
        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        swap4           d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11

        @ Spill the half that does not fit in registers back to scratch.
        vst1.64         {d30,d31}, [r4,:128]!
        vst1.64         {d6, d7},  [r4,:128]!
        vst1.64         {d20,d21}, [r4,:128]!
        vst1.64         {d4, d5},  [r4,:128]!
        vst1.64         {d18,d19}, [r4,:128]!
        vst1.64         {d2, d3},  [r4,:128]!
        vst1.64         {d16,d17}, [r4,:128]!
        vst1.64         {d0, d1},  [r4,:128]

        @ Vertical (now horizontal, post-transpose) lowpass on the
        @ register-resident columns; results narrow into d8-d11.
        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11

        @ Remaining columns are reloaded from scratch (ip is still -16).
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128]
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15

        @ Back to row order for the caller.
        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11

        bx              lr
endfunc
8338 | 1095 |
@ 8x8 center (hv) qpel: run the shared hv core, then store the result
@ (put) or average it with the existing destination first (avg).
@ In:  r0 = dst, r2 = dst stride, r1/r3 = src/src stride (consumed by
@      the core), r4 = scratch buffer. r10 used to preserve lr across bl.
.macro  h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top
.ifc \type,avg
        @ Average the filtered rows (d12-d15 then d8-d11, the core's
        @ output order) with the current destination contents.
        vld1.8          {d0},      [r0,:64], r2
        vrhadd.u8       d12, d12, d0
        vld1.8          {d1},      [r0,:64], r2
        vrhadd.u8       d13, d13, d1
        vld1.8          {d2},      [r0,:64], r2
        vrhadd.u8       d14, d14, d2
        vld1.8          {d3},      [r0,:64], r2
        vrhadd.u8       d15, d15, d3
        vld1.8          {d4},      [r0,:64], r2
        vrhadd.u8       d8,  d8,  d4
        vld1.8          {d5},      [r0,:64], r2
        vrhadd.u8       d9,  d9,  d5
        vld1.8          {d6},      [r0,:64], r2
        vrhadd.u8       d10, d10, d6
        vld1.8          {d7},      [r0,:64], r2
        vrhadd.u8       d11, d11, d7
        sub             r0,  r0,  r2,  lsl #3   @ rewind dst 8 rows
.endif
        vst1.64         {d12},     [r0,:64], r2
        vst1.64         {d13},     [r0,:64], r2
        vst1.64         {d14},     [r0,:64], r2
        vst1.64         {d15},     [r0,:64], r2
        vst1.64         {d8},      [r0,:64], r2
        vst1.64         {d9},      [r0,:64], r2
        vst1.64         {d10},     [r0,:64], r2
        vst1.64         {d11},     [r0,:64], r2

        mov             lr,  r10
        bx              lr
endfunc
.endm
8338 | 1132 |
@ Instantiate put and avg variants of the 8x8 hv lowpass.
        h264_qpel8_hv_lowpass   put
        h264_qpel8_hv_lowpass   avg
1135 | |
@ 8x8 hv qpel averaged with a second (half-pel) prediction held in a
@ packed 8x8 buffer at r2; result stored (put) or averaged with dst (avg).
@ In:  r0 = dst, r3 = dst stride, r2 = packed l2 source buffer,
@      r1/stride consumed by the shared core. r10 preserves lr.
.macro  h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top

        @ Average with the l2 buffer; the core left its rows in
        @ q6,q7,q4,q5 order (d12-d15 then d8-d11).
        vld1.64         {d0, d1},  [r2,:128]!
        vld1.64         {d2, d3},  [r2,:128]!
        vrhadd.u8       q0,  q0,  q6
        vld1.64         {d4, d5},  [r2,:128]!
        vrhadd.u8       q1,  q1,  q7
        vld1.64         {d6, d7},  [r2,:128]!
        vrhadd.u8       q2,  q2,  q4
        vrhadd.u8       q3,  q3,  q5
.ifc \type,avg
        @ avg: additionally round-average with current dst contents.
        vld1.8          {d16},     [r0,:64], r3
        vrhadd.u8       d0,  d0,  d16
        vld1.8          {d17},     [r0,:64], r3
        vrhadd.u8       d1,  d1,  d17
        vld1.8          {d18},     [r0,:64], r3
        vrhadd.u8       d2,  d2,  d18
        vld1.8          {d19},     [r0,:64], r3
        vrhadd.u8       d3,  d3,  d19
        vld1.8          {d20},     [r0,:64], r3
        vrhadd.u8       d4,  d4,  d20
        vld1.8          {d21},     [r0,:64], r3
        vrhadd.u8       d5,  d5,  d21
        vld1.8          {d22},     [r0,:64], r3
        vrhadd.u8       d6,  d6,  d22
        vld1.8          {d23},     [r0,:64], r3
        vrhadd.u8       d7,  d7,  d23
        sub             r0,  r0,  r3,  lsl #3   @ rewind dst 8 rows
.endif
        vst1.64         {d0},      [r0,:64], r3
        vst1.64         {d1},      [r0,:64], r3
        vst1.64         {d2},      [r0,:64], r3
        vst1.64         {d3},      [r0,:64], r3
        vst1.64         {d4},      [r0,:64], r3
        vst1.64         {d5},      [r0,:64], r3
        vst1.64         {d6},      [r0,:64], r3
        vst1.64         {d7},      [r0,:64], r3

        mov             lr,  r10
        bx              lr
endfunc
.endm
8338 | 1181 |
@ Instantiate put and avg variants of the 8x8 hv lowpass + L2 average.
        h264_qpel8_hv_lowpass_l2        put
        h264_qpel8_hv_lowpass_l2        avg
1184 | |
@ 16x16 hv qpel built from four 8x8 calls (top-left, bottom-left,
@ top-right, bottom-right).  r9 preserves lr across the bl chain; the
@ final quadrant is reached with a tail-branch (b) so it returns to
@ the original caller.
.macro  h264_qpel16_hv type
function \type\()_h264_qpel16_hv_lowpass_neon
        mov             r9,  lr
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3,  lsl #2   @ src back up 4 rows (filter overlap)
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3,  lsl #4   @ src back to top,
        sub             r1,  r1,  r3,  lsl #2   @ minus the 4-row overlap,
        add             r1,  r1,  #8            @ and over to the right half
        sub             r0,  r0,  r2,  lsl #4   @ dst back to top row
        add             r0,  r0,  #8            @ dst right half
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3,  lsl #2
        mov             lr,  r9
        b               \type\()_h264_qpel8_hv_lowpass_neon    @ tail call
endfunc

@ Same walk for the variant that also averages with a packed l2 buffer
@ (each 8x8 block consumes 256 bytes of it; r2 is rederived from r4).
@ Here r3 is both src and dst stride, hence the r3-based dst rewind.
function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             r9,  lr
        sub             r2,  r4,  #256
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3,  lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3,  lsl #4
        sub             r1,  r1,  r3,  lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r3,  lsl #4
        add             r0,  r0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3,  lsl #2
        mov             lr,  r9
        b               \type\()_h264_qpel8_hv_lowpass_l2_neon @ tail call
endfunc
.endm
8338 | 1219 |
@ Instantiate put and avg variants of the 16x16 hv helpers.
        h264_qpel16_hv  put
        h264_qpel16_hv  avg
1222 | |
@ Exported 8x8 quarter-pel motion-compensation entry points, one per
@ (x,y) quarter-pel phase mcXY.  C signature (per dsputil):
@   void ff_<type>_h264_qpel8_mcXY_neon(uint8_t *dst /*r0*/,
@                                       uint8_t *src /*r1*/,
@                                       int stride   /*r2*/);
@ Each stub sets up the extra arguments the internal lowpass helpers
@ expect (r3 = second source or stride, ip = row count or l2 pointer),
@ saves d8-d15 around helpers that clobber them, and carves an
@ aligned scratch area below sp where an intermediate plane is needed.
@ Diagonal cases (mc31/mc13/mc33, mc32, mc03, mc23) adjust src and
@ re-enter the corresponding mc11/mc12/mc01/mc21 internal label, so
@ their push lists must match the re-entered function's epilogue.
.macro  h264_qpel8 type
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1                 @ second src = unshifted pixels
        sub             r1,  r1,  #2
        mov             ip,  #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2
        mov             ip,  #8
        b               \type\()_h264_qpel8_h_lowpass_neon
endfunc

function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1            @ second src = pixel to the right
        sub             r1,  r1,  #2
        mov             ip,  #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel8_mc01_neon, export=1
        push            {lr}
        mov             ip,  r1                 @ second src = unshifted row
\type\()_h264_qpel8_mc01:
        lowpass_const   r3
        mov             r3,  r2
        sub             r1,  r1,  r2,  lsl #1   @ start 2 rows above for the 6-tap filter
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {pc}
endfunc

function ff_\type\()_h264_qpel8_mc11_neon, export=1
        push            {r0, r1, r11, lr}
\type\()_h264_qpel8_mc11:
        lowpass_const   r3
        mov             r11, sp                 @ r11 = old sp (holds saved r0/r1)
        bic             sp,  sp,  #15           @ 16-byte align for the scratch plane
        sub             sp,  sp,  #64           @ 8x8 intermediate
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #8
        mov             ip,  #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        ldrd            r0,  [r11]              @ reload original dst/src
        mov             r3,  r2
        add             ip,  sp,  #64           @ scratch plane (above the vpush area)
        sub             r1,  r1,  r2,  lsl #1
        mov             r2,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11, #8            @ drop saved r0/r1 too
        pop             {r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc21_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc21:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(8*8+16*12)  @ 8x8 h-plane + 16-bit hv scratch
        sub             r1,  r1,  #2
        mov             r3,  #8
        mov             r0,  sp
        mov             ip,  #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        mov             r4,  r0                 @ r4 = end of h-plane = hv scratch base
        ldrd            r0,  [r11]
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             r2,  r4,  #64           @ r2 = packed h-plane for the l2 average
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11, #8
        pop             {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc31_neon, export=1
        add             r1,  r1,  #1            @ save src+1 as the stored r1
        push            {r0, r1, r11, lr}
        sub             r1,  r1,  #1
        b               \type\()_h264_qpel8_mc11
endfunc

function ff_\type\()_h264_qpel8_mc02_neon, export=1
        push            {lr}
        lowpass_const   r3
        sub             r1,  r1,  r2,  lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {pc}
endfunc

function ff_\type\()_h264_qpel8_mc12_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc12:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(8*8+16*12)
        sub             r1,  r1,  r2,  lsl #1
        mov             r3,  r2
        mov             r2,  #8
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r3,  lsl #1
        sub             r1,  r1,  #2
        sub             r2,  r4,  #64           @ packed v-plane for the l2 average
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11, #8
        pop             {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc22_neon, export=1
        push            {r4, r10, r11, lr}
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)      @ 16-bit hv scratch only
        mov             r4,  sp
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc32_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  #1
        b               \type\()_h264_qpel8_mc12
endfunc

function ff_\type\()_h264_qpel8_mc03_neon, export=1
        push            {lr}
        add             ip,  r1,  r2            @ second src = row below
        b               \type\()_h264_qpel8_mc01
endfunc

function ff_\type\()_h264_qpel8_mc13_neon, export=1
        push            {r0, r1, r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel8_mc11
endfunc

function ff_\type\()_h264_qpel8_mc23_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel8_mc21
endfunc

function ff_\type\()_h264_qpel8_mc33_neon, export=1
        add             r1,  r1,  #1            @ stored r1 = src+1, filter runs on src+stride
        push            {r0, r1, r11, lr}
        add             r1,  r1,  r2
        sub             r1,  r1,  #1
        b               \type\()_h264_qpel8_mc11
endfunc
.endm
8338 | 1400 |
@ Emit all 8x8 qpel entry points for both prediction types.
        h264_qpel8      put
        h264_qpel8      avg
1403 | |
@ Exported 16x16 quarter-pel motion-compensation entry points, same
@ scheme as the 8x8 macro above but with 16-row helpers, larger
@ scratch planes (256 B half-pel plane, 16*12 B 16-bit hv scratch)
@ and r4/r5/r9 additionally saved for helpers that use them.
.macro  h264_qpel16 type
function ff_\type\()_h264_qpel16_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1                 @ second src = unshifted pixels
        sub             r1,  r1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel16_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2
        b               \type\()_h264_qpel16_h_lowpass_neon
endfunc

function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1            @ second src = pixel to the right
        sub             r1,  r1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel16_mc01_neon, export=1
        push            {r4, lr}
        mov             ip,  r1                 @ second src = unshifted row
\type\()_h264_qpel16_mc01:
        lowpass_const   r3
        mov             r3,  r2
        sub             r1,  r1,  r2,  lsl #1   @ 2 rows above for the 6-tap filter
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_h264_qpel16_mc11_neon, export=1
        push            {r0, r1, r4, r11, lr}
\type\()_h264_qpel16_mc11:
        lowpass_const   r3
        mov             r11, sp                 @ r11 = old sp (holds saved r0/r1)
        bic             sp,  sp,  #15
        sub             sp,  sp,  #256          @ 16x16 intermediate plane
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #16
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon
        ldrd            r0,  [r11]              @ reload original dst/src
        mov             r3,  r2
        add             ip,  sp,  #64           @ plane sits above the vpush area
        sub             r1,  r1,  r2,  lsl #1
        mov             r2,  #16
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11, #8
        pop             {r4, r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc21_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc21:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(16*16+16*12) @ h-plane + 16-bit hv scratch
        sub             r1,  r1,  #2
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  r0                 @ r4 = hv scratch base
        ldrd            r0,  [r11]
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11, #8
        pop             {r4-r5, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add             r1,  r1,  #1            @ save src+1 as the stored r1
        push            {r0, r1, r4, r11, lr}
        sub             r1,  r1,  #1
        b               \type\()_h264_qpel16_mc11
endfunc

function ff_\type\()_h264_qpel16_mc02_neon, export=1
        push            {r4, lr}
        lowpass_const   r3
        sub             r1,  r1,  r2,  lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_h264_qpel16_mc12_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc12:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(16*16+16*12)
        sub             r1,  r1,  r2,  lsl #1
        mov             r0,  sp
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r3,  lsl #1
        sub             r1,  r1,  #2
        mov             r2,  r3
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11, #8
        pop             {r4-r5, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc22_neon, export=1
        push            {r4, r9-r11, lr}
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)      @ 16-bit hv scratch only
        mov             r4,  sp
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc32_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  #1
        b               \type\()_h264_qpel16_mc12
endfunc

function ff_\type\()_h264_qpel16_mc03_neon, export=1
        push            {r4, lr}
        add             ip,  r1,  r2            @ second src = row below
        b               \type\()_h264_qpel16_mc01
endfunc

function ff_\type\()_h264_qpel16_mc13_neon, export=1
        push            {r0, r1, r4, r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel16_mc11
endfunc

function ff_\type\()_h264_qpel16_mc23_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel16_mc21
endfunc

function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add             r1,  r1,  #1            @ stored r1 = src+1, filter runs on src+stride
        push            {r0, r1, r4, r11, lr}
        add             r1,  r1,  r2
        sub             r1,  r1,  #1
        b               \type\()_h264_qpel16_mc11
endfunc
.endm
1574 | |
@ Emit all 16x16 qpel entry points for both prediction types.
        h264_qpel16     put
        h264_qpel16     avg
8663 | 1577 |
1578 @ Biweighted prediction | |
1579 | |
@ 16-wide biweight loop body.  \macs / \macd are multiply-accumulate
@ or multiply-subtract ops selected by biweight_func from the weight
@ signs.  In: r0/r1 = the two source blocks, r2 = stride, r6 = dst,
@ r4/r5 = |weights|, ip = rows, q8 = rounding offset, q9 = -shift
@ (for vshl right shift).  Two rows per iteration.
.macro  biweight_16 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q2,  q8                 @ preload accumulators with offset
        vmov            q3,  q8
1:      subs            ip,  ip,  #2
        vld1.8          {d20-d21},[r0,:128], r2
        \macd           q2,  d0,  d20
        pld             [r0]
        \macd           q3,  d0,  d21
        vld1.8          {d22-d23},[r1,:128], r2
        \macs           q2,  d1,  d22
        pld             [r1]
        \macs           q3,  d1,  d23
        vmov            q12, q8
        vld1.8          {d28-d29},[r0,:128], r2
        vmov            q13, q8
        \macd           q12, d0,  d28
        pld             [r0]
        \macd           q13, d0,  d29
        vld1.8          {d30-d31},[r1,:128], r2
        \macs           q12, d1,  d30
        pld             [r1]
        \macs           q13, d1,  d31
        vshl.s16        q2,  q2,  q9            @ >> log2_denom+1 (q9 is negative)
        vshl.s16        q3,  q3,  q9
        vqmovun.s16     d4,  q2                 @ saturate to u8
        vqmovun.s16     d5,  q3
        vshl.s16        q12, q12, q9
        vshl.s16        q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vmov            q3,  q8                 @ re-init for next iteration
        vst1.8          {d4- d5}, [r6,:128], r2
        vmov            q2,  q8
        vst1.8          {d24-d25},[r6,:128], r2
        bne             1b
        pop             {r4-r6, pc}
.endm
1619 | |
@ 8-wide biweight loop body; same register contract as biweight_16,
@ processing two 8-pixel rows per iteration.
.macro  biweight_8 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q1,  q8                 @ accumulators preloaded with offset
        vmov            q10, q8
1:      subs            ip,  ip,  #2
        vld1.8          {d4},[r0,:64], r2
        \macd           q1,  d0,  d4
        pld             [r0]
        vld1.8          {d5},[r1,:64], r2
        \macs           q1,  d1,  d5
        pld             [r1]
        vld1.8          {d6},[r0,:64], r2
        \macd           q10, d0,  d6
        pld             [r0]
        vld1.8          {d7},[r1,:64], r2
        \macs           q10, d1,  d7
        pld             [r1]
        vshl.s16        q1,  q1,  q9            @ arithmetic right shift via negative q9
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8                 @ re-init for next iteration
        vst1.8          {d2},[r6,:64], r2
        vmov            q1,  q8
        vst1.8          {d4},[r6,:64], r2
        bne             1b
        pop             {r4-r6, pc}
.endm
1649 | |
@ 4-wide biweight loop body; packs two 4-pixel rows into one d
@ register and handles 4 rows per iteration, with a 2-row tail path
@ (label 2:) for the 4x2 block size (ip goes negative on subs).
.macro  biweight_4 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q1,  q8                 @ accumulators preloaded with offset
        vmov            q10, q8
1:      subs            ip,  ip,  #4
        vld1.32         {d4[0]},[r0,:32], r2
        vld1.32         {d4[1]},[r0,:32], r2
        \macd           q1,  d0,  d4
        pld             [r0]
        vld1.32         {d5[0]},[r1,:32], r2
        vld1.32         {d5[1]},[r1,:32], r2
        \macs           q1,  d1,  d5
        pld             [r1]
        blt             2f                      @ only 2 rows were left
        vld1.32         {d6[0]},[r0,:32], r2
        vld1.32         {d6[1]},[r0,:32], r2
        \macd           q10, d0,  d6
        pld             [r0]
        vld1.32         {d7[0]},[r1,:32], r2
        vld1.32         {d7[1]},[r1,:32], r2
        \macs           q10, d1,  d7
        pld             [r1]
        vshl.s16        q1,  q1,  q9            @ arithmetic right shift via negative q9
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.32         {d2[0]},[r6,:32], r2
        vst1.32         {d2[1]},[r6,:32], r2
        vmov            q1,  q8
        vst1.32         {d4[0]},[r6,:32], r2
        vst1.32         {d4[1]},[r6,:32], r2
        bne             1b
        pop             {r4-r6, pc}
2:      \macd           q1,  q1,  q9            @ tail: finish the 2 loaded rows
        vrshl?          @ -- see note
.endm
1691 | |
@ Sign dispatcher for biweighted prediction.
@ Stack args (after push): r4 = weight0, r5 = weight1, r6 = offset;
@ r3 = log2_denom.  lr is built as sign(w0) XOR (w1>>30) giving a
@ value 0..3 that selects one of four (macs,macd) combinations, with
@ the weights negated to their magnitudes on the matching paths.
@ q8 = ((offset+1)|1) << log2_denom, q9 = ~log2_denom = -(log2_denom+1)
@ so vshl.s16 by q9 performs the final >> (log2_denom+1).
.macro  biweight_func w
function biweight_h264_pixels_\w\()_neon
        push            {r4-r6, lr}
        add             r4,  sp,  #16
        ldm             r4,  {r4-r6}            @ load stacked weight/offset args
        lsr             lr,  r4,  #31           @ sign bit of weight0
        add             r6,  r6,  #1
        eors            lr,  lr,  r5,  lsr #30  @ combine with sign info of weight1
        orr             r6,  r6,  #1            @ offset -> odd for correct rounding
        vdup.16         q9,  r3
        lsl             r6,  r6,  r3
        vmvn            q9,  q9                 @ q9 = -(log2_denom+1)
        vdup.16         q8,  r6
        mov             r6,  r0                 @ r6 = dst for the loop bodies
        beq             10f                     @ both weights non-negative
        subs            lr,  lr,  #1
        beq             20f                     @ w0 < 0, w1 >= 0
        subs            lr,  lr,  #1
        beq             30f                     @ both negative
        b               40f                     @ w0 >= 0, w1 < 0
10:     biweight_\w     vmlal.u8, vmlal.u8
20:     rsb             r4,  r4,  #0            @ use |w0|, subtract its term
        biweight_\w     vmlal.u8, vmlsl.u8
30:     rsb             r4,  r4,  #0            @ both magnitudes, subtract both
        rsb             r5,  r5,  #0
        biweight_\w     vmlsl.u8, vmlsl.u8
40:     rsb             r5,  r5,  #0            @ use |w1|, subtract its term
        biweight_\w     vmlsl.u8, vmlal.u8
endfunc
.endm
1722 | |
@ Exported WxH biweight entry: load the row count into ip and fall
@ through (b=0, placed directly before biweight_func) or branch to
@ the shared width-W worker.
.macro  biweight_entry w, h, b=1
function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
.if \b
        b               biweight_h264_pixels_\w\()_neon
.endif
endfunc
.endm
1731 | |
@ Instantiate biweight entry points; the b=0 entry of each width
@ falls straight through into its biweight_func worker.
        biweight_entry  16, 8
        biweight_entry  16, 16, b=0
        biweight_func   16

        biweight_entry  8,  16
        biweight_entry  8,  4
        biweight_entry  8,  8,  b=0
        biweight_func   8

        biweight_entry  4,  8
        biweight_entry  4,  2
        biweight_entry  4,  4,  b=0
        biweight_func   4
8664 | 1745 |
1746 @ Weighted prediction | |
1747 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
@ 16-wide weighted-prediction loop body.  \add is the offset-add op
@ chosen by weight_func (vhadd/vhsub for the halved path when
@ log2_denom >= 1, vadd/vsub otherwise).  In: r0 = src/dst block,
@ r1 = stride, r3 = |weight|, r4 = dst, ip = rows, q8 = offset,
@ q9 = negated shift for vrshl rounding.  Two rows per iteration.
.macro  weight_16 add
        vdup.8          d0,  r3
1:      subs            ip,  ip,  #2
        vld1.8          {d20-d21},[r0,:128], r1
        vmull.u8        q2,  d0,  d20
        pld             [r0]
        vmull.u8        q3,  d0,  d21
        vld1.8          {d28-d29},[r0,:128], r1
        vmull.u8        q12, d0,  d28
        pld             [r0]
        vmull.u8        q13, d0,  d29
        \add            q2,  q8,  q2            @ apply offset (halved or full)
        vrshl.s16       q2,  q2,  q9            @ rounding shift right (q9 negative)
        \add            q3,  q8,  q3
        vrshl.s16       q3,  q3,  q9
        vqmovun.s16     d4,  q2                 @ saturate to u8
        vqmovun.s16     d5,  q3
        \add            q12, q8,  q12
        vrshl.s16       q12, q12, q9
        \add            q13, q8,  q13
        vrshl.s16       q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vst1.8          {d4- d5}, [r4,:128], r1
        vst1.8          {d24-d25},[r4,:128], r1
        bne             1b
        pop             {r4, pc}
.endm
1776 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
@ 8-wide weighted-prediction loop body; same contract as weight_16,
@ two 8-pixel rows per iteration.
.macro  weight_8 add
        vdup.8          d0,  r3
1:      subs            ip,  ip,  #2
        vld1.8          {d4},[r0,:64], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
        vld1.8          {d6},[r0,:64], r1
        vmull.u8        q10, d0,  d6
        \add            q1,  q8,  q1            @ apply offset
        pld             [r0]
        vrshl.s16       q1,  q1,  q9            @ rounding shift right
        vqmovun.s16     d2,  q1
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vst1.8          {d2},[r4,:64], r1
        vst1.8          {d4},[r4,:64], r1
        bne             1b
        pop             {r4, pc}
.endm
1797 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
@ 4-wide weighted-prediction loop body; two 4-pixel rows share one d
@ register, 4 rows per iteration, with a 2-row tail (label 2:) used
@ for the 4x2 block size.
.macro  weight_4 add
        vdup.8          d0,  r3
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #4
        vld1.32         {d4[0]},[r0,:32], r1
        vld1.32         {d4[1]},[r0,:32], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
        blt             2f                      @ only 2 rows were left
        vld1.32         {d6[0]},[r0,:32], r1
        vld1.32         {d6[1]},[r0,:32], r1
        vmull.u8        q10, d0,  d6
        pld             [r0]
        \add            q1,  q8,  q1            @ apply offset
        vrshl.s16       q1,  q1,  q9            @ rounding shift right
        vqmovun.s16     d2,  q1
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        vmov            q1,  q8
        vst1.32         {d4[0]},[r4,:32], r1
        vst1.32         {d4[1]},[r4,:32], r1
        bne             1b
        pop             {r4, pc}
2:      \add            q1,  q8,  q1            @ tail: finish the 2 loaded rows
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        pop             {r4, pc}
.endm
1833 | |
@ Weighted-prediction dispatcher.
@ In: r0 = block, r1 = stride, r2 = log2_denom, r3 = weight,
@     [sp+8] = offset (after push).
@ Two regimes: for log2_denom >= 1 (branch not taken at ble) the
@ offset is pre-shifted and added halved via vhadd/vhsub with shift
@ log2_denom-1, avoiding 16-bit overflow; for log2_denom <= 1... the
@ 20f path uses plain vadd/vsub with shift log2_denom.  Within each
@ regime the sign of the weight picks the add or sub loop body.
.macro  weight_func w
function weight_h264_pixels_\w\()_neon
        push            {r4, lr}
        ldr             r4,  [sp, #8]           @ r4 = offset
        cmp             r2,  #1
        lsl             r4,  r4,  r2
        vdup.16         q8,  r4                 @ q8 = offset << log2_denom
        mov             r4,  r0                 @ r4 = dst for the loop bodies
        ble             20f                     @ log2_denom <= 1: non-halved path
        rsb             lr,  r2,  #1
        vdup.16         q9,  lr                 @ shift = -(log2_denom-1) for vrshl
        cmp             r3,  #0
        blt             10f
        weight_\w       vhadd.s16               @ weight >= 0: halved add
10:     rsb             r3,  r3,  #0            @ negate weight to its magnitude
        weight_\w       vhsub.s16               @ weight < 0: halved subtract
20:     rsb             lr,  r2,  #0
        vdup.16         q9,  lr                 @ shift = -log2_denom
        cmp             r3,  #0
        blt             10f
        weight_\w       vadd.s16                @ weight >= 0: plain add
10:     rsb             r3,  r3,  #0
        weight_\w       vsub.s16                @ weight < 0: plain subtract
endfunc
.endm
1859 | |
@ Exported WxH weight entry: load the row count into ip and fall
@ through (b=0, placed directly before weight_func) or branch to the
@ shared width-W worker.
.macro  weight_entry w, h, b=1
function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
.if \b
        b               weight_h264_pixels_\w\()_neon
.endif
endfunc
.endm
1868 | |
@ Instantiate weight entry points; the b=0 entry of each width falls
@ straight through into its weight_func worker.
        weight_entry    16, 8
        weight_entry    16, 16, b=0
        weight_func     16

        weight_entry    8,  16
        weight_entry    8,  4
        weight_entry    8,  8,  b=0
        weight_func     8

        weight_entry    4,  8
        weight_entry    4,  2
        weight_entry    4,  4,  b=0
        weight_func     4