Mercurial > libavcodec.hg
annotate arm/h264dsp_neon.S @ 12207:44d480f49053 libavcodec
Add header declarations for mmx/sse constants missing them
author | conrad |
---|---|
date | Wed, 21 Jul 2010 10:02:07 +0000 |
parents | 69bbfd8f2ba5 |
children |
rev | line source |
---|---|
8336 | 1 /* |
2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> | |
3 * | |
4 * This file is part of FFmpeg. | |
5 * | |
6 * FFmpeg is free software; you can redistribute it and/or | |
7 * modify it under the terms of the GNU Lesser General Public | |
8 * License as published by the Free Software Foundation; either | |
9 * version 2.1 of the License, or (at your option) any later version. | |
10 * | |
11 * FFmpeg is distributed in the hope that it will be useful, | |
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 * Lesser General Public License for more details. | |
15 * | |
16 * You should have received a copy of the GNU Lesser General Public | |
17 * License along with FFmpeg; if not, write to the Free Software | |
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
19 */ | |
20 | |
21 #include "asm.S" | |
22 | |
/*
 * transpose_8x8 r0..r7: in-place transpose of 8x8 bytes held one row per
 * register.  Three rounds of VTRN are applied at 32-, 16- and 8-bit
 * granularity, pairing registers 4, 2 and 1 apart respectively.
 * All eight operands are overwritten.  Within this file the macro is
 * invoked with both D-register and Q-register operands.
 */
8338 | 23 .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7 |
24 vtrn.32 \r0, \r4 | |
25 vtrn.32 \r1, \r5 | |
26 vtrn.32 \r2, \r6 | |
27 vtrn.32 \r3, \r7 | |
28 vtrn.16 \r0, \r2 | |
29 vtrn.16 \r1, \r3 | |
30 vtrn.16 \r4, \r6 | |
31 vtrn.16 \r5, \r7 | |
32 vtrn.8 \r0, \r1 | |
33 vtrn.8 \r2, \r3 | |
34 vtrn.8 \r4, \r5 | |
35 vtrn.8 \r6, \r7 | |
36 .endm | |
37 | |
/*
 * transpose_4x4 r0..r3: in-place transpose of 4x4 byte groups spread over
 * four registers, using VTRN at 16-bit then 8-bit granularity.
 * All four operands are overwritten.
 */
9864 | 38 .macro transpose_4x4 r0 r1 r2 r3 |
39 vtrn.16 \r0, \r2 | |
40 vtrn.16 \r1, \r3 | |
41 vtrn.8 \r0, \r1 | |
42 vtrn.8 \r2, \r3 | |
43 .endm | |
44 | |
/*
 * swap4 r0..r7: exchange the contents of four register pairs
 * (r0 with r4, r1 with r5, r2 with r6, r3 with r7).
 */
8338 | 45 .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7 |
46 vswp \r0, \r4 | |
47 vswp \r1, \r5 | |
48 vswp \r2, \r6 | |
49 vswp \r3, \r7 | |
50 .endm | |
51 | |
/*
 * transpose16_4x4 r0..r7: in-place 4x4 transpose of 16-bit elements,
 * applied to two independent groups of four registers (r0-r3 and r4-r7).
 * VTRN.32 pairs registers two apart, VTRN.16 pairs adjacent registers.
 */
52 .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7 | |
53 vtrn.32 \r0, \r2 | |
54 vtrn.32 \r1, \r3 | |
55 vtrn.32 \r4, \r6 | |
56 vtrn.32 \r5, \r7 | |
57 vtrn.16 \r0, \r1 | |
58 vtrn.16 \r2, \r3 | |
59 vtrn.16 \r4, \r5 | |
60 vtrn.16 \r6, \r7 | |
61 .endm | |
62 | |
8336 | 63 /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
/*
 * h264_chroma_mc8 TYPE: emits ff_TYPE_h264_chroma_mc8_neon, where TYPE is
 * "put" or "avg".
 *
 * Register use on entry: r0 = dst, r1 = src, r2 = stride, r3 = h;
 * x and y are fetched from the stack into r4 and r5.
 * Every loop produces two 8-pixel rows per iteration and ends on
 * "subs r3, r3, #2 / bgt", so h is consumed two rows at a time.
 * For "avg", lr holds a second copy of dst that is used to read the
 * existing pixels, which are then combined with the result by VRHADD.
 */
64 .macro h264_chroma_mc8 type |
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
65 function ff_\type\()_h264_chroma_mc8_neon, export=1 |
8336 | 66 push {r4-r7, lr} |
67 ldrd r4, [sp, #20] | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
68 .ifc \type,avg |
8336 | 69 mov lr, r0 |
70 .endif | |
71 pld [r1] | |
72 pld [r1, r2] | |
73 | |
/*
 * Bilinear weights, with x in r4 and y in r5:
 *   r7 = x*y
 *   r6 = 8*y - x*y
 *   ip = 8*x - x*y
 *   r4 = x*y - 8*x - 8*y + 64
 * The flags from MULS survive to the BEQ below: when x*y == 0 the
 * reduced two-tap paths at label 2 are used instead.
 */
74 muls r7, r4, r5 | |
75 rsb r6, r7, r5, lsl #3 | |
76 rsb ip, r7, r4, lsl #3 | |
77 sub r4, r7, r4, lsl #3 | |
78 sub r4, r4, r5, lsl #3 | |
79 add r4, r4, #64 | |
80 | |
81 beq 2f | |
82 | |
83 add r5, r1, r2 | |
84 | |
85 vdup.8 d0, r4 | |
86 lsl r4, r2, #1 | |
87 vdup.8 d1, ip | |
88 vld1.64 {d4, d5}, [r1], r4 | |
89 vdup.8 d2, r6 | |
90 vld1.64 {d6, d7}, [r5], r4 | |
91 vdup.8 d3, r7 | |
92 | |
93 vext.8 d5, d4, d5, #1 | |
94 vext.8 d7, d6, d7, #1 | |
95 | |
/*
 * Four-tap loop.  r1 and r5 walk alternate source rows (step r4 = 2*stride).
 * d4/d6 hold two consecutive rows, d5/d7 the same rows shifted by one byte.
 * Each output is the weighted sum narrowed with a rounding shift by 6.
 */
96 1: pld [r5] | |
97 vmull.u8 q8, d4, d0 | |
98 vmlal.u8 q8, d5, d1 | |
99 vld1.64 {d4, d5}, [r1], r4 | |
100 vmlal.u8 q8, d6, d2 | |
101 vext.8 d5, d4, d5, #1 | |
102 vmlal.u8 q8, d7, d3 | |
103 vmull.u8 q9, d6, d0 | |
104 subs r3, r3, #2 | |
105 vmlal.u8 q9, d7, d1 | |
106 vmlal.u8 q9, d4, d2 | |
107 vmlal.u8 q9, d5, d3 | |
108 vrshrn.u16 d16, q8, #6 | |
109 vld1.64 {d6, d7}, [r5], r4 | |
110 pld [r1] | |
111 vrshrn.u16 d17, q9, #6 | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
112 .ifc \type,avg |
8336 | 113 vld1.64 {d20}, [lr,:64], r2 |
114 vld1.64 {d21}, [lr,:64], r2 | |
115 vrhadd.u8 q8, q8, q10 | |
116 .endif | |
117 vext.8 d7, d6, d7, #1 | |
118 vst1.64 {d16}, [r0,:64], r2 | |
119 vst1.64 {d17}, [r0,:64], r2 | |
120 bgt 1b | |
121 | |
122 pop {r4-r7, pc} | |
123 | |
/*
 * x*y == 0: only two taps are needed.  The two remaining non-zero
 * candidates (ip and r6) are summed into one weight in d1; d0 keeps r4.
 * TST r6 selects the path: r6 == 0 branches to label 4 (taps are
 * horizontal neighbours), otherwise label 3 is used (taps are
 * vertical neighbours).
 */
124 2: tst r6, r6 | |
125 add ip, ip, r6 | |
126 vdup.8 d0, r4 | |
127 vdup.8 d1, ip | |
128 | |
129 beq 4f | |
130 | |
131 add r5, r1, r2 | |
132 lsl r4, r2, #1 | |
133 vld1.64 {d4}, [r1], r4 | |
134 vld1.64 {d6}, [r5], r4 | |
135 | |
/* Vertical two-tap loop: each output row blends two consecutive rows. */
136 3: pld [r5] | |
137 vmull.u8 q8, d4, d0 | |
138 vmlal.u8 q8, d6, d1 | |
139 vld1.64 {d4}, [r1], r4 | |
140 vmull.u8 q9, d6, d0 | |
141 vmlal.u8 q9, d4, d1 | |
142 vld1.64 {d6}, [r5], r4 | |
143 vrshrn.u16 d16, q8, #6 | |
144 vrshrn.u16 d17, q9, #6 | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
145 .ifc \type,avg |
8336 | 146 vld1.64 {d20}, [lr,:64], r2 |
147 vld1.64 {d21}, [lr,:64], r2 | |
148 vrhadd.u8 q8, q8, q10 | |
149 .endif | |
150 subs r3, r3, #2 | |
151 pld [r1] | |
152 vst1.64 {d16}, [r0,:64], r2 | |
153 vst1.64 {d17}, [r0,:64], r2 | |
154 bgt 3b | |
155 | |
156 pop {r4-r7, pc} | |
157 | |
/*
 * Horizontal two-tap path: rows are read one at a time through r1 and
 * each output pixel blends a pixel with its right-hand neighbour.
 */
158 4: vld1.64 {d4, d5}, [r1], r2 | |
159 vld1.64 {d6, d7}, [r1], r2 | |
160 vext.8 d5, d4, d5, #1 | |
161 vext.8 d7, d6, d7, #1 | |
162 | |
163 5: pld [r1] | |
164 subs r3, r3, #2 | |
165 vmull.u8 q8, d4, d0 | |
166 vmlal.u8 q8, d5, d1 | |
167 vld1.64 {d4, d5}, [r1], r2 | |
168 vmull.u8 q9, d6, d0 | |
169 vmlal.u8 q9, d7, d1 | |
170 pld [r1] | |
171 vext.8 d5, d4, d5, #1 | |
172 vrshrn.u16 d16, q8, #6 | |
173 vrshrn.u16 d17, q9, #6 | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
174 .ifc \type,avg |
8336 | 175 vld1.64 {d20}, [lr,:64], r2 |
176 vld1.64 {d21}, [lr,:64], r2 | |
177 vrhadd.u8 q8, q8, q10 | |
178 .endif | |
179 vld1.64 {d6, d7}, [r1], r2 | |
180 vext.8 d7, d6, d7, #1 | |
181 vst1.64 {d16}, [r0,:64], r2 | |
182 vst1.64 {d17}, [r0,:64], r2 | |
183 bgt 5b | |
184 | |
185 pop {r4-r7, pc} | |
11443 | 186 endfunc |
8336 | 187 .endm |
188 | |
189 /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
/*
 * h264_chroma_mc4 TYPE: emits ff_TYPE_h264_chroma_mc4_neon, where TYPE is
 * "put" or "avg".
 *
 * Same register interface and weight computation as the 8-wide variant:
 * r0 = dst, r1 = src, r2 = stride, r3 = h, x/y loaded from the stack
 * into r4/r5.  Because only four pixels per row are produced, a row and
 * its one-byte-shifted copy are packed into the two 32-bit halves of a
 * single D register (VEXT + VTRN.32), the weights are packed the same
 * way, and the two halves of each 16-bit product vector are added with
 * VADD.I16 before the rounding shift by 6.
 * Two 4-pixel rows are written per loop iteration.
 */
190 .macro h264_chroma_mc4 type |
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
191 function ff_\type\()_h264_chroma_mc4_neon, export=1 |
8336 | 192 push {r4-r7, lr} |
193 ldrd r4, [sp, #20] | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
194 .ifc \type,avg |
8336 | 195 mov lr, r0 |
196 .endif | |
197 pld [r1] | |
198 pld [r1, r2] | |
199 | |
/*
 * r7 = x*y, r6 = 8*y - x*y, ip = 8*x - x*y, r4 = x*y - 8*x - 8*y + 64.
 * BEQ uses the flags from MULS: x*y == 0 takes the two-tap paths.
 */
200 muls r7, r4, r5 | |
201 rsb r6, r7, r5, lsl #3 | |
202 rsb ip, r7, r4, lsl #3 | |
203 sub r4, r7, r4, lsl #3 | |
204 sub r4, r4, r5, lsl #3 | |
205 add r4, r4, #64 | |
206 | |
207 beq 2f | |
208 | |
209 add r5, r1, r2 | |
210 | |
211 vdup.8 d0, r4 | |
212 lsl r4, r2, #1 | |
213 vdup.8 d1, ip | |
214 vld1.64 {d4}, [r1], r4 | |
215 vdup.8 d2, r6 | |
216 vld1.64 {d6}, [r5], r4 | |
217 vdup.8 d3, r7 | |
218 | |
219 vext.8 d5, d4, d5, #1 | |
220 vext.8 d7, d6, d7, #1 | |
221 vtrn.32 d4, d5 | |
222 vtrn.32 d6, d7 | |
223 | |
224 vtrn.32 d0, d1 | |
225 vtrn.32 d2, d3 | |
226 | |
/*
 * Four-tap loop: d4/d6 each hold {row, row shifted by one byte} and
 * d0/d2 hold the matching weight pairs.
 */
227 1: pld [r5] | |
228 vmull.u8 q8, d4, d0 | |
229 vmlal.u8 q8, d6, d2 | |
230 vld1.64 {d4}, [r1], r4 | |
231 vext.8 d5, d4, d5, #1 | |
232 vtrn.32 d4, d5 | |
233 vmull.u8 q9, d6, d0 | |
234 vmlal.u8 q9, d4, d2 | |
235 vld1.64 {d6}, [r5], r4 | |
236 vadd.i16 d16, d16, d17 | |
237 vadd.i16 d17, d18, d19 | |
238 vrshrn.u16 d16, q8, #6 | |
239 subs r3, r3, #2 | |
240 pld [r1] | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
241 .ifc \type,avg |
8336 | 242 vld1.32 {d20[0]}, [lr,:32], r2 |
243 vld1.32 {d20[1]}, [lr,:32], r2 | |
244 vrhadd.u8 d16, d16, d20 | |
245 .endif | |
246 vext.8 d7, d6, d7, #1 | |
247 vtrn.32 d6, d7 | |
248 vst1.32 {d16[0]}, [r0,:32], r2 | |
249 vst1.32 {d16[1]}, [r0,:32], r2 | |
250 bgt 1b | |
251 | |
252 pop {r4-r7, pc} | |
253 | |
/*
 * x*y == 0: two taps only.  d0 is built as {r4 weight, (ip + r6) weight}.
 * r6 == 0 branches to label 4 (horizontal neighbours); otherwise the
 * vertical loop at label 3 runs, with d1 holding the two weights in
 * swapped order for the second row of each pair.
 */
254 2: tst r6, r6 | |
255 add ip, ip, r6 | |
256 vdup.8 d0, r4 | |
257 vdup.8 d1, ip | |
258 vtrn.32 d0, d1 | |
259 | |
260 beq 4f | |
261 | |
262 vext.32 d1, d0, d1, #1 | |
263 add r5, r1, r2 | |
264 lsl r4, r2, #1 | |
265 vld1.32 {d4[0]}, [r1], r4 | |
266 vld1.32 {d4[1]}, [r5], r4 | |
267 | |
/* Vertical loop: d4 holds two consecutive 4-byte rows in its two lanes. */
268 3: pld [r5] | |
269 vmull.u8 q8, d4, d0 | |
270 vld1.32 {d4[0]}, [r1], r4 | |
271 vmull.u8 q9, d4, d1 | |
272 vld1.32 {d4[1]}, [r5], r4 | |
273 vadd.i16 d16, d16, d17 | |
274 vadd.i16 d17, d18, d19 | |
275 vrshrn.u16 d16, q8, #6 | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
276 .ifc \type,avg |
8336 | 277 vld1.32 {d20[0]}, [lr,:32], r2 |
278 vld1.32 {d20[1]}, [lr,:32], r2 | |
279 vrhadd.u8 d16, d16, d20 | |
280 .endif | |
281 subs r3, r3, #2 | |
282 pld [r1] | |
283 vst1.32 {d16[0]}, [r0,:32], r2 | |
284 vst1.32 {d16[1]}, [r0,:32], r2 | |
285 bgt 3b | |
286 | |
287 pop {r4-r7, pc} | |
288 | |
/* Horizontal path: each row is packed with its one-byte-shifted copy. */
289 4: vld1.64 {d4}, [r1], r2 | |
290 vld1.64 {d6}, [r1], r2 | |
291 vext.8 d5, d4, d5, #1 | |
292 vext.8 d7, d6, d7, #1 | |
293 vtrn.32 d4, d5 | |
294 vtrn.32 d6, d7 | |
295 | |
296 5: vmull.u8 q8, d4, d0 | |
297 vmull.u8 q9, d6, d0 | |
298 subs r3, r3, #2 | |
299 vld1.64 {d4}, [r1], r2 | |
300 vext.8 d5, d4, d5, #1 | |
301 vtrn.32 d4, d5 | |
302 vadd.i16 d16, d16, d17 | |
303 vadd.i16 d17, d18, d19 | |
304 pld [r1] | |
305 vrshrn.u16 d16, q8, #6 | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
306 .ifc \type,avg |
8336 | 307 vld1.32 {d20[0]}, [lr,:32], r2 |
308 vld1.32 {d20[1]}, [lr,:32], r2 | |
309 vrhadd.u8 d16, d16, d20 | |
310 .endif | |
311 vld1.64 {d6}, [r1], r2 | |
312 vext.8 d7, d6, d7, #1 | |
313 vtrn.32 d6, d7 | |
314 pld [r1] | |
315 vst1.32 {d16[0]}, [r0,:32], r2 | |
316 vst1.32 {d16[1]}, [r0,:32], r2 | |
317 bgt 5b | |
318 | |
319 pop {r4-r7, pc} | |
11443 | 320 endfunc |
8336 | 321 .endm |
322 | |
/*
 * h264_chroma_mc2 TYPE: emits ff_TYPE_h264_chroma_mc2_neon, where TYPE is
 * "put" or "avg".
 *
 * r0 = dst, r1 = src, r2 = stride, r3 = row count; the two values read
 * from the stack at [sp, #16] and [sp, #20] (after the 16-byte push) go
 * to r4 and lr -- presumably x and y as in the mc8/mc4 prototypes; confirm
 * against the C declaration.
 * When both are zero (ORRS sets Z) the code at label 2 just copies two
 * bytes per row ("put") or averages them with dst ("avg").
 * Otherwise the same four weights as in mc8/mc4 are computed and two
 * 2-pixel rows are produced per iteration of loop 1.
 */
10617 | 323 .macro h264_chroma_mc2 type |
324 function ff_\type\()_h264_chroma_mc2_neon, export=1 | |
325 push {r4-r6, lr} | |
326 ldr r4, [sp, #16] | |
327 ldr lr, [sp, #20] | |
328 pld [r1] | |
329 pld [r1, r2] | |
330 orrs r5, r4, lr | |
331 beq 2f | |
332 | |
/*
 * r5 = r4*lr, r6 = 8*lr - r5, r12 = 8*r4 - r5, r4 = r5 - 8*r4 - 8*lr + 64.
 * The four weights are broadcast and then interleaved with VTRN.16 so
 * that two multiply instructions cover all four taps.
 */
333 mul r5, r4, lr | |
334 rsb r6, r5, lr, lsl #3 | |
335 rsb r12, r5, r4, lsl #3 | |
336 sub r4, r5, r4, lsl #3 | |
337 sub r4, r4, lr, lsl #3 | |
338 add r4, r4, #64 | |
339 vdup.8 d0, r4 | |
340 vdup.8 d2, r12 | |
341 vdup.8 d1, r6 | |
342 vdup.8 d3, r5 | |
343 vtrn.16 q0, q1 | |
/*
 * Loop 1: reads three source rows (the third without advancing r1, so it
 * becomes the first row of the next iteration) and writes two dst rows.
 */
344 1: | |
345 vld1.32 {d4[0]}, [r1], r2 | |
346 vld1.32 {d4[1]}, [r1], r2 | |
347 vrev64.32 d5, d4 | |
348 vld1.32 {d5[1]}, [r1] | |
349 vext.8 q3, q2, q2, #1 | |
350 vtrn.16 q2, q3 | |
351 vmull.u8 q8, d4, d0 | |
352 vmlal.u8 q8, d5, d1 | |
353 .ifc \type,avg | |
354 vld1.16 {d18[0]}, [r0,:16], r2 | |
355 vld1.16 {d18[1]}, [r0,:16] | |
356 sub r0, r0, r2 | |
357 .endif | |
358 vtrn.32 d16, d17 | |
359 vadd.i16 d16, d16, d17 | |
360 vrshrn.u16 d16, q8, #6 | |
361 .ifc \type,avg | |
362 vrhadd.u8 d16, d16, d18 | |
363 .endif | |
364 vst1.16 {d16[0]}, [r0,:16], r2 | |
365 vst1.16 {d16[1]}, [r0,:16], r2 | |
366 subs r3, r3, #2 | |
367 bgt 1b | |
368 pop {r4-r6, pc} | |
/* Loop 2: no interpolation needed; copy or average two rows per pass. */
369 2: | |
370 .ifc \type,put | |
371 ldrh r5, [r1], r2 | |
372 strh r5, [r0], r2 | |
373 ldrh r6, [r1], r2 | |
374 strh r6, [r0], r2 | |
375 .else | |
376 vld1.16 {d16[0]}, [r1], r2 | |
377 vld1.16 {d16[1]}, [r1], r2 | |
378 vld1.16 {d18[0]}, [r0,:16], r2 | |
379 vld1.16 {d18[1]}, [r0,:16] | |
380 sub r0, r0, r2 | |
381 vrhadd.u8 d16, d16, d18 | |
382 vst1.16 {d16[0]}, [r0,:16], r2 | |
383 vst1.16 {d16[1]}, [r0,:16], r2 | |
384 .endif | |
385 subs r3, r3, #2 | |
386 bgt 2b | |
387 pop {r4-r6, pc} | |
11443 | 388 endfunc |
10617 | 389 .endm |
390 | |
8336 | 391 .text |
392 .align | |
393 | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
/*
 * Instantiate the chroma motion-compensation functions: one "put" and one
 * "avg" variant for each of the 8-, 4- and 2-pixel-wide macros.
 */
394 h264_chroma_mc8 put |
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
395 h264_chroma_mc8 avg |
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
396 h264_chroma_mc4 put |
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
397 h264_chroma_mc4 avg |
10617 | 398 h264_chroma_mc2 put |
399 h264_chroma_mc2 avg | |
8337 | 400 |
401 /* H.264 loop filter */ | |
402 | |
/*
 * h264_loop_filter_start: common entry sequence of the loop filters.
 *
 * - Loads the pointer passed as first stack argument and reads one 32-bit
 *   word through it (presumably the four tc0 bytes -- confirm against the
 *   C prototype); the word is kept in d24[0] for the filter macros.
 * - Returns immediately (bxeq lr) when r2 or r3 is zero; the filter
 *   macros use r2 as alpha and r3 as beta.
 * - ANDs the four bytes of the word together so that bit 31 of ip is set
 *   only when the top bit of every byte is set; the following bxlt is
 *   intended to return early in that case.
 * Must be expanded before anything is pushed, since it reads [sp] and
 * returns with a bare "bx lr".  Clobbers ip and flags.
 */
403 .macro h264_loop_filter_start | |
404 ldr ip, [sp] | |
405 tst r2, r2 | |
406 ldr ip, [ip] | |
407 tstne r3, r3 | |
408 vmov.32 d24[0], ip | |
409 and ip, ip, ip, lsl #16 | |
410 bxeq lr | |
411 ands ip, ip, ip, lsl #8 | |
412 bxlt lr | |
413 .endm | |
414 | |
/*
 * align_push_regs: save d8-d15 on the stack at a 16-byte-aligned address.
 *
 * ip = (sp & 15) + 32 is subtracted from sp, which rounds sp down to a
 * multiple of 16 and reserves 32 bytes for d12-d15; a further 32 bytes
 * are then reserved for d8-d11.  ip is left holding the first adjustment
 * and is consumed by align_pop_regs, so it must not be modified between
 * the two macros.
 */
415 .macro align_push_regs | |
416 and ip, sp, #15 | |
417 add ip, ip, #32 | |
418 sub sp, sp, ip | |
419 vst1.64 {d12-d15}, [sp,:128] | |
420 sub sp, sp, #32 | |
421 vst1.64 {d8-d11}, [sp,:128] | |
422 .endm | |
423 | |
/*
 * align_pop_regs: restore d8-d15 saved by align_push_regs.
 * The first load advances sp by 32 (writeback); the second advances sp by
 * ip, which must still hold the value computed in align_push_regs.
 */
424 .macro align_pop_regs | |
425 vld1.64 {d8-d11}, [sp,:128]! | |
426 vld1.64 {d12-d15}, [sp,:128], ip | |
427 .endm | |
428 | |
/*
 * h264_loop_filter_luma: filter 16 luma edge positions at once.
 *
 * Inputs (per the inline comments below):
 *   q10 = p2, q9 = p1, q8 = p0, q0 = q0, q1 = q1, q2 = q2
 *   r2 = alpha, r3 = beta
 *   d24[0] = four bytes set up by h264_loop_filter_start; they are
 *            widened below so that each byte is replicated over four
 *            consecutive lanes of q12.
 * Outputs, as consumed by the callers in this file:
 *   q4 = filtered p1, q8 = filtered p0, q0 = filtered q0, q5 = filtered q1
 * Also overwrites q2, q6, q7 and q10-q15.  q4-q7 are used as scratch,
 * which is why callers wrap this macro in align_push_regs/align_pop_regs.
 */
429 .macro h264_loop_filter_luma | |
430 vdup.8 q11, r2 @ alpha | |
431 vmovl.u8 q12, d24 | |
432 vabd.u8 q6, q8, q0 @ abs(p0 - q0) | |
433 vmovl.u16 q12, d24 | |
434 vabd.u8 q14, q9, q8 @ abs(p1 - p0) | |
435 vsli.16 q12, q12, #8 | |
436 vabd.u8 q15, q1, q0 @ abs(q1 - q0) | |
437 vsli.32 q12, q12, #16 | |
438 vclt.u8 q6, q6, q11 @ < alpha | |
439 vdup.8 q11, r3 @ beta | |
440 vclt.s8 q7, q12, #0 | |
441 vclt.u8 q14, q14, q11 @ < beta | |
442 vclt.u8 q15, q15, q11 @ < beta | |
443 vbic q6, q6, q7 | |
444 vabd.u8 q4, q10, q8 @ abs(p2 - p0) | |
445 vand q6, q6, q14 | |
446 vabd.u8 q5, q2, q0 @ abs(q2 - q0) | |
447 vclt.u8 q4, q4, q11 @ < beta | |
448 vand q6, q6, q15 | |
449 vclt.u8 q5, q5, q11 @ < beta | |
450 vand q4, q4, q6 | |
451 vand q5, q5, q6 | |
452 vand q12, q12, q6 | |
453 vrhadd.u8 q14, q8, q0 | |
454 vsub.i8 q6, q12, q4 | |
455 vqadd.u8 q7, q9, q12 | |
456 vhadd.u8 q10, q10, q14 | |
457 vsub.i8 q6, q6, q5 | |
458 vhadd.u8 q14, q2, q14 | |
459 vmin.u8 q7, q7, q10 | |
460 vqsub.u8 q11, q9, q12 | |
461 vqadd.u8 q2, q1, q12 | |
462 vmax.u8 q7, q7, q11 | |
463 vqsub.u8 q11, q1, q12 | |
464 vmin.u8 q14, q2, q14 | |
465 vmovl.u8 q2, d0 | |
466 vmax.u8 q14, q14, q11 | |
467 vmovl.u8 q10, d1 | |
468 vsubw.u8 q2, q2, d16 | |
469 vsubw.u8 q10, q10, d17 | |
470 vshl.i16 q2, q2, #2 | |
471 vshl.i16 q10, q10, #2 | |
472 vaddw.u8 q2, q2, d18 | |
473 vaddw.u8 q10, q10, d19 | |
474 vsubw.u8 q2, q2, d2 | |
475 vsubw.u8 q10, q10, d3 | |
476 vrshrn.i16 d4, q2, #3 | |
477 vrshrn.i16 d5, q10, #3 | |
478 vbsl q4, q7, q9 | |
479 vbsl q5, q14, q1 | |
480 vneg.s8 q7, q6 | |
481 vmovl.u8 q14, d16 | |
482 vmin.s8 q2, q2, q6 | |
483 vmovl.u8 q6, d17 | |
484 vmax.s8 q2, q2, q7 | |
485 vmovl.u8 q11, d0 | |
486 vmovl.u8 q12, d1 | |
487 vaddw.s8 q14, q14, d4 | |
488 vaddw.s8 q6, q6, d5 | |
489 vsubw.s8 q11, q11, d4 | |
490 vsubw.s8 q12, q12, d5 | |
491 vqmovun.s16 d16, q14 | |
492 vqmovun.s16 d17, q6 | |
493 vqmovun.s16 d0, q11 | |
494 vqmovun.s16 d1, q12 | |
495 .endm | |
496 | |
/*
 * ff_h264_v_loop_filter_luma_neon: filter a 16-pixel-wide luma edge whose
 * two sides lie in different rows.
 *
 * r0 = pointer to the first row after the edge (16-byte aligned, as
 * required by the :128 qualifiers), r1 = stride, r2 = alpha, r3 = beta,
 * first stack argument = pointer consumed by h264_loop_filter_start.
 * Three rows are loaded from r0 onwards (q0, q1, q2) and three from
 * r0 - 3*stride onwards (p2, p1, p0); the four filtered rows are written
 * back starting at r0 - 2*stride.
 */
497 function ff_h264_v_loop_filter_luma_neon, export=1 | |
498 h264_loop_filter_start | |
499 | |
500 vld1.64 {d0, d1}, [r0,:128], r1 | |
501 vld1.64 {d2, d3}, [r0,:128], r1 | |
502 vld1.64 {d4, d5}, [r0,:128], r1 | |
503 sub r0, r0, r1, lsl #2 | |
504 sub r0, r0, r1, lsl #1 | |
505 vld1.64 {d20,d21}, [r0,:128], r1 | |
506 vld1.64 {d18,d19}, [r0,:128], r1 | |
507 vld1.64 {d16,d17}, [r0,:128], r1 | |
508 | |
509 align_push_regs | |
510 | |
511 h264_loop_filter_luma | |
512 | |
513 sub r0, r0, r1, lsl #1 | |
514 vst1.64 {d8, d9}, [r0,:128], r1 | |
515 vst1.64 {d16,d17}, [r0,:128], r1 | |
516 vst1.64 {d0, d1}, [r0,:128], r1 | |
517 vst1.64 {d10,d11}, [r0,:128] | |
518 | |
519 align_pop_regs | |
520 bx lr | |
11443 | 521 endfunc |
8337 | 522 |
/*
 * ff_h264_h_loop_filter_luma_neon: filter a 16-row-tall luma edge whose
 * two sides lie in different columns.
 *
 * Same arguments as the vertical variant.  Eight bytes per row are read
 * starting four bytes before r0, for 16 rows; the data is transposed so
 * the shared h264_loop_filter_luma macro can be used, the four modified
 * columns are transposed back, and four bytes per row are stored starting
 * two bytes after the original load position.
 */
523 function ff_h264_h_loop_filter_luma_neon, export=1 | |
524 h264_loop_filter_start | |
525 | |
526 sub r0, r0, #4 | |
527 vld1.64 {d6}, [r0], r1 | |
528 vld1.64 {d20}, [r0], r1 | |
529 vld1.64 {d18}, [r0], r1 | |
530 vld1.64 {d16}, [r0], r1 | |
531 vld1.64 {d0}, [r0], r1 | |
532 vld1.64 {d2}, [r0], r1 | |
533 vld1.64 {d4}, [r0], r1 | |
534 vld1.64 {d26}, [r0], r1 | |
535 vld1.64 {d7}, [r0], r1 | |
536 vld1.64 {d21}, [r0], r1 | |
537 vld1.64 {d19}, [r0], r1 | |
538 vld1.64 {d17}, [r0], r1 | |
539 vld1.64 {d1}, [r0], r1 | |
540 vld1.64 {d3}, [r0], r1 | |
541 vld1.64 {d5}, [r0], r1 | |
542 vld1.64 {d27}, [r0], r1 | |
543 | |
8338 | 544 transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13 |
8337 | 545 |
546 align_push_regs | |
547 | |
548 h264_loop_filter_luma | |
549 | |
9864 | 550 transpose_4x4 q4, q8, q0, q5 |
8337 | 551 |
/* Rewind r0 by the 16 rows just read, then skip the two unmodified bytes. */
552 sub r0, r0, r1, lsl #4 | |
9864 | 553 add r0, r0, #2 |
554 vst1.32 {d8[0]}, [r0], r1 | |
555 vst1.32 {d16[0]}, [r0], r1 | |
556 vst1.32 {d0[0]}, [r0], r1 | |
557 vst1.32 {d10[0]}, [r0], r1 | |
558 vst1.32 {d8[1]}, [r0], r1 | |
559 vst1.32 {d16[1]}, [r0], r1 | |
560 vst1.32 {d0[1]}, [r0], r1 | |
561 vst1.32 {d10[1]}, [r0], r1 | |
562 vst1.32 {d9[0]}, [r0], r1 | |
563 vst1.32 {d17[0]}, [r0], r1 | |
564 vst1.32 {d1[0]}, [r0], r1 | |
565 vst1.32 {d11[0]}, [r0], r1 | |
566 vst1.32 {d9[1]}, [r0], r1 | |
567 vst1.32 {d17[1]}, [r0], r1 | |
568 vst1.32 {d1[1]}, [r0], r1 | |
569 vst1.32 {d11[1]}, [r0], r1 | |
8337 | 570 |
571 align_pop_regs | |
572 bx lr | |
11443 | 573 endfunc |
8337 | 574 |
/*
 * h264_loop_filter_chroma: filter 8 chroma edge positions at once.
 *
 * Inputs (per the inline comments below):
 *   d18 = p1, d16 = p0, d0 = q0, d2 = q1
 *   r2 = alpha, r3 = beta
 *   d24[0] = four bytes set up by h264_loop_filter_start; they are
 *            widened so that each byte covers two adjacent lanes of d24.
 * Outputs: d16 = filtered p0, d0 = filtered q0.
 * The delta ((q0 - p0) * 4 + p1 - q1, rounded shift by 3) is clamped to
 * +/- d24 and zeroed where the alpha/beta tests fail.
 * Also overwrites q2, q11-q15; d8-d15 are not touched.
 */
575 .macro h264_loop_filter_chroma | |
576 vdup.8 d22, r2 @ alpha | |
577 vmovl.u8 q12, d24 | |
578 vabd.u8 d26, d16, d0 @ abs(p0 - q0) | |
579 vmovl.u8 q2, d0 | |
580 vabd.u8 d28, d18, d16 @ abs(p1 - p0) | |
581 vsubw.u8 q2, q2, d16 | |
582 vsli.16 d24, d24, #8 | |
583 vshl.i16 q2, q2, #2 | |
584 vabd.u8 d30, d2, d0 @ abs(q1 - q0) | |
585 vaddw.u8 q2, q2, d18 | |
586 vclt.u8 d26, d26, d22 @ < alpha | |
587 vsubw.u8 q2, q2, d2 | |
588 vdup.8 d22, r3 @ beta | |
589 vrshrn.i16 d4, q2, #3 | |
590 vclt.u8 d28, d28, d22 @ < beta | |
591 vclt.u8 d30, d30, d22 @ < beta | |
12167 | 592 vmin.s8 d4, d4, d24 |
593 vneg.s8 d25, d24 | |
8337 | 594 vand d26, d26, d28 |
12167 | 595 vmax.s8 d4, d4, d25 |
8337 | 596 vand d26, d26, d30 |
12167 | 597 vmovl.u8 q11, d0 |
598 vand d4, d4, d26 | |
8337 | 599 vmovl.u8 q14, d16 |
600 vaddw.s8 q14, q14, d4 | |
601 vsubw.s8 q11, q11, d4 | |
602 vqmovun.s16 d16, q14 | |
603 vqmovun.s16 d0, q11 | |
604 .endm | |
605 | |
/*
 * ff_h264_v_loop_filter_chroma_neon: filter an 8-pixel-wide chroma edge
 * whose two sides lie in different rows.
 *
 * r0 = pointer to the first row after the edge (8-byte aligned, per the
 * :64 qualifiers), r1 = stride, r2 = alpha, r3 = beta, first stack
 * argument = pointer consumed by h264_loop_filter_start.
 * Loads rows r0 - 2*stride .. r0 + stride (p1, p0, q0, q1) and writes
 * back only the two rows adjacent to the edge (p0 and q0).
 */
606 function ff_h264_v_loop_filter_chroma_neon, export=1 | |
607 h264_loop_filter_start | |
608 | |
609 sub r0, r0, r1, lsl #1 | |
610 vld1.64 {d18}, [r0,:64], r1 | |
611 vld1.64 {d16}, [r0,:64], r1 | |
612 vld1.64 {d0}, [r0,:64], r1 | |
613 vld1.64 {d2}, [r0,:64] | |
614 | |
615 h264_loop_filter_chroma | |
616 | |
617 sub r0, r0, r1, lsl #1 | |
618 vst1.64 {d16}, [r0,:64], r1 | |
619 vst1.64 {d0}, [r0,:64], r1 | |
620 | |
621 bx lr | |
11443 | 622 endfunc |
8337 | 623 |
/*
 * ff_h264_h_loop_filter_chroma_neon: filter an 8-row-tall chroma edge
 * whose two sides lie in different columns.
 *
 * Same arguments as the vertical variant.  Four bytes per row are read
 * starting two bytes before r0, for 8 rows, and transposed so that
 * d18/d16/d0/d2 hold the p1/p0/q0/q1 columns.  After filtering, the same
 * transpose sequence is applied again to restore row order and all four
 * bytes of each row are written back to where they were read.
 */
624 function ff_h264_h_loop_filter_chroma_neon, export=1 | |
625 h264_loop_filter_start | |
626 | |
627 sub r0, r0, #2 | |
628 vld1.32 {d18[0]}, [r0], r1 | |
629 vld1.32 {d16[0]}, [r0], r1 | |
630 vld1.32 {d0[0]}, [r0], r1 | |
631 vld1.32 {d2[0]}, [r0], r1 | |
632 vld1.32 {d18[1]}, [r0], r1 | |
633 vld1.32 {d16[1]}, [r0], r1 | |
634 vld1.32 {d0[1]}, [r0], r1 | |
635 vld1.32 {d2[1]}, [r0], r1 | |
636 | |
637 vtrn.16 d18, d0 | |
638 vtrn.16 d16, d2 | |
639 vtrn.8 d18, d16 | |
640 vtrn.8 d0, d2 | |
641 | |
642 h264_loop_filter_chroma | |
643 | |
644 vtrn.16 d18, d0 | |
645 vtrn.16 d16, d2 | |
646 vtrn.8 d18, d16 | |
647 vtrn.8 d0, d2 | |
648 | |
649 sub r0, r0, r1, lsl #3 | |
650 vst1.32 {d18[0]}, [r0], r1 | |
651 vst1.32 {d16[0]}, [r0], r1 | |
652 vst1.32 {d0[0]}, [r0], r1 | |
653 vst1.32 {d2[0]}, [r0], r1 | |
654 vst1.32 {d18[1]}, [r0], r1 | |
655 vst1.32 {d16[1]}, [r0], r1 | |
656 vst1.32 {d0[1]}, [r0], r1 | |
657 vst1.32 {d2[1]}, [r0], r1 | |
658 | |
659 bx lr | |
11443 | 660 endfunc |
8338 | 661 |
662 /* H.264 qpel MC */ | |
663 | |
/*
 * lowpass_const r: load the two filter multipliers into d6.
 * The scratch core register r is set to 5 in its low half-word and 20 in
 * its high half-word and copied to the low 32 bits of d6, so that the
 * 16-bit scalars d6[0] and d6[1] read 5 and 20 respectively.
 * Clobbers r.
 */
664 .macro lowpass_const r | |
665 movw \r, #5 | |
666 movt \r, #20 | |
667 vmov.32 d6[0], \r | |
668 .endm | |
669 | |
/*
 * lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
 *
 * Apply a six-tap filter to two rows of bytes, producing 8 outputs per
 * row.  r0:r1 are the two D registers holding 16 consecutive bytes of the
 * first row, r2:r3 likewise for the second row.  For each output position
 * i the value computed is
 *     (s[i] + s[i+5]) + d6[1] * (s[i+2] + s[i+3]) - d6[0] * (s[i+1] + s[i+4])
 * using 16-bit arithmetic; lowpass_const in this file sets d6[1] = 20 and
 * d6[0] = 5.
 *
 * narrow=1: results are accumulated in q0/q8, then shifted right by 5 with
 *           rounding and unsigned saturation into the D registers d0, d1.
 * narrow=0: d0 and d1 name Q registers that receive the unshifted 16-bit
 *           sums directly.
 * Scratch: q1, q2, q9, q10, d30, d31 (plus q0 and q8 when narrow=1).
 */
670 .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1 | |
671 .if \narrow | |
672 t0 .req q0 | |
673 t1 .req q8 | |
674 .else | |
675 t0 .req \d0 | |
676 t1 .req \d1 | |
677 .endif | |
678 vext.8 d2, \r0, \r1, #2 | |
679 vext.8 d3, \r0, \r1, #3 | |
680 vaddl.u8 q1, d2, d3 | |
681 vext.8 d4, \r0, \r1, #1 | |
682 vext.8 d5, \r0, \r1, #4 | |
683 vaddl.u8 q2, d4, d5 | |
684 vext.8 d30, \r0, \r1, #5 | |
685 vaddl.u8 t0, \r0, d30 | |
686 vext.8 d18, \r2, \r3, #2 | |
687 vmla.i16 t0, q1, d6[1] | |
688 vext.8 d19, \r2, \r3, #3 | |
689 vaddl.u8 q9, d18, d19 | |
690 vext.8 d20, \r2, \r3, #1 | |
691 vmls.i16 t0, q2, d6[0] | |
692 vext.8 d21, \r2, \r3, #4 | |
693 vaddl.u8 q10, d20, d21 | |
694 vext.8 d31, \r2, \r3, #5 | |
695 vaddl.u8 t1, \r2, d31 | |
696 vmla.i16 t1, q9, d6[1] | |
697 vmls.i16 t1, q10, d6[0] | |
698 .if \narrow | |
699 vqrshrun.s16 \d0, t0, #5 | |
700 vqrshrun.s16 \d1, t1, #5 | |
701 .endif | |
702 .unreq t0 | |
703 .unreq t1 | |
704 .endm | |
705 | |
/*
 * lowpass_8_1 r0, r1, d0, narrow=1
 *
 * Single-row version of lowpass_8: filters the 16 bytes in r0:r1 into 8
 * outputs using the same six-tap formula and the multipliers in d6[1]
 * and d6[0].
 * narrow=1: result is rounded, shifted right by 5 and saturated to bytes
 *           in the D register d0 (q0 is used as accumulator).
 * narrow=0: d0 names a Q register that receives the 16-bit sums.
 * Scratch: q1, q2, d30 (plus q0 when narrow=1).
 */
706 .macro lowpass_8_1 r0, r1, d0, narrow=1 | |
707 .if \narrow | |
708 t0 .req q0 | |
709 .else | |
710 t0 .req \d0 | |
711 .endif | |
712 vext.8 d2, \r0, \r1, #2 | |
713 vext.8 d3, \r0, \r1, #3 | |
714 vaddl.u8 q1, d2, d3 | |
715 vext.8 d4, \r0, \r1, #1 | |
716 vext.8 d5, \r0, \r1, #4 | |
717 vaddl.u8 q2, d4, d5 | |
718 vext.8 d30, \r0, \r1, #5 | |
719 vaddl.u8 t0, \r0, d30 | |
720 vmla.i16 t0, q1, d6[1] | |
721 vmls.i16 t0, q2, d6[0] | |
722 .if \narrow | |
723 vqrshrun.s16 \d0, t0, #5 | |
724 .endif | |
725 .unreq t0 | |
726 .endm | |
727 | |
/*
 * lowpass_8.16 r0, r1, l0, h0, l1, h1, d
 *
 * Six-tap filter over signed 16-bit inputs, producing 8 bytes in d.
 * r0:r1 are two Q registers holding 16 consecutive 16-bit samples;
 * l0/h0 and l1/h1 must be the low/high D halves of r0 and r1 (r1 is
 * overwritten with the samples at offset 5 before l1/h1 are read).
 * Sums are widened to 32 bits and the multipliers are built from shifts:
 *     20 * (s[2] + s[3])  as  (x << 4) + (x << 2)
 *      5 * (s[1] + s[4])  as  x + (x << 2)
 * The result (s[0] + s[5]) + 20*(...) - 5*(...) is shifted right by 10
 * with rounding, then saturated to unsigned bytes.
 * Clobbers q0-q3, q8-q10, q15 and r1; d6 is overwritten, so the
 * multipliers loaded by lowpass_const are no longer valid afterwards.
 */
728 .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d | |
729 vext.16 q1, \r0, \r1, #2 | |
730 vext.16 q0, \r0, \r1, #3 | |
731 vaddl.s16 q9, d2, d0 | |
732 vext.16 q2, \r0, \r1, #1 | |
733 vaddl.s16 q1, d3, d1 | |
734 vext.16 q3, \r0, \r1, #4 | |
735 vaddl.s16 q10, d4, d6 | |
736 vext.16 \r1, \r0, \r1, #5 | |
737 vaddl.s16 q2, d5, d7 | |
738 vaddl.s16 q0, \h0, \h1 | |
739 vaddl.s16 q8, \l0, \l1 | |
740 | |
741 vshl.i32 q3, q9, #4 | |
742 vshl.i32 q9, q9, #2 | |
743 vshl.i32 q15, q10, #2 | |
744 vadd.i32 q9, q9, q3 | |
745 vadd.i32 q10, q10, q15 | |
746 | |
747 vshl.i32 q3, q1, #4 | |
748 vshl.i32 q1, q1, #2 | |
749 vshl.i32 q15, q2, #2 | |
750 vadd.i32 q1, q1, q3 | |
751 vadd.i32 q2, q2, q15 | |
752 | |
753 vadd.i32 q9, q9, q8 | |
754 vsub.i32 q9, q9, q10 | |
755 | |
756 vadd.i32 q1, q1, q0 | |
757 vsub.i32 q1, q1, q2 | |
758 | |
759 vrshrn.s32 d18, q9, #10 | |
760 vrshrn.s32 d19, q1, #10 | |
761 | |
762 vqmovun.s16 \d, q9 | |
763 .endm | |
764 | |
/*
 * put_h264_qpel16_h_lowpass_neon_packed: horizontal low-pass of a
 * 16-column, 16-row area done as two 8-column passes with a destination
 * stride of 8.
 *
 * r0 = dst, r1 = src, r2 = src stride.  r0 is not rewound between the
 * two passes, so the second 8-column half is written directly after the
 * first one in the destination buffer.
 * The return address is parked in r4 (r4 is clobbered and not saved
 * here); the second pass is a tail call.  Also sets r3 = 8 and uses ip as
 * the row counter expected by put_h264_qpel8_h_lowpass_neon.
 */
765 function put_h264_qpel16_h_lowpass_neon_packed | |
766 mov r4, lr | |
767 mov ip, #16 | |
768 mov r3, #8 | |
769 bl put_h264_qpel8_h_lowpass_neon | |
770 sub r1, r1, r2, lsl #4 | |
771 add r1, r1, #8 | |
772 mov ip, #16 | |
773 mov lr, r4 | |
774 b put_h264_qpel8_h_lowpass_neon | |
11443 | 775 endfunc |
8338 | 776 |
/*
 * h264_qpel_h_lowpass TYPE: emits TYPE_h264_qpel16_h_lowpass_neon and
 * TYPE_h264_qpel8_h_lowpass_neon, where TYPE is "put" or "avg".
 *
 * 8-wide routine: r0 = dst (8-byte aligned), r3 = dst stride, r1 = src,
 * r2 = src stride, ip = number of rows.  Two rows are handled per
 * iteration and the loop ends on "bne", so ip must be even and non-zero.
 * The filter multipliers in d6 are expected to be set up by the caller
 * (see lowpass_const).  For "avg" the result is averaged with the
 * existing dst pixels using VRHADD.
 *
 * 16-wide routine: runs the 8-wide routine on the left half with 16 rows,
 * rewinds both pointers by 16 rows, advances them by 8 bytes and restores
 * lr.  There is no branch or return after "pop {lr}": execution continues
 * into the code assembled immediately after it, which is the 8-wide
 * routine, to process the right half.
 */
10616 | 777 .macro h264_qpel_h_lowpass type |
778 function \type\()_h264_qpel16_h_lowpass_neon | |
8338 | 779 push {lr} |
780 mov ip, #16 | |
10616 | 781 bl \type\()_h264_qpel8_h_lowpass_neon |
8338 | 782 sub r0, r0, r3, lsl #4 |
783 sub r1, r1, r2, lsl #4 | |
784 add r0, r0, #8 | |
785 add r1, r1, #8 | |
786 mov ip, #16 | |
787 pop {lr} | |
11443 | 788 endfunc |
8338 | 789 |
10616 | 790 function \type\()_h264_qpel8_h_lowpass_neon |
8338 | 791 1: vld1.64 {d0, d1}, [r1], r2 |
792 vld1.64 {d16,d17}, [r1], r2 | |
793 subs ip, ip, #2 | |
794 lowpass_8 d0, d1, d16, d17, d0, d16 | |
10616 | 795 .ifc \type,avg |
796 vld1.8 {d2}, [r0,:64], r3 | |
797 vrhadd.u8 d0, d0, d2 | |
798 vld1.8 {d3}, [r0,:64] | |
799 vrhadd.u8 d16, d16, d3 | |
800 sub r0, r0, r3 | |
801 .endif | |
8338 | 802 vst1.64 {d0}, [r0,:64], r3 |
803 vst1.64 {d16}, [r0,:64], r3 | |
804 bne 1b | |
805 bx lr | |
11443 | 806 endfunc |
10616 | 807 .endm |
8338 | 808 |
/* Instantiate the put and avg variants of the horizontal low-pass routines. */
10616 | 809 h264_qpel_h_lowpass put |
810 h264_qpel_h_lowpass avg | |
811 | |
/*
 * h264_qpel_h_lowpass_l2 TYPE: emits TYPE_h264_qpel16_h_lowpass_l2_neon
 * and TYPE_h264_qpel8_h_lowpass_l2_neon, where TYPE is "put" or "avg".
 *
 * Like the plain horizontal low-pass, but the filtered rows are averaged
 * (VRHADD) with rows read from a second source before being stored.
 * 8-wide routine: r0 = dst (8-byte aligned), r1 = src, r3 = second
 * source, r2 = stride used for all three pointers, ip = number of rows
 * (even, non-zero).  d6 must hold the filter multipliers.
 * For "avg" the result is additionally averaged with the existing dst.
 *
 * 16-wide routine: processes the left half, rewinds r0, r1 and r3 by
 * 16 rows, advances each by 8 bytes and then continues straight into the
 * 8-wide routine assembled after it (no branch after "pop {lr}").
 */
812 .macro h264_qpel_h_lowpass_l2 type | |
813 function \type\()_h264_qpel16_h_lowpass_l2_neon | |
8338 | 814 push {lr} |
815 mov ip, #16 | |
10616 | 816 bl \type\()_h264_qpel8_h_lowpass_l2_neon |
8338 | 817 sub r0, r0, r2, lsl #4 |
818 sub r1, r1, r2, lsl #4 | |
819 sub r3, r3, r2, lsl #4 | |
820 add r0, r0, #8 | |
821 add r1, r1, #8 | |
822 add r3, r3, #8 | |
823 mov ip, #16 | |
824 pop {lr} | |
11443 | 825 endfunc |
8338 | 826 |
10616 | 827 function \type\()_h264_qpel8_h_lowpass_l2_neon |
8338 | 828 1: vld1.64 {d0, d1}, [r1], r2 |
829 vld1.64 {d16,d17}, [r1], r2 | |
830 vld1.64 {d28}, [r3], r2 | |
831 vld1.64 {d29}, [r3], r2 | |
832 subs ip, ip, #2 | |
833 lowpass_8 d0, d1, d16, d17, d0, d1 | |
834 vrhadd.u8 q0, q0, q14 | |
10616 | 835 .ifc \type,avg |
836 vld1.8 {d2}, [r0,:64], r2 | |
837 vrhadd.u8 d0, d0, d2 | |
838 vld1.8 {d3}, [r0,:64] | |
839 vrhadd.u8 d1, d1, d3 | |
840 sub r0, r0, r2 | |
841 .endif | |
8338 | 842 vst1.64 {d0}, [r0,:64], r2 |
843 vst1.64 {d1}, [r0,:64], r2 | |
844 bne 1b | |
845 bx lr | |
11443 | 846 endfunc |
10616 | 847 .endm |
848 | |
/* Instantiate the put and avg variants of the horizontal "l2" routines. */
849 h264_qpel_h_lowpass_l2 put | |
850 h264_qpel_h_lowpass_l2 avg | |
8338 | 851 |
/*
 * put_h264_qpel16_v_lowpass_neon_packed: vertical low-pass of a 16x16
 * area done as four 8x8 calls with a destination stride of 8.
 *
 * r0 = dst, r1 = src, r3 = src stride; r2 is set to 8 here.  r0 is never
 * rewound, so the four 8x8 results are written one after another.
 * Each 8x8 call leaves r1 twelve rows past where it started, hence the
 * "minus 4 rows" step to reach the lower block and the "minus 20 rows,
 * plus 8 bytes" step to reach the top of the right-hand column.
 * The return address is parked in r4 (clobbered, not saved here) and the
 * last call is a tail call.
 */
852 function put_h264_qpel16_v_lowpass_neon_packed | |
853 mov r4, lr | |
854 mov r2, #8 | |
855 bl put_h264_qpel8_v_lowpass_neon | |
856 sub r1, r1, r3, lsl #2 | |
857 bl put_h264_qpel8_v_lowpass_neon | |
858 sub r1, r1, r3, lsl #4 | |
859 sub r1, r1, r3, lsl #2 | |
860 add r1, r1, #8 | |
861 bl put_h264_qpel8_v_lowpass_neon | |
862 sub r1, r1, r3, lsl #2 | |
863 mov lr, r4 | |
864 b put_h264_qpel8_v_lowpass_neon | |
11443 | 865 endfunc |
8338 | 866 |
/*
 * h264_qpel_v_lowpass TYPE: emits TYPE_h264_qpel16_v_lowpass_neon and
 * TYPE_h264_qpel8_v_lowpass_neon, where TYPE is "put" or "avg".
 *
 * 8x8 routine: r0 = dst (8-byte aligned), r2 = dst stride, r1 = src,
 * r3 = src stride.  Thirteen 8-byte rows are loaded (r1 ends up twelve
 * rows past its starting value because the last load has no writeback),
 * the data is transposed so that the vertical filter can be computed with
 * the horizontal lowpass_8 macro, and the results are transposed back and
 * stored as eight rows.  d6 must hold the filter multipliers.
 * For "avg" each result row is averaged with the existing dst row.
 * Uses d8-d15 without saving them.
 *
 * 16x16 routine: calls the 8x8 routine for the top-left, bottom-left and
 * top-right blocks, adjusting r0/r1 in between, then restores lr from r4
 * and continues straight into the 8x8 routine assembled after it for the
 * bottom-right block.  r4 is clobbered and not saved here.
 */
10616 | 867 .macro h264_qpel_v_lowpass type |
868 function \type\()_h264_qpel16_v_lowpass_neon | |
8338 | 869 mov r4, lr |
10616 | 870 bl \type\()_h264_qpel8_v_lowpass_neon |
8338 | 871 sub r1, r1, r3, lsl #2 |
10616 | 872 bl \type\()_h264_qpel8_v_lowpass_neon |
8338 | 873 sub r0, r0, r2, lsl #4 |
874 add r0, r0, #8 | |
875 sub r1, r1, r3, lsl #4 | |
876 sub r1, r1, r3, lsl #2 | |
877 add r1, r1, #8 | |
10616 | 878 bl \type\()_h264_qpel8_v_lowpass_neon |
8338 | 879 sub r1, r1, r3, lsl #2 |
880 mov lr, r4 | |
11443 | 881 endfunc |
8338 | 882 |
10616 | 883 function \type\()_h264_qpel8_v_lowpass_neon |
8338 | 884 vld1.64 {d8}, [r1], r3 |
885 vld1.64 {d10}, [r1], r3 | |
886 vld1.64 {d12}, [r1], r3 | |
887 vld1.64 {d14}, [r1], r3 | |
888 vld1.64 {d22}, [r1], r3 | |
889 vld1.64 {d24}, [r1], r3 | |
890 vld1.64 {d26}, [r1], r3 | |
891 vld1.64 {d28}, [r1], r3 | |
892 vld1.64 {d9}, [r1], r3 | |
893 vld1.64 {d11}, [r1], r3 | |
894 vld1.64 {d13}, [r1], r3 | |
895 vld1.64 {d15}, [r1], r3 | |
896 vld1.64 {d23}, [r1] | |
897 | |
898 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 | |
899 lowpass_8 d8, d9, d10, d11, d8, d10 | |
900 lowpass_8 d12, d13, d14, d15, d12, d14 | |
901 lowpass_8 d22, d23, d24, d25, d22, d24 | |
902 lowpass_8 d26, d27, d28, d29, d26, d28 | |
903 transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28 | |
904 | |
10616 | 905 .ifc \type,avg |
906 vld1.8 {d9}, [r0,:64], r2 | |
907 vrhadd.u8 d8, d8, d9 | |
908 vld1.8 {d11}, [r0,:64], r2 | |
909 vrhadd.u8 d10, d10, d11 | |
910 vld1.8 {d13}, [r0,:64], r2 | |
911 vrhadd.u8 d12, d12, d13 | |
912 vld1.8 {d15}, [r0,:64], r2 | |
913 vrhadd.u8 d14, d14, d15 | |
914 vld1.8 {d23}, [r0,:64], r2 | |
915 vrhadd.u8 d22, d22, d23 | |
916 vld1.8 {d25}, [r0,:64], r2 | |
917 vrhadd.u8 d24, d24, d25 | |
918 vld1.8 {d27}, [r0,:64], r2 | |
919 vrhadd.u8 d26, d26, d27 | |
920 vld1.8 {d29}, [r0,:64], r2 | |
921 vrhadd.u8 d28, d28, d29 | |
922 sub r0, r0, r2, lsl #3 | |
923 .endif | |
924 | |
8338 | 925 vst1.64 {d8}, [r0,:64], r2 |
926 vst1.64 {d10}, [r0,:64], r2 | |
927 vst1.64 {d12}, [r0,:64], r2 | |
928 vst1.64 {d14}, [r0,:64], r2 | |
929 vst1.64 {d22}, [r0,:64], r2 | |
930 vst1.64 {d24}, [r0,:64], r2 | |
931 vst1.64 {d26}, [r0,:64], r2 | |
932 vst1.64 {d28}, [r0,:64], r2 | |
933 | |
934 bx lr | |
11443 | 935 endfunc |
10616 | 936 .endm |
8338 | 937 |
/* Instantiate the put and avg variants of the vertical low-pass routines. */
10616 | 938 h264_qpel_v_lowpass put |
939 h264_qpel_v_lowpass avg | |
940 | |
/*
 * h264_qpel_v_lowpass_l2 TYPE: emits TYPE_h264_qpel16_v_lowpass_l2_neon
 * and TYPE_h264_qpel8_v_lowpass_l2_neon, where TYPE is "put" or "avg".
 *
 * Like the plain vertical low-pass, but the filtered rows are averaged
 * (VRHADD) with eight rows read from a second source.
 * 8x8 routine: r0 = dst (8-byte aligned) and r1 = src, both stepped by
 * r3; ip = second source, stepped by r2.  d6 must hold the filter
 * multipliers.  For "avg" the result is additionally averaged with the
 * existing dst rows.  Uses d8-d15 without saving them.
 *
 * 16x16 routine: same block order as the plain vertical variant, with r0
 * and ip rewound by 16 rows and advanced by 8 bytes before the right-hand
 * column; the last block is reached by continuing into the 8x8 routine
 * assembled directly after it.  r4 holds the return address and is not
 * saved here.
 */
941 .macro h264_qpel_v_lowpass_l2 type | |
942 function \type\()_h264_qpel16_v_lowpass_l2_neon | |
8338 | 943 mov r4, lr |
10616 | 944 bl \type\()_h264_qpel8_v_lowpass_l2_neon |
8338 | 945 sub r1, r1, r3, lsl #2 |
10616 | 946 bl \type\()_h264_qpel8_v_lowpass_l2_neon |
8338 | 947 sub r0, r0, r3, lsl #4 |
948 sub ip, ip, r2, lsl #4 | |
949 add r0, r0, #8 | |
950 add ip, ip, #8 | |
951 sub r1, r1, r3, lsl #4 | |
952 sub r1, r1, r3, lsl #2 | |
953 add r1, r1, #8 | |
10616 | 954 bl \type\()_h264_qpel8_v_lowpass_l2_neon |
8338 | 955 sub r1, r1, r3, lsl #2 |
956 mov lr, r4 | |
11443 | 957 endfunc |
8338 | 958 |
10616 | 959 function \type\()_h264_qpel8_v_lowpass_l2_neon |
8338 | 960 vld1.64 {d8}, [r1], r3 |
961 vld1.64 {d10}, [r1], r3 | |
962 vld1.64 {d12}, [r1], r3 | |
963 vld1.64 {d14}, [r1], r3 | |
964 vld1.64 {d22}, [r1], r3 | |
965 vld1.64 {d24}, [r1], r3 | |
966 vld1.64 {d26}, [r1], r3 | |
967 vld1.64 {d28}, [r1], r3 | |
968 vld1.64 {d9}, [r1], r3 | |
969 vld1.64 {d11}, [r1], r3 | |
970 vld1.64 {d13}, [r1], r3 | |
971 vld1.64 {d15}, [r1], r3 | |
972 vld1.64 {d23}, [r1] | |
973 | |
974 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 | |
975 lowpass_8 d8, d9, d10, d11, d8, d9 | |
976 lowpass_8 d12, d13, d14, d15, d12, d13 | |
977 lowpass_8 d22, d23, d24, d25, d22, d23 | |
978 lowpass_8 d26, d27, d28, d29, d26, d27 | |
979 transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27 | |
980 | |
/* Load the eight rows of the second source and average them in pairs. */
981 vld1.64 {d0}, [ip], r2 | |
982 vld1.64 {d1}, [ip], r2 | |
983 vld1.64 {d2}, [ip], r2 | |
984 vld1.64 {d3}, [ip], r2 | |
985 vld1.64 {d4}, [ip], r2 | |
986 vrhadd.u8 q0, q0, q4 | |
987 vld1.64 {d5}, [ip], r2 | |
988 vrhadd.u8 q1, q1, q6 | |
989 vld1.64 {d10}, [ip], r2 | |
990 vrhadd.u8 q2, q2, q11 | |
991 vld1.64 {d11}, [ip], r2 | |
10616 | 992 vrhadd.u8 q5, q5, q13 |
993 | |
994 .ifc \type,avg | |
995 vld1.8 {d16}, [r0,:64], r3 | |
996 vrhadd.u8 d0, d0, d16 | |
997 vld1.8 {d17}, [r0,:64], r3 | |
998 vrhadd.u8 d1, d1, d17 | |
999 vld1.8 {d16}, [r0,:64], r3 | |
1000 vrhadd.u8 d2, d2, d16 | |
1001 vld1.8 {d17}, [r0,:64], r3 | |
1002 vrhadd.u8 d3, d3, d17 | |
1003 vld1.8 {d16}, [r0,:64], r3 | |
1004 vrhadd.u8 d4, d4, d16 | |
1005 vld1.8 {d17}, [r0,:64], r3 | |
1006 vrhadd.u8 d5, d5, d17 | |
1007 vld1.8 {d16}, [r0,:64], r3 | |
1008 vrhadd.u8 d10, d10, d16 | |
1009 vld1.8 {d17}, [r0,:64], r3 | |
1010 vrhadd.u8 d11, d11, d17 | |
1011 sub r0, r0, r3, lsl #3 | |
1012 .endif | |
8338 | 1013 |
1014 vst1.64 {d0}, [r0,:64], r3 | |
1015 vst1.64 {d1}, [r0,:64], r3 | |
1016 vst1.64 {d2}, [r0,:64], r3 | |
1017 vst1.64 {d3}, [r0,:64], r3 | |
1018 vst1.64 {d4}, [r0,:64], r3 | |
1019 vst1.64 {d5}, [r0,:64], r3 | |
1020 vst1.64 {d10}, [r0,:64], r3 | |
1021 vst1.64 {d11}, [r0,:64], r3 | |
1022 | |
1023 bx lr | |
11443 | 1024 endfunc |
10616 | 1025 .endm |
1026 | |
/* Instantiate the put and avg variants of the vertical "l2" routines. */
1027 h264_qpel_v_lowpass_l2 put | |
1028 h264_qpel_v_lowpass_l2 avg | |
8338 | 1029 |
/*
 * put_h264_qpel8_hv_lowpass_neon_top: compute an 8x8 block filtered both
 * horizontally and vertically, leaving the result in registers.
 *
 * r1 = src, r3 = src stride, r4 = scratch buffer (16-byte aligned, per
 * the :128 qualifiers; 12 rows of 16 bytes are written to it).
 * Step 1: 13 source rows are filtered horizontally with 16-bit results
 *         (narrow=0); the first 12 are spilled to the scratch buffer and
 *         the 13th stays in q12.
 * Step 2: the rows are reloaded and transposed (swap4 + transpose16_4x4);
 *         half of the transposed data is written back to the scratch
 *         buffer because it does not fit in registers.
 * Step 3: lowpass_8.16 applies the second filter pass along the transposed
 *         direction, eight outputs at a time, into d8-d15.
 * Step 4: a final byte transpose restores row order.  The caller visible
 *         in this file consumes the rows in the order d12, d13, d14, d15,
 *         d8, d9, ...
 * Clobbers ip, r1, r4, q0-q15 (d8-d15 are not saved) and does not touch
 * r0 or r2.
 */
1030 function put_h264_qpel8_hv_lowpass_neon_top | |
1031 lowpass_const ip | |
1032 mov ip, #12 | |
1033 1: vld1.64 {d0, d1}, [r1], r3 | |
1034 vld1.64 {d16,d17}, [r1], r3 | |
1035 subs ip, ip, #2 | |
1036 lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0 | |
1037 vst1.64 {d22-d25}, [r4,:128]! | |
1038 bne 1b | |
1039 | |
1040 vld1.64 {d0, d1}, [r1] | |
1041 lowpass_8_1 d0, d1, q12, narrow=0 | |
1042 | |
/* Walk the scratch buffer backwards, 16 bytes at a time. */
1043 mov ip, #-16 | |
1044 add r4, r4, ip | |
1045 vld1.64 {d30,d31}, [r4,:128], ip | |
1046 vld1.64 {d20,d21}, [r4,:128], ip | |
1047 vld1.64 {d18,d19}, [r4,:128], ip | |
1048 vld1.64 {d16,d17}, [r4,:128], ip | |
1049 vld1.64 {d14,d15}, [r4,:128], ip | |
1050 vld1.64 {d12,d13}, [r4,:128], ip | |
1051 vld1.64 {d10,d11}, [r4,:128], ip | |
1052 vld1.64 {d8, d9}, [r4,:128], ip | |
1053 vld1.64 {d6, d7}, [r4,:128], ip | |
1054 vld1.64 {d4, d5}, [r4,:128], ip | |
1055 vld1.64 {d2, d3}, [r4,:128], ip | |
1056 vld1.64 {d0, d1}, [r4,:128] | |
1057 | |
1058 swap4 d1, d3, d5, d7, d8, d10, d12, d14 | |
1059 transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7 | |
1060 | |
1061 swap4 d17, d19, d21, d31, d24, d26, d28, d22 | |
1062 transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11 | |
1063 | |
1064 vst1.64 {d30,d31}, [r4,:128]! | |
1065 vst1.64 {d6, d7}, [r4,:128]! | |
1066 vst1.64 {d20,d21}, [r4,:128]! | |
1067 vst1.64 {d4, d5}, [r4,:128]! | |
1068 vst1.64 {d18,d19}, [r4,:128]! | |
1069 vst1.64 {d2, d3}, [r4,:128]! | |
1070 vst1.64 {d16,d17}, [r4,:128]! | |
1071 vst1.64 {d0, d1}, [r4,:128] | |
1072 | |
1073 lowpass_8.16 q4, q12, d8, d9, d24, d25, d8 | |
1074 lowpass_8.16 q5, q13, d10, d11, d26, d27, d9 | |
1075 lowpass_8.16 q6, q14, d12, d13, d28, d29, d10 | |
1076 lowpass_8.16 q7, q11, d14, d15, d22, d23, d11 | |
1077 | |
1078 vld1.64 {d16,d17}, [r4,:128], ip | |
1079 vld1.64 {d30,d31}, [r4,:128], ip | |
1080 lowpass_8.16 q8, q15, d16, d17, d30, d31, d12 | |
1081 vld1.64 {d16,d17}, [r4,:128], ip | |
1082 vld1.64 {d30,d31}, [r4,:128], ip | |
1083 lowpass_8.16 q8, q15, d16, d17, d30, d31, d13 | |
1084 vld1.64 {d16,d17}, [r4,:128], ip | |
1085 vld1.64 {d30,d31}, [r4,:128], ip | |
1086 lowpass_8.16 q8, q15, d16, d17, d30, d31, d14 | |
1087 vld1.64 {d16,d17}, [r4,:128], ip | |
1088 vld1.64 {d30,d31}, [r4,:128] | |
1089 lowpass_8.16 q8, q15, d16, d17, d30, d31, d15 | |
1090 | |
1091 transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11 | |
1092 | |
1093 bx lr | |
11443 | 1094 endfunc |
8338 | 1095 |
/*
 * h264_qpel8_hv_lowpass type
 * Defines <type>_h264_qpel8_hv_lowpass_neon (type is "put" or "avg").
 *
 * In:   r0 = destination (64-bit aligned rows), r2 = destination stride,
 *       r1/r3/r4 as required by put_h264_qpel8_hv_lowpass_neon_top.
 * Out:  eight 8-byte rows stored; r0 ends up advanced by 8 * r2.
 * Clobbers: r10 (holds the return address across the nested call), plus
 *       everything the _top helper clobbers.
 * For "avg" the result is first combined with the existing destination rows
 * using vrhadd.u8 (rounding halving add).
 */
10616 | 1096 .macro h264_qpel8_hv_lowpass type |
1097 function \type\()_h264_qpel8_hv_lowpass_neon | |
8338 | 1098 mov r10, lr |
1099 bl put_h264_qpel8_hv_lowpass_neon_top | |
10616 | 1100 .ifc \type,avg |
1101 vld1.8 {d0}, [r0,:64], r2 | |
1102 vrhadd.u8 d12, d12, d0 | |
1103 vld1.8 {d1}, [r0,:64], r2 | |
1104 vrhadd.u8 d13, d13, d1 | |
1105 vld1.8 {d2}, [r0,:64], r2 | |
1106 vrhadd.u8 d14, d14, d2 | |
1107 vld1.8 {d3}, [r0,:64], r2 | |
1108 vrhadd.u8 d15, d15, d3 | |
1109 vld1.8 {d4}, [r0,:64], r2 | |
1110 vrhadd.u8 d8, d8, d4 | |
1111 vld1.8 {d5}, [r0,:64], r2 | |
1112 vrhadd.u8 d9, d9, d5 | |
1113 vld1.8 {d6}, [r0,:64], r2 | |
1114 vrhadd.u8 d10, d10, d6 | |
1115 vld1.8 {d7}, [r0,:64], r2 | |
1116 vrhadd.u8 d11, d11, d7 | |
/* Rewind r0 by the 8 rows just read so the stores start at row 0. */
1117 sub r0, r0, r2, lsl #3 | |
1118 .endif | |
8338 | 1119 vst1.64 {d12}, [r0,:64], r2 |
1120 vst1.64 {d13}, [r0,:64], r2 | |
1121 vst1.64 {d14}, [r0,:64], r2 | |
1122 vst1.64 {d15}, [r0,:64], r2 | |
1123 vst1.64 {d8}, [r0,:64], r2 | |
1124 vst1.64 {d9}, [r0,:64], r2 | |
1125 vst1.64 {d10}, [r0,:64], r2 | |
1126 vst1.64 {d11}, [r0,:64], r2 | |
1127 | |
1128 mov lr, r10 | |
1129 bx lr | |
11443 | 1130 endfunc |
10616 | 1131 .endm |
8338 | 1132
/* Instantiate the put and avg variants. */
10616 | 1133 h264_qpel8_hv_lowpass put |
1134 h264_qpel8_hv_lowpass avg | |
1135 | |
/*
 * h264_qpel8_hv_lowpass_l2 type
 * Defines <type>_h264_qpel8_hv_lowpass_l2_neon (type is "put" or "avg").
 *
 * Like the plain hv_lowpass function, but the result is averaged
 * (vrhadd.u8) with a second 8x8 block read from r2 before being stored.
 *
 * In:   r0 = destination, r3 = destination stride (note: r3 here, not r2),
 *       r2 = second block: 64 contiguous bytes, 128-bit aligned,
 *       r1/r3/r4 as required by put_h264_qpel8_hv_lowpass_neon_top.
 * Out:  eight rows stored; r0 advanced by 8 * r3, r2 advanced by 64.
 * Clobbers: r10 (saved return address) plus the helper's clobbers.
 */
1136 .macro h264_qpel8_hv_lowpass_l2 type | |
1137 function \type\()_h264_qpel8_hv_lowpass_l2_neon | |
8338 | 1138 mov r10, lr |
1139 bl put_h264_qpel8_hv_lowpass_neon_top | |
1140 | |
/* The helper returns rows in d12-d15, d8-d11, i.e. q6, q7, q4, q5. */
1141 vld1.64 {d0, d1}, [r2,:128]! | |
1142 vld1.64 {d2, d3}, [r2,:128]! | |
1143 vrhadd.u8 q0, q0, q6 | |
1144 vld1.64 {d4, d5}, [r2,:128]! | |
1145 vrhadd.u8 q1, q1, q7 | |
1146 vld1.64 {d6, d7}, [r2,:128]! | |
1147 vrhadd.u8 q2, q2, q4 | |
10616 | 1148 vrhadd.u8 q3, q3, q5 |
/* avg only: additionally average with the rows already in the destination. */
1149 .ifc \type,avg | |
1150 vld1.8 {d16}, [r0,:64], r3 | |
1151 vrhadd.u8 d0, d0, d16 | |
1152 vld1.8 {d17}, [r0,:64], r3 | |
1153 vrhadd.u8 d1, d1, d17 | |
1154 vld1.8 {d18}, [r0,:64], r3 | |
1155 vrhadd.u8 d2, d2, d18 | |
1156 vld1.8 {d19}, [r0,:64], r3 | |
1157 vrhadd.u8 d3, d3, d19 | |
1158 vld1.8 {d20}, [r0,:64], r3 | |
1159 vrhadd.u8 d4, d4, d20 | |
1160 vld1.8 {d21}, [r0,:64], r3 | |
1161 vrhadd.u8 d5, d5, d21 | |
1162 vld1.8 {d22}, [r0,:64], r3 | |
1163 vrhadd.u8 d6, d6, d22 | |
1164 vld1.8 {d23}, [r0,:64], r3 | |
1165 vrhadd.u8 d7, d7, d23 | |
/* Rewind r0 by the 8 rows just read. */
1166 sub r0, r0, r3, lsl #3 | |
1167 .endif | |
8338 | 1168 vst1.64 {d0}, [r0,:64], r3 |
1169 vst1.64 {d1}, [r0,:64], r3 | |
1170 vst1.64 {d2}, [r0,:64], r3 | |
1171 vst1.64 {d3}, [r0,:64], r3 | |
1172 vst1.64 {d4}, [r0,:64], r3 | |
1173 vst1.64 {d5}, [r0,:64], r3 | |
1174 vst1.64 {d6}, [r0,:64], r3 | |
1175 vst1.64 {d7}, [r0,:64], r3 | |
1176 | |
1177 mov lr, r10 | |
1178 bx lr | |
11443 | 1179 endfunc |
10616 | 1180 .endm |
8338 | 1181
/* Instantiate the put and avg variants. */
10616 | 1182 h264_qpel8_hv_lowpass_l2 put |
1183 h264_qpel8_hv_lowpass_l2 avg | |
1184 | |
/*
 * h264_qpel16_hv type
 * Defines <type>_h264_qpel16_hv_lowpass_neon and
 * <type>_h264_qpel16_hv_lowpass_l2_neon by running the matching 8x8
 * function four times over a 16x16 area: left column top, left column
 * bottom, then the same for the right column (+8 bytes).
 *
 * The pointer fix-ups between calls rely on what the 8x8 functions leave
 * behind: r1 advanced by 12 source rows and r0 advanced by 8 destination
 * rows per call.
 * r9 holds the return address; the fourth call is a tail call ("b") with lr
 * restored, so it returns directly to this function's caller.
 */
1185 .macro h264_qpel16_hv type | |
1186 function \type\()_h264_qpel16_hv_lowpass_neon | |
8338 | 1187 mov r9, lr |
10616 | 1188 bl \type\()_h264_qpel8_hv_lowpass_neon |
/* 12 rows forward, 4 back: r1 is now 8 rows below its starting point. */
8338 | 1189 sub r1, r1, r3, lsl #2 |
10616 | 1190 bl \type\()_h264_qpel8_hv_lowpass_neon |
/* Back to the first source row (16 + 4 rows up) and 8 bytes to the right. */
8338 | 1191 sub r1, r1, r3, lsl #4 |
1192 sub r1, r1, r3, lsl #2 | |
1193 add r1, r1, #8 | |
/* Destination: 16 rows up (stride is r2 in this variant), 8 bytes right. */
1194 sub r0, r0, r2, lsl #4 | |
1195 add r0, r0, #8 | |
10616 | 1196 bl \type\()_h264_qpel8_hv_lowpass_neon |
8338 | 1197 sub r1, r1, r3, lsl #2 |
1198 mov lr, r9 | |
10616 | 1199 b \type\()_h264_qpel8_hv_lowpass_neon |
11443 | 1200 endfunc |
8338 | 1201
/* l2 variant: same walk, but the destination stride is r3 and the second
 * averaging source is taken from the 256 bytes just below r4. */
10616 | 1202 function \type\()_h264_qpel16_hv_lowpass_l2_neon |
8338 | 1203 mov r9, lr |
1204 sub r2, r4, #256 | |
10616 | 1205 bl \type\()_h264_qpel8_hv_lowpass_l2_neon |
8338 | 1206 sub r1, r1, r3, lsl #2 |
10616 | 1207 bl \type\()_h264_qpel8_hv_lowpass_l2_neon |
8338 | 1208 sub r1, r1, r3, lsl #4 |
1209 sub r1, r1, r3, lsl #2 | |
1210 add r1, r1, #8 | |
1211 sub r0, r0, r3, lsl #4 | |
1212 add r0, r0, #8 | |
10616 | 1213 bl \type\()_h264_qpel8_hv_lowpass_l2_neon |
8338 | 1214 sub r1, r1, r3, lsl #2 |
1215 mov lr, r9 | |
10616 | 1216 b \type\()_h264_qpel8_hv_lowpass_l2_neon |
11443 | 1217 endfunc |
10616 | 1218 .endm |
8338 | 1219
/* Instantiate the put and avg variants. */
10616 | 1220 h264_qpel16_hv put |
1221 h264_qpel16_hv avg | |
1222 | |
/*
 * h264_qpel8 type
 * Defines the exported 8x8 entry points ff_<type>_h264_qpel8_mcXY_neon for
 * X, Y in 0..3 (except mc00), type being "put" or "avg".
 *
 * Register use on entry, as far as this code shows: r0 and r1 are the two
 * pointers saved/restored around the first pass, r1 is the one the filters
 * read from, r2 is a stride. Presumably r0 = dst, r1 = src, r2 = stride per
 * the C prototype (TODO confirm against the caller).
 *
 * Several entry points only adjust r1 and then branch to a shared body
 * through the local labels <type>_h264_qpel8_mc01 / mc11 / mc21 / mc12.
 * The branching entry point must push exactly the register list that the
 * shared body pops.
 */
1223 .macro h264_qpel8 type | |
/* mc10: horizontal filter, averaged (l2) with the block at the original r1. */
1224 function ff_\type\()_h264_qpel8_mc10_neon, export=1 | |
8338 | 1225 lowpass_const r3 |
/* r3 was only a temporary for lowpass_const; now it carries the l2 source. */
1226 mov r3, r1 | |
1227 sub r1, r1, #2 | |
1228 mov ip, #8 | |
10616 | 1229 b \type\()_h264_qpel8_h_lowpass_l2_neon |
11443 | 1230 endfunc |
8338 | 1231
/* mc20: horizontal filter only; r3 = r2. */
10616 | 1232 function ff_\type\()_h264_qpel8_mc20_neon, export=1 |
8338 | 1233 lowpass_const r3 |
1234 sub r1, r1, #2 | |
1235 mov r3, r2 | |
1236 mov ip, #8 | |
10616 | 1237 b \type\()_h264_qpel8_h_lowpass_neon |
11443 | 1238 endfunc |
8338 | 1239
/* mc30: as mc10, but the l2 source is one byte to the right (r1 + 1). */
10616 | 1240 function ff_\type\()_h264_qpel8_mc30_neon, export=1 |
8338 | 1241 lowpass_const r3 |
1242 add r3, r1, #1 | |
1243 sub r1, r1, #2 | |
1244 mov ip, #8 | |
10616 | 1245 b \type\()_h264_qpel8_h_lowpass_l2_neon |
11443 | 1246 endfunc |
8338 | 1247
/* mc01: vertical filter, averaged (l2) with the block at ip = original r1.
 * The shared body starts at the local label; mc03 enters it with
 * ip = r1 + r2 instead. d8-d15 are saved around the call. */
10616 | 1248 function ff_\type\()_h264_qpel8_mc01_neon, export=1 |
8338 | 1249 push {lr} |
1250 mov ip, r1 | |
10616 | 1251 \type\()_h264_qpel8_mc01: |
8338 | 1252 lowpass_const r3 |
1253 mov r3, r2 | |
/* Start two rows above the block. */
1254 sub r1, r1, r2, lsl #1 | |
1255 vpush {d8-d15} | |
10616 | 1256 bl \type\()_h264_qpel8_v_lowpass_l2_neon |
8338 | 1257 vpop {d8-d15} |
1258 pop {pc} | |
11443 | 1259 endfunc |
8338 | 1260
/* mc11: two passes. Pass 1 runs the "put" horizontal filter into a 64-byte
 * 16-byte-aligned stack scratch area (stride 8); pass 2 runs the vertical
 * l2 filter from the saved r1 and averages with that scratch (ip).
 * Also the shared body for mc31, mc13 and mc33. */
10616 | 1261 function ff_\type\()_h264_qpel8_mc11_neon, export=1 |
10385 | 1262 push {r0, r1, r11, lr} |
10616 | 1263 \type\()_h264_qpel8_mc11: |
8338 | 1264 lowpass_const r3 |
/* r11 remembers the unaligned sp so the frame can be unwound later. */
10385 | 1265 mov r11, sp |
1266 bic sp, sp, #15 | |
8338 | 1267 sub sp, sp, #64 |
1268 mov r0, sp | |
1269 sub r1, r1, #2 | |
1270 mov r3, #8 | |
1271 mov ip, #8 | |
1272 vpush {d8-d15} | |
1273 bl put_h264_qpel8_h_lowpass_neon | |
/* Reload the r0/r1 values that were pushed on entry. */
10385 | 1274 ldrd r0, [r11] |
8338 | 1275 mov r3, r2 |
/* The scratch area sits just above the 64 bytes of saved d8-d15. */
1276 add ip, sp, #64 | |
1277 sub r1, r1, r2, lsl #1 | |
1278 mov r2, #8 | |
10616 | 1279 bl \type\()_h264_qpel8_v_lowpass_l2_neon |
8338 | 1280 vpop {d8-d15} |
/* Drop the scratch area and the saved r0/r1 in one step. */
10385 | 1281 add sp, r11, #8 |
1282 pop {r11, pc} | |
11443 | 1283 endfunc |
8338 | 1284
/* mc21: pass 1 runs the "put" horizontal filter into the first 8*8 bytes of
 * the stack scratch area; pass 2 runs the hv l2 filter, which uses r4 as its
 * 16*12-byte work buffer and r2 = r4 - 64 as the averaging source.
 * r4 is taken from r0 as left by the first pass (presumably advanced past
 * the 64 bytes it wrote; TODO confirm in put_h264_qpel8_h_lowpass_neon).
 * Also the shared body for mc23. */
10616 | 1285 function ff_\type\()_h264_qpel8_mc21_neon, export=1 |
8338 | 1286 push {r0, r1, r4, r10, r11, lr} |
10616 | 1287 \type\()_h264_qpel8_mc21: |
8338 | 1288 lowpass_const r3 |
1289 mov r11, sp | |
1290 bic sp, sp, #15 | |
1291 sub sp, sp, #(8*8+16*12) | |
1292 sub r1, r1, #2 | |
1293 mov r3, #8 | |
1294 mov r0, sp | |
1295 mov ip, #8 | |
1296 vpush {d8-d15} | |
1297 bl put_h264_qpel8_h_lowpass_neon | |
1298 mov r4, r0 | |
1299 ldrd r0, [r11] | |
/* hv source starts 2 rows up and 2 bytes left of the saved r1. */
1300 sub r1, r1, r2, lsl #1 | |
1301 sub r1, r1, #2 | |
1302 mov r3, r2 | |
1303 sub r2, r4, #64 | |
10616 | 1304 bl \type\()_h264_qpel8_hv_lowpass_l2_neon |
8338 | 1305 vpop {d8-d15} |
1306 add sp, r11, #8 | |
1307 pop {r4, r10, r11, pc} | |
11443 | 1308 endfunc |
8338 | 1309
/* mc31: the pushed r1 is src + 1 (used by the vertical pass of mc11); the
 * live r1 is restored to src for the horizontal pass. */
10616 | 1310 function ff_\type\()_h264_qpel8_mc31_neon, export=1 |
8338 | 1311 add r1, r1, #1 |
10385 | 1312 push {r0, r1, r11, lr} |
8338 | 1313 sub r1, r1, #1 |
10616 | 1314 b \type\()_h264_qpel8_mc11 |
11443 | 1315 endfunc |
8338 | 1316
/* mc02: vertical filter only. */
10616 | 1317 function ff_\type\()_h264_qpel8_mc02_neon, export=1 |
8338 | 1318 push {lr} |
1319 lowpass_const r3 | |
1320 sub r1, r1, r2, lsl #1 | |
1321 mov r3, r2 | |
1322 vpush {d8-d15} | |
10616 | 1323 bl \type\()_h264_qpel8_v_lowpass_neon |
8338 | 1324 vpop {d8-d15} |
1325 pop {pc} | |
11443 | 1326 endfunc |
8338 | 1327
/* mc12: like mc21, but pass 1 is the "put" vertical filter (source stride
 * moved to r3, scratch stride r2 = 8). Also the shared body for mc32. */
10616 | 1328 function ff_\type\()_h264_qpel8_mc12_neon, export=1 |
8338 | 1329 push {r0, r1, r4, r10, r11, lr} |
10616 | 1330 \type\()_h264_qpel8_mc12: |
8338 | 1331 lowpass_const r3 |
1332 mov r11, sp | |
1333 bic sp, sp, #15 | |
1334 sub sp, sp, #(8*8+16*12) | |
1335 sub r1, r1, r2, lsl #1 | |
1336 mov r3, r2 | |
1337 mov r2, #8 | |
1338 mov r0, sp | |
1339 vpush {d8-d15} | |
1340 bl put_h264_qpel8_v_lowpass_neon | |
1341 mov r4, r0 | |
1342 ldrd r0, [r11] | |
/* r3 still holds the source stride here (r2 was overwritten with 8). */
1343 sub r1, r1, r3, lsl #1 | |
1344 sub r1, r1, #2 | |
1345 sub r2, r4, #64 | |
10616 | 1346 bl \type\()_h264_qpel8_hv_lowpass_l2_neon |
8338 | 1347 vpop {d8-d15} |
1348 add sp, r11, #8 | |
1349 pop {r4, r10, r11, pc} | |
11443 | 1350 endfunc |
8338 | 1351
/* mc22: hv filter only, with a 16*12-byte aligned stack work buffer in r4.
 * No r0/r1 were pushed, so sp is restored straight from r11. */
10616 | 1352 function ff_\type\()_h264_qpel8_mc22_neon, export=1 |
8338 | 1353 push {r4, r10, r11, lr} |
1354 mov r11, sp | |
1355 bic sp, sp, #15 | |
1356 sub r1, r1, r2, lsl #1 | |
1357 sub r1, r1, #2 | |
1358 mov r3, r2 | |
1359 sub sp, sp, #(16*12) | |
1360 mov r4, sp | |
1361 vpush {d8-d15} | |
10616 | 1362 bl \type\()_h264_qpel8_hv_lowpass_neon |
8338 | 1363 vpop {d8-d15} |
1364 mov sp, r11 | |
1365 pop {r4, r10, r11, pc} | |
11443 | 1366 endfunc |
8338 | 1367
/* mc32: mc12 with the first (vertical) pass one byte to the right; the
 * pushed r1 is the unmodified pointer. */
10616 | 1368 function ff_\type\()_h264_qpel8_mc32_neon, export=1 |
8338 | 1369 push {r0, r1, r4, r10, r11, lr} |
1370 add r1, r1, #1 | |
10616 | 1371 b \type\()_h264_qpel8_mc12 |
11443 | 1372 endfunc |
8338 | 1373
/* mc03: mc01 with the l2 source one row down (ip = r1 + r2). */
10616 | 1374 function ff_\type\()_h264_qpel8_mc03_neon, export=1 |
8338 | 1375 push {lr} |
1376 add ip, r1, r2 | |
10616 | 1377 b \type\()_h264_qpel8_mc01 |
11443 | 1378 endfunc |
8338 | 1379
/* mc13: mc11 with the first (horizontal) pass one row down. */
10616 | 1380 function ff_\type\()_h264_qpel8_mc13_neon, export=1 |
10385 | 1381 push {r0, r1, r11, lr} |
8338 | 1382 add r1, r1, r2 |
10616 | 1383 b \type\()_h264_qpel8_mc11 |
11443 | 1384 endfunc |
8338 | 1385
/* mc23: mc21 with the first (horizontal) pass one row down. */
10616 | 1386 function ff_\type\()_h264_qpel8_mc23_neon, export=1 |
8338 | 1387 push {r0, r1, r4, r10, r11, lr} |
1388 add r1, r1, r2 | |
10616 | 1389 b \type\()_h264_qpel8_mc21 |
11443 | 1390 endfunc |
8338 | 1391
/* mc33: pushed r1 is src + 1 (vertical pass); live r1 becomes src + stride
 * (horizontal pass) before entering the mc11 body. */
10616 | 1392 function ff_\type\()_h264_qpel8_mc33_neon, export=1 |
8338 | 1393 add r1, r1, #1 |
10385 | 1394 push {r0, r1, r11, lr} |
8338 | 1395 add r1, r1, r2 |
1396 sub r1, r1, #1 | |
10616 | 1397 b \type\()_h264_qpel8_mc11 |
11443 | 1398 endfunc |
10616 | 1399 .endm |
8338 | 1400
/* Instantiate the put and avg families. */
10616 | 1401 h264_qpel8 put |
1402 h264_qpel8 avg | |
1403 | |
/*
 * h264_qpel16 type
 * Defines the exported 16x16 entry points ff_<type>_h264_qpel16_mcXY_neon.
 * The structure mirrors the h264_qpel8 macro: simple cases tail-call a
 * filter, two-pass cases build an aligned stack scratch area, and several
 * entry points only adjust r1 before branching to a shared body at the
 * local labels <type>_h264_qpel16_mc01 / mc11 / mc21 / mc12.
 *
 * Differences visible here: no ip row count is set up for the horizontal
 * helpers, the scratch block is 16*16 = 256 bytes, and more registers are
 * saved (r4, r5, r9; r9 is used by the 16x16 hv helpers to hold lr).
 * Presumably r0 = dst, r1 = src, r2 = stride on entry (TODO confirm).
 */
1404 .macro h264_qpel16 type | |
/* mc10: horizontal filter averaged (l2) with the block at the original r1. */
1405 function ff_\type\()_h264_qpel16_mc10_neon, export=1 | |
8338 | 1406 lowpass_const r3 |
1407 mov r3, r1 | |
1408 sub r1, r1, #2 | |
10616 | 1409 b \type\()_h264_qpel16_h_lowpass_l2_neon |
11443 | 1410 endfunc |
8338 | 1411
/* mc20: horizontal filter only. */
10616 | 1412 function ff_\type\()_h264_qpel16_mc20_neon, export=1 |
8338 | 1413 lowpass_const r3 |
1414 sub r1, r1, #2 | |
1415 mov r3, r2 | |
10616 | 1416 b \type\()_h264_qpel16_h_lowpass_neon |
11443 | 1417 endfunc |
8338 | 1418
/* mc30: as mc10 with the l2 source at r1 + 1. */
10616 | 1419 function ff_\type\()_h264_qpel16_mc30_neon, export=1 |
8338 | 1420 lowpass_const r3 |
1421 add r3, r1, #1 | |
1422 sub r1, r1, #2 | |
10616 | 1423 b \type\()_h264_qpel16_h_lowpass_l2_neon |
11443 | 1424 endfunc |
8338 | 1425
/* mc01: vertical filter averaged with the block at ip = original r1.
 * Shared body for mc03, which enters with ip = r1 + r2. */
10616 | 1426 function ff_\type\()_h264_qpel16_mc01_neon, export=1 |
8338 | 1427 push {r4, lr} |
1428 mov ip, r1 | |
10616 | 1429 \type\()_h264_qpel16_mc01: |
8338 | 1430 lowpass_const r3 |
1431 mov r3, r2 | |
1432 sub r1, r1, r2, lsl #1 | |
1433 vpush {d8-d15} | |
10616 | 1434 bl \type\()_h264_qpel16_v_lowpass_l2_neon |
8338 | 1435 vpop {d8-d15} |
1436 pop {r4, pc} | |
11443 | 1437 endfunc |
8338 | 1438
/* mc11: pass 1 = "put" horizontal filter into a 256-byte aligned stack
 * scratch (stride 16); pass 2 = vertical l2 filter from the saved r1,
 * averaged with that scratch (ip). Shared body for mc31, mc13, mc33. */
10616 | 1439 function ff_\type\()_h264_qpel16_mc11_neon, export=1 |
10385 | 1440 push {r0, r1, r4, r11, lr} |
10616 | 1441 \type\()_h264_qpel16_mc11: |
8338 | 1442 lowpass_const r3 |
10385 | 1443 mov r11, sp |
1444 bic sp, sp, #15 | |
8338 | 1445 sub sp, sp, #256 |
1446 mov r0, sp | |
1447 sub r1, r1, #2 | |
1448 mov r3, #16 | |
1449 vpush {d8-d15} | |
1450 bl put_h264_qpel16_h_lowpass_neon | |
/* Reload the r0/r1 values pushed on entry. */
10385 | 1451 ldrd r0, [r11] |
8338 | 1452 mov r3, r2 |
/* Scratch starts just above the 64 bytes of saved d8-d15. */
1453 add ip, sp, #64 | |
1454 sub r1, r1, r2, lsl #1 | |
1455 mov r2, #16 | |
10616 | 1456 bl \type\()_h264_qpel16_v_lowpass_l2_neon |
8338 | 1457 vpop {d8-d15} |
/* Skip the saved r0/r1 while restoring sp. */
10385 | 1458 add sp, r11, #8 |
1459 pop {r4, r11, pc} | |
11443 | 1460 endfunc |
8338 | 1461
/* mc21: pass 1 = packed "put" horizontal filter into the scratch area;
 * pass 2 = 16x16 hv l2 filter with r4 as work buffer (that helper derives
 * its averaging source itself as r4 - 256). Shared body for mc23. */
10616 | 1462 function ff_\type\()_h264_qpel16_mc21_neon, export=1 |
8338 | 1463 push {r0, r1, r4-r5, r9-r11, lr} |
10616 | 1464 \type\()_h264_qpel16_mc21: |
8338 | 1465 lowpass_const r3 |
1466 mov r11, sp | |
1467 bic sp, sp, #15 | |
1468 sub sp, sp, #(16*16+16*12) | |
1469 sub r1, r1, #2 | |
1470 mov r0, sp | |
1471 vpush {d8-d15} | |
1472 bl put_h264_qpel16_h_lowpass_neon_packed | |
1473 mov r4, r0 | |
1474 ldrd r0, [r11] | |
1475 sub r1, r1, r2, lsl #1 | |
1476 sub r1, r1, #2 | |
1477 mov r3, r2 | |
10616 | 1478 bl \type\()_h264_qpel16_hv_lowpass_l2_neon |
8338 | 1479 vpop {d8-d15} |
1480 add sp, r11, #8 | |
1481 pop {r4-r5, r9-r11, pc} | |
11443 | 1482 endfunc |
8338 | 1483
/* mc31: pushed r1 = src + 1 (vertical pass); live r1 restored to src. */
10616 | 1484 function ff_\type\()_h264_qpel16_mc31_neon, export=1 |
8338 | 1485 add r1, r1, #1 |
10385 | 1486 push {r0, r1, r4, r11, lr} |
8338 | 1487 sub r1, r1, #1 |
10616 | 1488 b \type\()_h264_qpel16_mc11 |
11443 | 1489 endfunc |
8338 | 1490
/* mc02: vertical filter only. */
10616 | 1491 function ff_\type\()_h264_qpel16_mc02_neon, export=1 |
8338 | 1492 push {r4, lr} |
1493 lowpass_const r3 | |
1494 sub r1, r1, r2, lsl #1 | |
1495 mov r3, r2 | |
1496 vpush {d8-d15} | |
10616 | 1497 bl \type\()_h264_qpel16_v_lowpass_neon |
8338 | 1498 vpop {d8-d15} |
1499 pop {r4, pc} | |
11443 | 1500 endfunc |
8338 | 1501
/* mc12: like mc21 with the packed "put" vertical filter as pass 1.
 * Shared body for mc32. */
10616 | 1502 function ff_\type\()_h264_qpel16_mc12_neon, export=1 |
8338 | 1503 push {r0, r1, r4-r5, r9-r11, lr} |
10616 | 1504 \type\()_h264_qpel16_mc12: |
8338 | 1505 lowpass_const r3 |
1506 mov r11, sp | |
1507 bic sp, sp, #15 | |
1508 sub sp, sp, #(16*16+16*12) | |
1509 sub r1, r1, r2, lsl #1 | |
1510 mov r0, sp | |
1511 mov r3, r2 | |
1512 vpush {d8-d15} | |
1513 bl put_h264_qpel16_v_lowpass_neon_packed | |
1514 mov r4, r0 | |
1515 ldrd r0, [r11] | |
1516 sub r1, r1, r3, lsl #1 | |
1517 sub r1, r1, #2 | |
/* NOTE(review): the callee overwrites r2 with r4 - 256 as its first
 * instruction after saving lr, so this move looks redundant; confirm
 * before removing. */
1518 mov r2, r3 | |
10616 | 1519 bl \type\()_h264_qpel16_hv_lowpass_l2_neon |
8338 | 1520 vpop {d8-d15} |
1521 add sp, r11, #8 | |
1522 pop {r4-r5, r9-r11, pc} | |
11443 | 1523 endfunc |
8338 | 1524
/* mc22: hv filter only, with a 16*12-byte aligned stack work buffer in r4. */
10616 | 1525 function ff_\type\()_h264_qpel16_mc22_neon, export=1 |
8338 | 1526 push {r4, r9-r11, lr} |
1527 lowpass_const r3 | |
1528 mov r11, sp | |
1529 bic sp, sp, #15 | |
1530 sub r1, r1, r2, lsl #1 | |
1531 sub r1, r1, #2 | |
1532 mov r3, r2 | |
1533 sub sp, sp, #(16*12) | |
1534 mov r4, sp | |
1535 vpush {d8-d15} | |
10616 | 1536 bl \type\()_h264_qpel16_hv_lowpass_neon |
8338 | 1537 vpop {d8-d15} |
1538 mov sp, r11 | |
1539 pop {r4, r9-r11, pc} | |
11443 | 1540 endfunc |
8338 | 1541
/* mc32: mc12 with the first pass one byte to the right. */
10616 | 1542 function ff_\type\()_h264_qpel16_mc32_neon, export=1 |
8338 | 1543 push {r0, r1, r4-r5, r9-r11, lr} |
1544 add r1, r1, #1 | |
10616 | 1545 b \type\()_h264_qpel16_mc12 |
11443 | 1546 endfunc |
8338 | 1547
/* mc03: mc01 with the l2 source one row down. */
10616 | 1548 function ff_\type\()_h264_qpel16_mc03_neon, export=1 |
8338 | 1549 push {r4, lr} |
1550 add ip, r1, r2 | |
10616 | 1551 b \type\()_h264_qpel16_mc01 |
11443 | 1552 endfunc |
8338 | 1553
/* mc13: mc11 with the first (horizontal) pass one row down. */
10616 | 1554 function ff_\type\()_h264_qpel16_mc13_neon, export=1 |
10385 | 1555 push {r0, r1, r4, r11, lr} |
8338 | 1556 add r1, r1, r2 |
10616 | 1557 b \type\()_h264_qpel16_mc11 |
11443 | 1558 endfunc |
8338 | 1559
/* mc23: mc21 with the first (horizontal) pass one row down. */
10616 | 1560 function ff_\type\()_h264_qpel16_mc23_neon, export=1 |
8338 | 1561 push {r0, r1, r4-r5, r9-r11, lr} |
1562 add r1, r1, r2 | |
10616 | 1563 b \type\()_h264_qpel16_mc21 |
11443 | 1564 endfunc |
8338 | 1565
/* mc33: pushed r1 = src + 1; live r1 = src + stride before entering mc11. */
10616 | 1566 function ff_\type\()_h264_qpel16_mc33_neon, export=1 |
8338 | 1567 add r1, r1, #1 |
10385 | 1568 push {r0, r1, r4, r11, lr} |
8338 | 1569 add r1, r1, r2 |
1570 sub r1, r1, #1 | |
10616 | 1571 b \type\()_h264_qpel16_mc11 |
11443 | 1572 endfunc |
10616 | 1573 .endm |
1574 | |
/* Instantiate the put and avg families. */
1575 h264_qpel16 put | |
1576 h264_qpel16 avg | |
8663 | 1577 |
1578 @ Biweighted prediction | |
1579 | |
/*
 * biweight_16 macs, macd
 * Loop body for 16-pixel-wide bi-weighted blocks; expanded inside
 * biweight_func, which sets up the registers.
 *
 * macd / macs: multiply-accumulate instruction (vmlal.u8 or vmlsl.u8) used
 *       for the block read through r0 / r1 respectively.
 * In:   r0, r1 = the two input blocks (128-bit aligned rows), r2 = stride,
 *       r4, r5 = weight magnitudes (low byte is broadcast),
 *       r6 = output pointer, ip = row count (two rows per iteration),
 *       q8 = initial accumulator value, q9 = per-lane shift count for
 *       vshl.s16 (negative as set up by biweight_func, i.e. a right shift).
 * Ends with pop {r4-r6, pc}: the macro returns from the enclosing function.
 */
1580 .macro biweight_16 macs, macd | |
1581 vdup.8 d0, r4 | |
1582 vdup.8 d1, r5 | |
1583 vmov q2, q8 | |
1584 vmov q3, q8 | |
1585 1: subs ip, ip, #2 | |
/* Row 0: accumulate weight * pixel from both inputs into q2/q3. */
1586 vld1.8 {d20-d21},[r0,:128], r2 | |
1587 \macd q2, d0, d20 | |
1588 pld [r0] | |
1589 \macd q3, d0, d21 | |
1590 vld1.8 {d22-d23},[r1,:128], r2 | |
1591 \macs q2, d1, d22 | |
1592 pld [r1] | |
1593 \macs q3, d1, d23 | |
/* Row 1: same into q12/q13. */
1594 vmov q12, q8 | |
1595 vld1.8 {d28-d29},[r0,:128], r2 | |
1596 vmov q13, q8 | |
1597 \macd q12, d0, d28 | |
1598 pld [r0] | |
1599 \macd q13, d0, d29 | |
1600 vld1.8 {d30-d31},[r1,:128], r2 | |
1601 \macs q12, d1, d30 | |
1602 pld [r1] | |
1603 \macs q13, d1, d31 | |
/* Scale by q9 and narrow to unsigned bytes with saturation. */
1604 vshl.s16 q2, q2, q9 | |
1605 vshl.s16 q3, q3, q9 | |
1606 vqmovun.s16 d4, q2 | |
1607 vqmovun.s16 d5, q3 | |
1608 vshl.s16 q12, q12, q9 | |
1609 vshl.s16 q13, q13, q9 | |
1610 vqmovun.s16 d24, q12 | |
1611 vqmovun.s16 d25, q13 | |
/* Re-arm the accumulators for the next iteration, interleaved with the
 * stores (q2 is reset only after d4-d5 have been stored). */
1612 vmov q3, q8 | |
1613 vst1.8 {d4- d5}, [r6,:128], r2 | |
1614 vmov q2, q8 | |
1615 vst1.8 {d24-d25},[r6,:128], r2 | |
1616 bne 1b | |
1617 pop {r4-r6, pc} | |
1618 .endm | |
1619 | |
/*
 * biweight_8 macs, macd
 * 8-pixel-wide counterpart of biweight_16: same register contract
 * (r0/r1 inputs, r2 stride, r4/r5 weights, r6 output, ip rows, q8 initial
 * accumulator, q9 shift), two rows per iteration, 64-bit aligned rows.
 * Ends with pop {r4-r6, pc}.
 */
1620 .macro biweight_8 macs, macd | |
1621 vdup.8 d0, r4 | |
1622 vdup.8 d1, r5 | |
1623 vmov q1, q8 | |
1624 vmov q10, q8 | |
1625 1: subs ip, ip, #2 | |
/* Row 0 accumulates into q1, row 1 into q10. */
1626 vld1.8 {d4},[r0,:64], r2 | |
1627 \macd q1, d0, d4 | |
1628 pld [r0] | |
1629 vld1.8 {d5},[r1,:64], r2 | |
1630 \macs q1, d1, d5 | |
1631 pld [r1] | |
1632 vld1.8 {d6},[r0,:64], r2 | |
1633 \macd q10, d0, d6 | |
1634 pld [r0] | |
1635 vld1.8 {d7},[r1,:64], r2 | |
1636 \macs q10, d1, d7 | |
1637 pld [r1] | |
/* Scale, saturate to bytes, store, and reset the accumulators. */
1638 vshl.s16 q1, q1, q9 | |
1639 vqmovun.s16 d2, q1 | |
1640 vshl.s16 q10, q10, q9 | |
1641 vqmovun.s16 d4, q10 | |
1642 vmov q10, q8 | |
1643 vst1.8 {d2},[r6,:64], r2 | |
1644 vmov q1, q8 | |
1645 vst1.8 {d4},[r6,:64], r2 | |
1646 bne 1b | |
1647 pop {r4-r6, pc} | |
1648 .endm | |
1649 | |
/*
 * biweight_4 macs, macd
 * 4-pixel-wide counterpart of biweight_16. Two 4-byte rows are packed into
 * one d register, so an iteration normally handles four rows.
 * If fewer than four rows remain (ip goes negative, e.g. height 2), only
 * the first two rows are processed through the tail at label 2.
 * Register contract as for biweight_16; rows are 32-bit aligned.
 * Both exits end with pop {r4-r6, pc}.
 */
1650 .macro biweight_4 macs, macd | |
1651 vdup.8 d0, r4 | |
1652 vdup.8 d1, r5 | |
1653 vmov q1, q8 | |
1654 vmov q10, q8 | |
/* The flags from this subs are consumed by "blt 2f" and "bne 1b" below;
 * the NEON and pld instructions in between do not modify them. */
1655 1: subs ip, ip, #4 | |
1656 vld1.32 {d4[0]},[r0,:32], r2 | |
1657 vld1.32 {d4[1]},[r0,:32], r2 | |
1658 \macd q1, d0, d4 | |
1659 pld [r0] | |
1660 vld1.32 {d5[0]},[r1,:32], r2 | |
1661 vld1.32 {d5[1]},[r1,:32], r2 | |
1662 \macs q1, d1, d5 | |
1663 pld [r1] | |
1664 blt 2f | |
1665 vld1.32 {d6[0]},[r0,:32], r2 | |
1666 vld1.32 {d6[1]},[r0,:32], r2 | |
1667 \macd q10, d0, d6 | |
1668 pld [r0] | |
1669 vld1.32 {d7[0]},[r1,:32], r2 | |
1670 vld1.32 {d7[1]},[r1,:32], r2 | |
1671 \macs q10, d1, d7 | |
1672 pld [r1] | |
1673 vshl.s16 q1, q1, q9 | |
1674 vqmovun.s16 d2, q1 | |
1675 vshl.s16 q10, q10, q9 | |
1676 vqmovun.s16 d4, q10 | |
1677 vmov q10, q8 | |
1678 vst1.32 {d2[0]},[r6,:32], r2 | |
1679 vst1.32 {d2[1]},[r6,:32], r2 | |
1680 vmov q1, q8 | |
1681 vst1.32 {d4[0]},[r6,:32], r2 | |
1682 vst1.32 {d4[1]},[r6,:32], r2 | |
1683 bne 1b | |
1684 pop {r4-r6, pc} | |
/* Tail: only two rows were available. */
1685 2: vshl.s16 q1, q1, q9 | |
1686 vqmovun.s16 d2, q1 | |
1687 vst1.32 {d2[0]},[r6,:32], r2 | |
1688 vst1.32 {d2[1]},[r6,:32], r2 | |
1689 pop {r4-r6, pc} | |
1690 .endm | |
1691 | |
/*
 * biweight_func w
 * Defines biweight_h264_pixels_<w>_neon, the common body behind the
 * ff_biweight_h264_pixels_<w>x<h>_neon entry points (which set ip = height).
 *
 * In:   r0, r1 = the two blocks (r0 is also the output, copied to r6),
 *       r2 = stride, r3 = shift amount, ip = height,
 *       three words on the stack loaded into r4, r5, r6. r4 and r5 are the
 *       two weights; r6 is presumably the offset (TODO confirm against the
 *       C prototype).
 * Setup: q8 = ((r6 + 1) | 1) << r3, broadcast as 16-bit lanes (initial
 *       accumulator); q9 = ~r3 = -(r3 + 1) per lane, so vshl.s16 by q9 is an
 *       arithmetic right shift by r3 + 1.
 * Dispatch: the weights are multiplied as unsigned bytes, so their signs are
 *       handled by negating negative weights and choosing vmlal (add) or
 *       vmlsl (subtract) per input. lr = (r4 >> 31) ^ (r5 >> 30), logical
 *       shifts, which yields 0, 1, 2 or 3 for ordinary small weights:
 *         0 -> 10: both non-negative
 *         1 -> 20: r4 negative
 *         2 -> 30: both negative
 *         3 -> 40: r5 negative
 * Each biweight_<w> expansion ends in its own pop {r4-r6, pc}, so control
 * never falls from one case into the next.
 */
1692 .macro biweight_func w | |
1693 function biweight_h264_pixels_\w\()_neon | |
1694 push {r4-r6, lr} | |
/* Stack arguments start 16 bytes up, above the four registers just pushed. */
1695 add r4, sp, #16 | |
1696 ldm r4, {r4-r6} | |
1697 lsr lr, r4, #31 | |
1698 add r6, r6, #1 | |
/* The flags set by this eors decide the first "beq 10f" below; none of the
 * instructions in between modify them. */
1699 eors lr, lr, r5, lsr #30 | |
1700 orr r6, r6, #1 | |
1701 vdup.16 q9, r3 | |
1702 lsl r6, r6, r3 | |
1703 vmvn q9, q9 | |
1704 vdup.16 q8, r6 | |
1705 mov r6, r0 | |
1706 beq 10f | |
1707 subs lr, lr, #1 | |
1708 beq 20f | |
1709 subs lr, lr, #1 | |
1710 beq 30f | |
1711 b 40f | |
1712 10: biweight_\w vmlal.u8, vmlal.u8 | |
1713 20: rsb r4, r4, #0 | |
1714 biweight_\w vmlal.u8, vmlsl.u8 | |
1715 30: rsb r4, r4, #0 | |
1716 rsb r5, r5, #0 | |
1717 biweight_\w vmlsl.u8, vmlsl.u8 | |
1718 40: rsb r5, r5, #0 | |
1719 biweight_\w vmlsl.u8, vmlal.u8 | |
11443 | 1720 endfunc |
8663 | 1721 .endm |
1722 | |
/*
 * biweight_entry w, h, b=1
 * Defines the exported ff_biweight_h264_pixels_<w>x<h>_neon: loads the
 * height into ip and branches to biweight_h264_pixels_<w>_neon.
 * With b=0 the branch is omitted and execution falls through into whatever
 * follows, so a b=0 entry must be placed immediately before the matching
 * biweight_func expansion (as done below).
 */
1723 .macro biweight_entry w, h, b=1 | |
1724 function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1 | |
1725 mov ip, #\h | |
1726 .if \b | |
1727 b biweight_h264_pixels_\w\()_neon | |
1728 .endif | |
11443 | 1729 endfunc |
8663 | 1730 .endm |
1731 | |
/* For each width the last entry uses b=0 and falls through into the body. */
1732 biweight_entry 16, 8 | |
1733 biweight_entry 16, 16, b=0 | |
1734 biweight_func 16 | |
1735 | |
1736 biweight_entry 8, 16 | |
1737 biweight_entry 8, 4 | |
1738 biweight_entry 8, 8, b=0 | |
1739 biweight_func 8 | |
1740 | |
1741 biweight_entry 4, 8 | |
1742 biweight_entry 4, 2 | |
1743 biweight_entry 4, 4, b=0 | |
1744 biweight_func 4 | |
8664 | 1745 |
1746 @ Weighted prediction | |
1747 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
/*
 * weight_16 add
 * Loop body for 16-pixel-wide weighted prediction; expanded inside
 * weight_func, which sets up the registers.
 *
 * add:  instruction that combines the offset term with the product; the
 *       macro is expanded with vadd.s16, vsub.s16, vhadd.s16 or vhsub.s16.
 *       It is applied as "add dst, q8, product", so the sub forms compute
 *       q8 - product.
 * In:   r0 = input rows (128-bit aligned), r1 = stride, r3 = weight
 *       magnitude (low byte broadcast), r4 = output pointer,
 *       ip = row count (two rows per iteration),
 *       q8 = offset term, q9 = per-lane shift count for vrshl.s16.
 * Ends with pop {r4, pc}: the macro returns from the enclosing function.
 */
1748 .macro weight_16 add |
8664 | 1749 vdup.8 d0, r3 |
1750 1: subs ip, ip, #2 | |
1751 vld1.8 {d20-d21},[r0,:128], r1 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1752 vmull.u8 q2, d0, d20 |
8664 | 1753 pld [r0] |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1754 vmull.u8 q3, d0, d21 |
8664 | 1755 vld1.8 {d28-d29},[r0,:128], r1 |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1756 vmull.u8 q12, d0, d28 |
8664 | 1757 pld [r0] |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1758 vmull.u8 q13, d0, d29 |
/* Row 0: combine with the offset term, rounding shift, saturate to bytes. */
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1759 \add q2, q8, q2 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1760 vrshl.s16 q2, q2, q9 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1761 \add q3, q8, q3 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1762 vrshl.s16 q3, q3, q9 |
8664 | 1763 vqmovun.s16 d4, q2 |
1764 vqmovun.s16 d5, q3 | |
/* Row 1: same. */
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1765 \add q12, q8, q12 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1766 vrshl.s16 q12, q12, q9 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1767 \add q13, q8, q13 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1768 vrshl.s16 q13, q13, q9 |
8664 | 1769 vqmovun.s16 d24, q12 |
1770 vqmovun.s16 d25, q13 | |
1771 vst1.8 {d4- d5}, [r4,:128], r1 | |
1772 vst1.8 {d24-d25},[r4,:128], r1 | |
1773 bne 1b | |
1774 pop {r4, pc} | |
1775 .endm | |
1776 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
/*
 * weight_8 add
 * 8-pixel-wide counterpart of weight_16: same register contract (r0 input,
 * r1 stride, r3 weight, r4 output, ip rows, q8 offset term, q9 shift),
 * two 64-bit aligned rows per iteration. Ends with pop {r4, pc}.
 */
1777 .macro weight_8 add |
8664 | 1778 vdup.8 d0, r3 |
1779 1: subs ip, ip, #2 | |
1780 vld1.8 {d4},[r0,:64], r1 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1781 vmull.u8 q1, d0, d4 |
8664 | 1782 pld [r0] |
1783 vld1.8 {d6},[r0,:64], r1 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1784 vmull.u8 q10, d0, d6 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1785 \add q1, q8, q1 |
8664 | 1786 pld [r0] |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1787 vrshl.s16 q1, q1, q9 |
8664 | 1788 vqmovun.s16 d2, q1 |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1789 \add q10, q8, q10 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1790 vrshl.s16 q10, q10, q9 |
8664 | 1791 vqmovun.s16 d4, q10 |
1792 vst1.8 {d2},[r4,:64], r1 | |
1793 vst1.8 {d4},[r4,:64], r1 | |
1794 bne 1b | |
1795 pop {r4, pc} | |
1796 .endm | |
1797 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
/*
 * weight_4 add
 * 4-pixel-wide counterpart of weight_16. Two 4-byte rows share one d
 * register, so an iteration normally handles four rows; if fewer than four
 * remain (ip goes negative, e.g. height 2) the two rows already multiplied
 * are finished at label 2. Register contract as for weight_16; rows are
 * 32-bit aligned. Both exits end with pop {r4, pc}.
 *
 * NOTE(review): the "vmov q1, q8" / "vmov q10, q8" instructions look
 * redundant here, because vmull.u8 overwrites its whole destination before
 * q1/q10 are read. Confirm before removing.
 */
1798 .macro weight_4 add |
8664 | 1799 vdup.8 d0, r3 |
1800 vmov q1, q8 | |
1801 vmov q10, q8 | |
/* The flags from this subs feed both "blt 2f" and the final "bne 1b". */
1802 1: subs ip, ip, #4 | |
1803 vld1.32 {d4[0]},[r0,:32], r1 | |
1804 vld1.32 {d4[1]},[r0,:32], r1 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1805 vmull.u8 q1, d0, d4 |
8664 | 1806 pld [r0] |
1807 blt 2f | |
1808 vld1.32 {d6[0]},[r0,:32], r1 | |
1809 vld1.32 {d6[1]},[r0,:32], r1 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1810 vmull.u8 q10, d0, d6 |
8664 | 1811 pld [r0] |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1812 \add q1, q8, q1 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1813 vrshl.s16 q1, q1, q9 |
8664 | 1814 vqmovun.s16 d2, q1 |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1815 \add q10, q8, q10 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1816 vrshl.s16 q10, q10, q9 |
8664 | 1817 vqmovun.s16 d4, q10 |
1818 vmov q10, q8 | |
1819 vst1.32 {d2[0]},[r4,:32], r1 | |
1820 vst1.32 {d2[1]},[r4,:32], r1 | |
1821 vmov q1, q8 | |
1822 vst1.32 {d4[0]},[r4,:32], r1 | |
1823 vst1.32 {d4[1]},[r4,:32], r1 | |
1824 bne 1b | |
1825 pop {r4, pc} | |
/* Tail for a final pair of rows. */
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1826 2: \add q1, q8, q1 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1827 vrshl.s16 q1, q1, q9 |
8664 | 1828 vqmovun.s16 d2, q1 |
1829 vst1.32 {d2[0]},[r4,:32], r1 | |
1830 vst1.32 {d2[1]},[r4,:32], r1 | |
1831 pop {r4, pc} | |
1832 .endm | |
1833 | |
/*
 * weight_func w
 * Defines weight_h264_pixels_<w>_neon, the common body behind the
 * ff_weight_h264_pixels_<w>x<h>_neon entry points (which set ip = height).
 *
 * In:   r0 = block (read in place and, via r4, written back),
 *       r1 = stride, r2 = shift amount, r3 = signed weight,
 *       first stack word loaded into r4 and shifted left by r2 to form the
 *       offset term in q8 (presumably the prediction offset; TODO confirm
 *       against the C prototype), ip = height.
 * Paths:
 *   r2 > 1:  q9 = 1 - r2 and the halving forms vhadd/vhsub are used. The
 *            halving add supplies one bit of the shift, and the remaining
 *            r2 - 1 bits come from the rounding shift vrshl. Per the
 *            changeset title this avoids a corner-case 16-bit overflow.
 *   r2 <= 1: q9 = -r2 with plain vadd/vsub.
 *   In both paths a negative weight is negated and the sub form is used,
 *   because the multiply treats the weight as an unsigned byte.
 * Each weight_<w> expansion ends in its own pop {r4, pc}, so the numeric
 * labels 10/20 are only reached through the branches.
 */
1834 .macro weight_func w | |
1835 function weight_h264_pixels_\w\()_neon | |
1836 push {r4, lr} | |
/* First stack argument: 8 bytes up, above the two registers just pushed. */
1837 ldr r4, [sp, #8] | |
/* The flags from this cmp are consumed by "ble 20f" below; the lsl, vdup
 * and mov in between do not modify them. */
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1838 cmp r2, #1 |
8664 | 1839 lsl r4, r4, r2 |
1840 vdup.16 q8, r4 | |
1841 mov r4, r0 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1842 ble 20f |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1843 rsb lr, r2, #1 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1844 vdup.16 q9, lr |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1845 cmp r3, #0 |
8664 | 1846 blt 10f |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1847 weight_\w vhadd.s16 |
8664 | 1848 10: rsb r3, r3, #0 |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1849 weight_\w vhsub.s16 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1850 20: rsb lr, r2, #0 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1851 vdup.16 q9, lr |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1852 cmp r3, #0 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1853 blt 10f |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1854 weight_\w vadd.s16 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1855 10: rsb r3, r3, #0 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1856 weight_\w vsub.s16 |
11443 | 1857 endfunc |
8664 | 1858 .endm |
1859 | |
/*
 * weight_entry w, h, b=1
 * Defines the exported ff_weight_h264_pixels_<w>x<h>_neon: loads the height
 * into ip and branches to weight_h264_pixels_<w>_neon.
 * With b=0 the branch is omitted and execution falls through into whatever
 * follows, so a b=0 entry must be placed immediately before the matching
 * weight_func expansion (as done below).
 */
1860 .macro weight_entry w, h, b=1 | |
1861 function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1 | |
1862 mov ip, #\h | |
1863 .if \b | |
1864 b weight_h264_pixels_\w\()_neon | |
1865 .endif | |
11443 | 1866 endfunc |
8664 | 1867 .endm |
1868 | |
/* For each width the last entry uses b=0 and falls through into the body. */
1869 weight_entry 16, 8 | |
1870 weight_entry 16, 16, b=0 | |
1871 weight_func 16 | |
1872 | |
1873 weight_entry 8, 16 | |
1874 weight_entry 8, 4 | |
1875 weight_entry 8, 8, b=0 | |
1876 weight_func 8 | |
1877 | |
1878 weight_entry 4, 8 | |
1879 weight_entry 4, 2 | |
1880 weight_entry 4, 4, b=0 | |
1881 weight_func 4 |