@ Source: libavcodec (Mercurial) arm/h264dsp_neon.S, rev 10616:d3b98479ef62
@ "ARM: NEON 16x16 and 8x8 avg qpel MC" — mru, Wed, 02 Dec 2009 00:37:33 +0000
@ (parent: bc98e5724513; child: 5506cbb012b4)

/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

@ Transpose an 8x8 byte matrix held in eight 64-bit rows \r0-\r7.
@ Works by interleaving at successively finer granularity (32-, 16-,
@ then 8-bit vtrn).  Callers pass either d- or q-register operands.
.macro  transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r4
        vtrn.32         \r1, \r5
        vtrn.32         \r2, \r6
        vtrn.32         \r3, \r7
        vtrn.16         \r0, \r2
        vtrn.16         \r1, \r3
        vtrn.16         \r4, \r6
        vtrn.16         \r5, \r7
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
        vtrn.8          \r4, \r5
        vtrn.8          \r6, \r7
.endm
37 | |
@ Transpose a 4x4 block of bytes held in rows \r0-\r3
@ (16-bit vtrn followed by 8-bit vtrn).
.macro  transpose_4x4 r0 r1 r2 r3
        vtrn.16         \r0, \r2
        vtrn.16         \r1, \r3
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
.endm
44 | |
@ Swap two groups of four registers pairwise: \r0<->\r4 ... \r3<->\r7.
.macro  swap4 r0 r1 r2 r3 r4 r5 r6 r7
        vswp            \r0, \r4
        vswp            \r1, \r5
        vswp            \r2, \r6
        vswp            \r3, \r7
.endm
51 | |
@ Transpose 4x4 blocks of 16-bit elements across eight q-registers
@ (32-bit vtrn followed by 16-bit vtrn on each half).
.macro  transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r2
        vtrn.32         \r1, \r3
        vtrn.32         \r4, \r6
        vtrn.32         \r5, \r7
        vtrn.16         \r0, \r1
        vtrn.16         \r2, \r3
        vtrn.16         \r4, \r5
        vtrn.16         \r6, \r7
.endm
62 | |
/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
@ 8-wide H.264 chroma bilinear MC.  \type is "put" or "avg".
@ In:  r0 = dst, r1 = src, r2 = stride, r3 = h; x/y on the stack
@      (ldrd below: r4 = x, r5 = y; sp+20 skips the 5 pushed regs).
@ Weights after the setup arithmetic:
@      r4 = (8-x)*(8-y), ip = x*(8-y), r6 = (8-x)*y, r7 = x*y
@ Three paths: full bilinear (x*y != 0), one-tap vertical or
@ horizontal (label 3), and horizontal-only (label 4/5).
.macro  h264_chroma_mc8 type
function ff_\type\()_h264_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
.ifc \type,avg
        mov             lr,  r0                 @ keep dst for the averaging reads
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7,  r4,  r5            @ r7 = x*y; Z set if either is 0
        rsb             r6,  r7,  r5,  lsl #3   @ r6 = (8-x)*y
        rsb             ip,  r7,  r4,  lsl #3   @ ip = x*(8-y)
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64           @ r4 = (8-x)*(8-y)

        beq             2f                      @ x*y == 0 -> degenerate cases

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1            @ r4 = 2*stride (two rows per iter)
        vdup.8          d1,  ip
        vld1.64         {d4, d5}, [r1], r4
        vdup.8          d2,  r6
        vld1.64         {d6, d7}, [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1       @ d5 = row shifted left one pixel
        vext.8          d7,  d6,  d7,  #1

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.64         {d4, d5}, [r1], r4
        vmlal.u8        q8,  d6,  d2
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        vrshrn.u16      d16, q8,  #6            @ round((sum)/64)
        vld1.64         {d6, d7}, [r5], r4
        pld             [r1]
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10           @ average with existing dst
.endif
        vext.8          d7,  d6,  d7,  #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6                 @ r6 == 0 means y == 0 too
        add             ip,  ip,  r6            @ merge into a single 2-tap weight
        vdup.8          d0,  r4
        vdup.8          d1,  ip

        beq             4f                      @ horizontal-only path

        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.64         {d4}, [r1], r4
        vld1.64         {d6}, [r5], r4

3:      pld             [r5]                    @ vertical 2-tap filter
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.64         {d4}, [r1], r4
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        vld1.64         {d6}, [r5], r4
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.64         {d4, d5}, [r1], r2      @ horizontal 2-tap filter
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

5:      pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.64         {d4, d5}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
.endfunc
.endm
188 | |
/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
@ 4-wide variant of the chroma MC above.  Same register/weight setup;
@ two 4-pixel rows are packed into one d-register (vtrn.32) so a single
@ vmull/vmlal pair filters two output rows at once.
.macro  h264_chroma_mc4 type
function ff_\type\()_h264_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
.ifc \type,avg
        mov             lr,  r0
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7,  r4,  r5            @ r7 = x*y, flags for beq below
        rsb             r6,  r7,  r5,  lsl #3   @ r6 = (8-x)*y
        rsb             ip,  r7,  r4,  lsl #3   @ ip = x*(8-y)
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64           @ r4 = (8-x)*(8-y)

        beq             2f

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  ip
        vld1.64         {d4},     [r1], r4
        vdup.8          d2,  r6
        vld1.64         {d6},     [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5                 @ pack [row | row+1px] in one dreg
        vtrn.32         d6,  d7

        vtrn.32         d0,  d1                 @ pack the weights the same way
        vtrn.32         d2,  d3

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.64         {d4},     [r1], r4
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vld1.64         {d6},     [r5], r4
        vadd.i16        d16, d16, d17           @ fold the two packed halves
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
        subs            r3,  r3,  #2
        pld             [r1]
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             ip,  ip,  r6
        vdup.8          d0,  r4
        vdup.8          d1,  ip
        vtrn.32         d0,  d1

        beq             4f

        vext.32         d1,  d0,  d1,  #1
        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.32         {d4[0]},  [r1], r4
        vld1.32         {d4[1]},  [r5], r4

3:      pld             [r5]                    @ vertical 2-tap filter
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]},  [r1], r4
        vmull.u8        q9,  d4,  d1
        vld1.32         {d4[1]},  [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.64         {d4},     [r1], r2      @ horizontal 2-tap filter
        vld1.64         {d6},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

5:      vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vld1.64         {d4},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
        vrshrn.u16      d16, q8,  #6
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vld1.64         {d6},     [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
.endfunc
.endm
322 | |
323 .text | |
324 .align | |
325 | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
326 h264_chroma_mc8 put |
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
327 h264_chroma_mc8 avg |
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
328 h264_chroma_mc4 put |
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
329 h264_chroma_mc4 avg |
/* H.264 loop filter */

@ Common entry for the loop-filter functions: load tc0 (pointer is the
@ first stack argument), stash it in d24[0], and early-return (bx lr)
@ if alpha (r2) or beta (r3) is zero or all tc0 values are negative.
.macro  h264_loop_filter_start
        ldr             ip,  [sp]
        tst             r2,  r2
        ldr             ip,  [ip]
        tstne           r3,  r3
        vmov.32         d24[0], ip
        and             ip,  ip,  ip, lsl #16
        bxeq            lr
        ands            ip,  ip,  ip, lsl #8
        bxlt            lr
.endm
344 | |
@ Spill d8-d15 (callee-saved NEON regs) to a 16-byte-aligned area
@ carved out of the stack; ip keeps the alignment pad for the pop.
.macro  align_push_regs
        and             ip,  sp,  #15
        add             ip,  ip,  #32
        sub             sp,  sp,  ip
        vst1.64         {d12-d15}, [sp,:128]
        sub             sp,  sp,  #32
        vst1.64         {d8-d11},  [sp,:128]
.endm
353 | |
@ Restore d8-d15 and release the aligned area (ip must still hold the
@ pad computed by align_push_regs).
.macro  align_pop_regs
        vld1.64         {d8-d11},  [sp,:128]!
        vld1.64         {d12-d15}, [sp,:128], ip
.endm
358 | |
@ Filter one 16-pixel luma edge.  Inputs (set up by the callers):
@   q10/q9/q8 = p2/p1/p0,  q0/q1/q2 = q0/q1/q2,
@   r2 = alpha, r3 = beta, d24[0] = packed tc0 bytes.
@ Outputs: filtered p1 in q4 (via vbsl), p0 in q8, q0 in q0, q1 in q5.
.macro  h264_loop_filter_luma
        vdup.8          q11, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
        vmovl.u16       q12, d24
        vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
        vsli.16         q12, q12, #8
        vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
        vsli.32         q12, q12, #16   @ replicate each tc0 across 4 lanes
        vclt.u8         q6,  q6,  q11   @ < alpha
        vdup.8          q11, r3         @ beta
        vclt.s8         q7,  q12, #0    @ tc0 < 0 -> lane disabled
        vclt.u8         q14, q14, q11   @ < beta
        vclt.u8         q15, q15, q11   @ < beta
        vbic            q6,  q6,  q7
        vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
        vand            q6,  q6,  q14
        vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
        vclt.u8         q4,  q4,  q11   @ < beta
        vand            q6,  q6,  q15   @ q6 = overall filter-enable mask
        vclt.u8         q5,  q5,  q11   @ < beta
        vand            q4,  q4,  q6    @ p1-update mask
        vand            q5,  q5,  q6    @ q1-update mask
        vand            q12, q12, q6
        vrhadd.u8       q14, q8,  q0
        vsub.i8         q6,  q12, q4    @ tc = tc0 + masks (per spec)
        vqadd.u8        q7,  q9,  q12
        vhadd.u8        q10, q10, q14
        vsub.i8         q6,  q6,  q5
        vhadd.u8        q14, q2,  q14
        vmin.u8         q7,  q7,  q10   @ clip new p1 to p1 +/- tc0
        vqsub.u8        q11, q9,  q12
        vqadd.u8        q2,  q1,  q12
        vmax.u8         q7,  q7,  q11
        vqsub.u8        q11, q1,  q12
        vmin.u8         q14, q2,  q14   @ clip new q1 to q1 +/- tc0
        vmovl.u8        q2,  d0
        vmax.u8         q14, q14, q11
        vmovl.u8        q10, d1
        vsubw.u8        q2,  q2,  d16   @ delta = ((q0-p0)<<2 + p1-q1 + 4) >> 3
        vsubw.u8        q10, q10, d17
        vshl.i16        q2,  q2,  #2
        vshl.i16        q10, q10, #2
        vaddw.u8        q2,  q2,  d18
        vaddw.u8        q10, q10, d19
        vsubw.u8        q2,  q2,  d2
        vsubw.u8        q10, q10, d3
        vrshrn.i16      d4,  q2,  #3
        vrshrn.i16      d5,  q10, #3
        vbsl            q4,  q7,  q9    @ select new/old p1 per mask
        vbsl            q5,  q14, q1    @ select new/old q1 per mask
        vneg.s8         q7,  q6
        vmovl.u8        q14, d16
        vmin.s8         q2,  q2,  q6    @ clamp delta to [-tc, tc]
        vmovl.u8        q6,  d17
        vmax.s8         q2,  q2,  q7
        vmovl.u8        q11, d0
        vmovl.u8        q12, d1
        vaddw.s8        q14, q14, d4    @ p0 += delta
        vaddw.s8        q6,  q6,  d5
        vsubw.s8        q11, q11, d4    @ q0 -= delta
        vsubw.s8        q12, q12, d5
        vqmovun.s16     d16, q14
        vqmovun.s16     d17, q6
        vqmovun.s16     d0,  q11
        vqmovun.s16     d1,  q12
.endm
426 | |
@ Vertical (horizontal-edge) luma deblocking filter.
@ r0 = pix (points at q0 row), r1 = stride, r2 = alpha, r3 = beta,
@ tc0 pointer on the stack (consumed by h264_loop_filter_start).
function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        vld1.64         {d0, d1},  [r0,:128], r1        @ q0
        vld1.64         {d2, d3},  [r0,:128], r1        @ q1
        vld1.64         {d4, d5},  [r0,:128], r1        @ q2
        sub             r0,  r0,  r1, lsl #2
        sub             r0,  r0,  r1, lsl #1            @ back up to p2
        vld1.64         {d20,d21}, [r0,:128], r1        @ p2
        vld1.64         {d18,d19}, [r0,:128], r1        @ p1
        vld1.64         {d16,d17}, [r0,:128], r1        @ p0

        align_push_regs

        h264_loop_filter_luma

        sub             r0,  r0,  r1, lsl #1
        vst1.64         {d8, d9},  [r0,:128], r1        @ p1
        vst1.64         {d16,d17}, [r0,:128], r1        @ p0
        vst1.64         {d0, d1},  [r0,:128], r1        @ q0
        vst1.64         {d10,d11}, [r0,:128]            @ q1

        align_pop_regs
        bx              lr
.endfunc
452 | |
@ Horizontal (vertical-edge) luma deblocking filter: load 16 rows of 8
@ pixels straddling the edge, transpose so the filter can run as in the
@ vertical case, then transpose back only the 4 modified columns.
function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #4
        vld1.64         {d6},  [r0], r1
        vld1.64         {d20}, [r0], r1
        vld1.64         {d18}, [r0], r1
        vld1.64         {d16}, [r0], r1
        vld1.64         {d0},  [r0], r1
        vld1.64         {d2},  [r0], r1
        vld1.64         {d4},  [r0], r1
        vld1.64         {d26}, [r0], r1
        vld1.64         {d7},  [r0], r1
        vld1.64         {d21}, [r0], r1
        vld1.64         {d19}, [r0], r1
        vld1.64         {d17}, [r0], r1
        vld1.64         {d1},  [r0], r1
        vld1.64         {d3},  [r0], r1
        vld1.64         {d5},  [r0], r1
        vld1.64         {d27}, [r0], r1

        transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13

        align_push_regs

        h264_loop_filter_luma

        @ only p1/p0/q0/q1 changed -> 4x4 transpose of those columns
        transpose_4x4   q4, q8, q0, q5

        sub             r0,  r0,  r1, lsl #4
        add             r0,  r0,  #2
        vst1.32         {d8[0]},  [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]},  [r0], r1
        vst1.32         {d10[0]}, [r0], r1
        vst1.32         {d8[1]},  [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]},  [r0], r1
        vst1.32         {d10[1]}, [r0], r1
        vst1.32         {d9[0]},  [r0], r1
        vst1.32         {d17[0]}, [r0], r1
        vst1.32         {d1[0]},  [r0], r1
        vst1.32         {d11[0]}, [r0], r1
        vst1.32         {d9[1]},  [r0], r1
        vst1.32         {d17[1]}, [r0], r1
        vst1.32         {d1[1]},  [r0], r1
        vst1.32         {d11[1]}, [r0], r1

        align_pop_regs
        bx              lr
.endfunc
504 | |
@ Filter one 8-pixel chroma edge.
@ Inputs: d18 = p1, d16 = p0, d0 = q0, d2 = q1,
@         r2 = alpha, r3 = beta, d24[0] = packed tc0 bytes.
@ Outputs: filtered p0 in d16, q0 in d0 (p1/q1 unchanged for chroma).
.macro  h264_loop_filter_chroma
        vdup.8          d22, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         d26, d16, d0    @ abs(p0 - q0)
        vmovl.u8        q2,  d0
        vabd.u8         d28, d18, d16   @ abs(p1 - p0)
        vsubw.u8        q2,  q2,  d16
        vsli.16         d24, d24, #8    @ replicate tc0 per pixel pair
        vshl.i16        q2,  q2,  #2
        vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
        vaddw.u8        q2,  q2,  d18
        vclt.u8         d26, d26, d22   @ < alpha
        vsubw.u8        q2,  q2,  d2
        vdup.8          d22, r3         @ beta
        vclt.s8         d25, d24, #0    @ tc0 < 0 -> disabled
        vrshrn.i16      d4,  q2,  #3    @ delta
        vclt.u8         d28, d28, d22   @ < beta
        vbic            d26, d26, d25
        vclt.u8         d30, d30, d22   @ < beta
        vand            d26, d26, d28
        vneg.s8         d25, d24
        vand            d26, d26, d30   @ d26 = filter-enable mask
        vmin.s8         d4,  d4,  d24   @ clamp delta to [-tc, tc]
        vmovl.u8        q14, d16
        vand            d4,  d4,  d26
        vmax.s8         d4,  d4,  d25
        vmovl.u8        q11, d0
        vaddw.s8        q14, q14, d4    @ p0 += delta
        vsubw.s8        q11, q11, d4    @ q0 -= delta
        vqmovun.s16     d16, q14
        vqmovun.s16     d0,  q11
.endm
537 | |
@ Vertical (horizontal-edge) chroma deblocking filter.
@ r0 = pix, r1 = stride, r2 = alpha, r3 = beta, tc0 pointer on stack.
function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  r1, lsl #1
        vld1.64         {d18}, [r0,:64], r1     @ p1
        vld1.64         {d16}, [r0,:64], r1     @ p0
        vld1.64         {d0},  [r0,:64], r1     @ q0
        vld1.64         {d2},  [r0,:64]         @ q1

        h264_loop_filter_chroma

        sub             r0,  r0,  r1, lsl #1
        vst1.64         {d16}, [r0,:64], r1
        vst1.64         {d0},  [r0,:64], r1

        bx              lr
.endfunc
555 | |
@ Horizontal (vertical-edge) chroma deblocking filter: gather 8 rows of
@ 4 pixels, transpose, filter, transpose back, scatter.
function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #2
        vld1.32         {d18[0]}, [r0], r1
        vld1.32         {d16[0]}, [r0], r1
        vld1.32         {d0[0]},  [r0], r1
        vld1.32         {d2[0]},  [r0], r1
        vld1.32         {d18[1]}, [r0], r1
        vld1.32         {d16[1]}, [r0], r1
        vld1.32         {d0[1]},  [r0], r1
        vld1.32         {d2[1]},  [r0], r1

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        h264_loop_filter_chroma

        @ transpose back before scattering the rows out
        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        sub             r0,  r0,  r1, lsl #3
        vst1.32         {d18[0]}, [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]},  [r0], r1
        vst1.32         {d2[0]},  [r0], r1
        vst1.32         {d18[1]}, [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]},  [r0], r1
        vst1.32         {d2[1]},  [r0], r1

        bx              lr
.endfunc
/* H.264 qpel MC */

@ Load the 6-tap filter constants 5 and 20 into d6[0] (low half = 5,
@ high half = 20) for use as vmla/vmls scalar operands.
.macro  lowpass_const r
        movw            \r,  #5
        movt            \r,  #20
        vmov.32         d6[0], \r
.endm
601 | |
@ 6-tap qpel lowpass on two 8-pixel rows (\r0:\r1 and \r2:\r3, each a
@ 16-byte window).  Filter: x[-2]+x[3] - 5*(x[-1]+x[2]) + 20*(x[0]+x[1]),
@ taps taken from d6 (see lowpass_const).  With narrow=1 the results are
@ rounded/narrowed into \d0/\d1; with narrow=0 the 16-bit sums land there.
.macro  lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
.if \narrow
t0      .req    q0
t1      .req    q8
.else
t0      .req    \d0
t1      .req    \d1
.endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3            @ x[0] + x[1] (center taps)
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5            @ x[-1] + x[2]
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30           @ x[-2] + x[3]
        vext.8          d18, \r2, \r3, #2
        vmla.i16        t0,  q1,  d6[1]         @ + 20 * center
        vext.8          d19, \r2, \r3, #3
        vaddl.u8        q9,  d18, d19
        vext.8          d20, \r2, \r3, #1
        vmls.i16        t0,  q2,  d6[0]         @ - 5 * inner
        vext.8          d21, \r2, \r3, #4
        vaddl.u8        q10, d20, d21
        vext.8          d31, \r2, \r3, #5
        vaddl.u8        t1,  \r2, d31
        vmla.i16        t1,  q9,  d6[1]
        vmls.i16        t1,  q10, d6[0]
.if \narrow
        vqrshrun.s16    \d0, t0,  #5
        vqrshrun.s16    \d1, t1,  #5
.endif
        .unreq  t0
        .unreq  t1
.endm
637 | |
@ Single-row variant of lowpass_8: filter one 8-pixel row from the
@ 16-byte window \r0:\r1 into \d0 (16-bit sums if narrow=0).
.macro  lowpass_8_1 r0, r1, d0, narrow=1
.if \narrow
t0      .req    q0
.else
t0      .req    \d0
.endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vmla.i16        t0,  q1,  d6[1]
        vmls.i16        t0,  q2,  d6[0]
.if \narrow
        vqrshrun.s16    \d0, t0,  #5
.endif
        .unreq  t0
.endm
659 | |
@ 6-tap lowpass over 16-bit intermediates (second pass of the 2-D hv
@ filter).  \r0:\r1 is a sliding window of 16-bit sums; \l0/\h0/\l1/\h1
@ are the outer-tap halves.  The 5/20 taps are applied as shifts/adds
@ (20x = 16x + 4x, 5x carried by the subtract of 4x+x below), the
@ 32-bit sums are rounded down by 10 and saturated into byte result \d.
.macro  lowpass_8.16 r0, r1, l0, h0, l1, h1, d
        vext.16         q1,  \r0, \r1, #2
        vext.16         q0,  \r0, \r1, #3
        vaddl.s16       q9,  d2,  d0            @ center taps, low half
        vext.16         q2,  \r0, \r1, #1
        vaddl.s16       q1,  d3,  d1            @ center taps, high half
        vext.16         q3,  \r0, \r1, #4
        vaddl.s16       q10, d4,  d6            @ inner taps, low half
        vext.16         \r1, \r0, \r1, #5
        vaddl.s16       q2,  d5,  d7            @ inner taps, high half
        vaddl.s16       q0,  \h0, \h1           @ outer taps, high half
        vaddl.s16       q8,  \l0, \l1           @ outer taps, low half

        vshl.i32        q3,  q9,  #4
        vshl.i32        q9,  q9,  #2
        vshl.i32        q15, q10, #2
        vadd.i32        q9,  q9,  q3            @ 20 * center (low)
        vadd.i32        q10, q10, q15           @ 5 * inner (low)

        vshl.i32        q3,  q1,  #4
        vshl.i32        q1,  q1,  #2
        vshl.i32        q15, q2,  #2
        vadd.i32        q1,  q1,  q3            @ 20 * center (high)
        vadd.i32        q2,  q2,  q15           @ 5 * inner (high)

        vadd.i32        q9,  q9,  q8
        vsub.i32        q9,  q9,  q10

        vadd.i32        q1,  q1,  q0
        vsub.i32        q1,  q1,  q2

        vrshrn.s32      d18, q9,  #10
        vrshrn.s32      d19, q1,  #10

        vqmovun.s16     \d,  q9
.endm
696 | |
@ 16-wide horizontal lowpass emitted as two packed 8-wide passes.
@ Saves/restores lr in r4 around the first call; the second call is a
@ tail-branch (b) so it returns straight to our caller.
function put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  lr
        mov             ip,  #16                @ 16 rows per 8-wide pass
        mov             r3,  #8
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r1,  r1,  r2, lsl #4    @ rewind src 16 rows
        add             r1,  r1,  #8            @ right half
        mov             ip,  #16
        mov             lr,  r4
        b               put_h264_qpel8_h_lowpass_neon
.endfunc
708 | |
@ 8/16-wide horizontal qpel lowpass, put or avg flavour.
@ r0 = dst (stride r3), r1 = src (stride r2), ip = row count.
@ The 16-wide entry runs the left half then falls through (after
@ pop {lr}) into the 8-wide loop for the right half.
.macro  h264_qpel_h_lowpass type
function \type\()_h264_qpel16_h_lowpass_neon
        push            {lr}
        mov             ip,  #16
        bl              \type\()_h264_qpel8_h_lowpass_neon
        sub             r0,  r0,  r3,  lsl #4   @ rewind dst/src 16 rows
        sub             r1,  r1,  r2,  lsl #4
        add             r0,  r0,  #8            @ step to the right half
        add             r1,  r1,  #8
        mov             ip,  #16
        pop             {lr}
.endfunc

function \type\()_h264_qpel8_h_lowpass_neon
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d16,d17}, [r1], r2
        subs            ip,  ip,  #2            @ two rows per iteration
        lowpass_8       d0,  d1,  d16, d17, d0,  d16
.ifc \type,avg
        vld1.8          {d2},     [r0,:64], r3
        vrhadd.u8       d0,  d0,  d2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       d16, d16, d3
        sub             r0,  r0,  r3
.endif
        vst1.64         {d0},     [r0,:64], r3
        vst1.64         {d16},    [r0,:64], r3
        bne             1b
        bx              lr
.endfunc
.endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg
743 | |
@ Horizontal lowpass averaged with a second plane (l2): the filtered row
@ is vrhadd'ed with rows read from r3 before the put/avg store.
@ r0 = dst, r1 = src, r2 = stride, r3 = second source, ip = row count.
.macro  h264_qpel_h_lowpass_l2 type
function \type\()_h264_qpel16_h_lowpass_l2_neon
        push            {lr}
        mov             ip,  #16
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        sub             r0,  r0,  r2,  lsl #4
        sub             r1,  r1,  r2,  lsl #4
        sub             r3,  r3,  r2,  lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        add             r3,  r3,  #8
        mov             ip,  #16
        pop             {lr}                    @ fall through: right half
.endfunc

function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d16,d17}, [r1], r2
        vld1.64         {d28},     [r3], r2
        vld1.64         {d29},     [r3], r2
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, d0,  d1
        vrhadd.u8       q0,  q0,  q14           @ blend with second plane
.ifc \type,avg
        vld1.8          {d2},      [r0,:64], r2
        vrhadd.u8       d0,  d0,  d2
        vld1.8          {d3},      [r0,:64]
        vrhadd.u8       d1,  d1,  d3
        sub             r0,  r0,  r2
.endif
        vst1.64         {d0},      [r0,:64], r2
        vst1.64         {d1},      [r0,:64], r2
        bne             1b
        bx              lr
.endfunc
.endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg
8338 | 783 |
@ 16-wide vertical lowpass built from four packed 8-wide passes
@ (two 8-row strips per column half); tail-branches into the last pass.
function put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  lr
        mov             r2,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8            @ right half of the source
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        b               put_h264_qpel8_v_lowpass_neon
.endfunc
798 | |
@ 8/16-wide vertical qpel lowpass, put or avg flavour.
@ r0 = dst (stride r2), r1 = src (stride r3).  The 8-wide worker loads
@ 13 rows, transposes, reuses the horizontal lowpass_8, and transposes
@ back before storing.
.macro  h264_qpel_v_lowpass type
function \type\()_h264_qpel16_v_lowpass_neon
        mov             r4,  lr
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4                 @ fall through: last strip
.endfunc

function \type\()_h264_qpel8_v_lowpass_neon
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d10
        lowpass_8       d12, d13, d14, d15, d12, d14
        lowpass_8       d22, d23, d24, d25, d22, d24
        lowpass_8       d26, d27, d28, d29, d26, d28
        transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28

.ifc \type,avg
        vld1.8          {d9},  [r0,:64], r2
        vrhadd.u8       d8,  d8,  d9
        vld1.8          {d11}, [r0,:64], r2
        vrhadd.u8       d10, d10, d11
        vld1.8          {d13}, [r0,:64], r2
        vrhadd.u8       d12, d12, d13
        vld1.8          {d15}, [r0,:64], r2
        vrhadd.u8       d14, d14, d15
        vld1.8          {d23}, [r0,:64], r2
        vrhadd.u8       d22, d22, d23
        vld1.8          {d25}, [r0,:64], r2
        vrhadd.u8       d24, d24, d25
        vld1.8          {d27}, [r0,:64], r2
        vrhadd.u8       d26, d26, d27
        vld1.8          {d29}, [r0,:64], r2
        vrhadd.u8       d28, d28, d29
        sub             r0,  r0,  r2,  lsl #3   @ rewind dst 8 rows
.endif

        vst1.64         {d8},  [r0,:64], r2
        vst1.64         {d10}, [r0,:64], r2
        vst1.64         {d12}, [r0,:64], r2
        vst1.64         {d14}, [r0,:64], r2
        vst1.64         {d22}, [r0,:64], r2
        vst1.64         {d24}, [r0,:64], r2
        vst1.64         {d26}, [r0,:64], r2
        vst1.64         {d28}, [r0,:64], r2

        bx              lr
.endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg
872 | |
@ Vertical lowpass averaged with a second plane read via ip.
@ r0 = dst (stride r3), r1 = src (stride r3), ip = second source
@ (stride r2).
.macro  h264_qpel_v_lowpass_l2 type
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             r4,  lr
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r0,  r0,  r3, lsl #4
        sub             ip,  ip,  r2, lsl #4
        add             r0,  r0,  #8
        add             ip,  ip,  #8
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4                 @ fall through: last strip
.endfunc

function \type\()_h264_qpel8_v_lowpass_l2_neon
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d9
        lowpass_8       d12, d13, d14, d15, d12, d13
        lowpass_8       d22, d23, d24, d25, d22, d23
        lowpass_8       d26, d27, d28, d29, d26, d27
        transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27

        vld1.64         {d0},  [ip], r2         @ blend with second plane
        vld1.64         {d1},  [ip], r2
        vld1.64         {d2},  [ip], r2
        vld1.64         {d3},  [ip], r2
        vld1.64         {d4},  [ip], r2
        vrhadd.u8       q0,  q0,  q4
        vld1.64         {d5},  [ip], r2
        vrhadd.u8       q1,  q1,  q6
        vld1.64         {d10}, [ip], r2
        vrhadd.u8       q2,  q2,  q11
        vld1.64         {d11}, [ip], r2
        vrhadd.u8       q5,  q5,  q13

.ifc \type,avg
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d0,  d0,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d1,  d1,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d2,  d2,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d3,  d3,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d4,  d4,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d5,  d5,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d10, d10, d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d11, d11, d17
        sub             r0,  r0,  r3,  lsl #3
.endif

        vst1.64         {d0},  [r0,:64], r3
        vst1.64         {d1},  [r0,:64], r3
        vst1.64         {d2},  [r0,:64], r3
        vst1.64         {d3},  [r0,:64], r3
        vst1.64         {d4},  [r0,:64], r3
        vst1.64         {d5},  [r0,:64], r3
        vst1.64         {d10}, [r0,:64], r3
        vst1.64         {d11}, [r0,:64], r3

        bx              lr
.endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg
8338 | 961 |
@ 2-D (horizontal+vertical) 8x8 qpel lowpass, shared top half.
@ Horizontal pass: 13 rows of 16-bit sums written to scratch at r4.
@ The rows are then reloaded bottom-up, transposed in 4x4 tiles, and
@ run through lowpass_8.16 for the vertical pass.  Result ends up in
@ d12-d15 / d8-d11 for the per-type bottom halves below.
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   ip
        mov             ip,  #12
1:      vld1.64         {d0, d1},  [r1], r3
        vld1.64         {d16,d17}, [r1], r3
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
        vst1.64         {d22-d25}, [r4,:128]!
        bne             1b

        vld1.64         {d0, d1},  [r1]
        lowpass_8_1     d0,  d1,  q12, narrow=0

        mov             ip,  #-16               @ walk the scratch backwards
        add             r4,  r4,  ip
        vld1.64         {d30,d31}, [r4,:128], ip
        vld1.64         {d20,d21}, [r4,:128], ip
        vld1.64         {d18,d19}, [r4,:128], ip
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d14,d15}, [r4,:128], ip
        vld1.64         {d12,d13}, [r4,:128], ip
        vld1.64         {d10,d11}, [r4,:128], ip
        vld1.64         {d8, d9},  [r4,:128], ip
        vld1.64         {d6, d7},  [r4,:128], ip
        vld1.64         {d4, d5},  [r4,:128], ip
        vld1.64         {d2, d3},  [r4,:128], ip
        vld1.64         {d0, d1},  [r4,:128]

        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        swap4           d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11

        vst1.64         {d30,d31}, [r4,:128]!   @ park transposed columns
        vst1.64         {d6, d7},  [r4,:128]!
        vst1.64         {d20,d21}, [r4,:128]!
        vst1.64         {d4, d5},  [r4,:128]!
        vst1.64         {d18,d19}, [r4,:128]!
        vst1.64         {d2, d3},  [r4,:128]!
        vst1.64         {d16,d17}, [r4,:128]!
        vst1.64         {d0, d1},  [r4,:128]

        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11

        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128]
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15

        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11

        bx              lr
.endfunc
1027 | |
@ 8x8 hv qpel lowpass, put or avg bottom half: run the shared top,
@ optionally average d12-d15/d8-d11 with dst, then store 8 rows.
@ r0 = dst, r2 = dst stride; r10 preserves lr across the bl.
.macro  h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top
.ifc \type,avg
        vld1.8          {d0},  [r0,:64], r2
        vrhadd.u8       d12, d12, d0
        vld1.8          {d1},  [r0,:64], r2
        vrhadd.u8       d13, d13, d1
        vld1.8          {d2},  [r0,:64], r2
        vrhadd.u8       d14, d14, d2
        vld1.8          {d3},  [r0,:64], r2
        vrhadd.u8       d15, d15, d3
        vld1.8          {d4},  [r0,:64], r2
        vrhadd.u8       d8,  d8,  d4
        vld1.8          {d5},  [r0,:64], r2
        vrhadd.u8       d9,  d9,  d5
        vld1.8          {d6},  [r0,:64], r2
        vrhadd.u8       d10, d10, d6
        vld1.8          {d7},  [r0,:64], r2
        vrhadd.u8       d11, d11, d7
        sub             r0,  r0,  r2,  lsl #3   @ rewind dst 8 rows
.endif
        vst1.64         {d12}, [r0,:64], r2
        vst1.64         {d13}, [r0,:64], r2
        vst1.64         {d14}, [r0,:64], r2
        vst1.64         {d15}, [r0,:64], r2
        vst1.64         {d8},  [r0,:64], r2
        vst1.64         {d9},  [r0,:64], r2
        vst1.64         {d10}, [r0,:64], r2
        vst1.64         {d11}, [r0,:64], r2

        mov             lr,  r10
        bx              lr
.endfunc
.endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg
1067 | |
1068 .macro h264_qpel8_hv_lowpass_l2 type | |
1069 function \type\()_h264_qpel8_hv_lowpass_l2_neon | |
8338 | 1070 mov r10, lr |
1071 bl put_h264_qpel8_hv_lowpass_neon_top | |
1072 | |
1073 vld1.64 {d0, d1}, [r2,:128]! | |
1074 vld1.64 {d2, d3}, [r2,:128]! | |
1075 vrhadd.u8 q0, q0, q6 | |
1076 vld1.64 {d4, d5}, [r2,:128]! | |
1077 vrhadd.u8 q1, q1, q7 | |
1078 vld1.64 {d6, d7}, [r2,:128]! | |
1079 vrhadd.u8 q2, q2, q4 | |
10616 | 1080 vrhadd.u8 q3, q3, q5 |
1081 .ifc \type,avg | |
1082 vld1.8 {d16}, [r0,:64], r3 | |
1083 vrhadd.u8 d0, d0, d16 | |
1084 vld1.8 {d17}, [r0,:64], r3 | |
1085 vrhadd.u8 d1, d1, d17 | |
1086 vld1.8 {d18}, [r0,:64], r3 | |
1087 vrhadd.u8 d2, d2, d18 | |
1088 vld1.8 {d19}, [r0,:64], r3 | |
1089 vrhadd.u8 d3, d3, d19 | |
1090 vld1.8 {d20}, [r0,:64], r3 | |
1091 vrhadd.u8 d4, d4, d20 | |
1092 vld1.8 {d21}, [r0,:64], r3 | |
1093 vrhadd.u8 d5, d5, d21 | |
1094 vld1.8 {d22}, [r0,:64], r3 | |
1095 vrhadd.u8 d6, d6, d22 | |
1096 vld1.8 {d23}, [r0,:64], r3 | |
1097 vrhadd.u8 d7, d7, d23 | |
1098 sub r0, r0, r3, lsl #3 | |
1099 .endif | |
8338 | 1100 vst1.64 {d0}, [r0,:64], r3 |
1101 vst1.64 {d1}, [r0,:64], r3 | |
1102 vst1.64 {d2}, [r0,:64], r3 | |
1103 vst1.64 {d3}, [r0,:64], r3 | |
1104 vst1.64 {d4}, [r0,:64], r3 | |
1105 vst1.64 {d5}, [r0,:64], r3 | |
1106 vst1.64 {d6}, [r0,:64], r3 | |
1107 vst1.64 {d7}, [r0,:64], r3 | |
1108 | |
1109 mov lr, r10 | |
1110 bx lr | |
1111 .endfunc | |
10616 | 1112 .endm |
8338 | 1113 |
10616 | 1114 h264_qpel8_hv_lowpass_l2 put |
1115 h264_qpel8_hv_lowpass_l2 avg | |
1116 | |
1117 .macro h264_qpel16_hv type | |
1118 function \type\()_h264_qpel16_hv_lowpass_neon | |
8338 | 1119 mov r9, lr |
10616 | 1120 bl \type\()_h264_qpel8_hv_lowpass_neon |
8338 | 1121 sub r1, r1, r3, lsl #2 |
10616 | 1122 bl \type\()_h264_qpel8_hv_lowpass_neon |
8338 | 1123 sub r1, r1, r3, lsl #4 |
1124 sub r1, r1, r3, lsl #2 | |
1125 add r1, r1, #8 | |
1126 sub r0, r0, r2, lsl #4 | |
1127 add r0, r0, #8 | |
10616 | 1128 bl \type\()_h264_qpel8_hv_lowpass_neon |
8338 | 1129 sub r1, r1, r3, lsl #2 |
1130 mov lr, r9 | |
10616 | 1131 b \type\()_h264_qpel8_hv_lowpass_neon |
8338 | 1132 .endfunc |
1133 | |
10616 | 1134 function \type\()_h264_qpel16_hv_lowpass_l2_neon |
8338 | 1135 mov r9, lr |
1136 sub r2, r4, #256 | |
10616 | 1137 bl \type\()_h264_qpel8_hv_lowpass_l2_neon |
8338 | 1138 sub r1, r1, r3, lsl #2 |
10616 | 1139 bl \type\()_h264_qpel8_hv_lowpass_l2_neon |
8338 | 1140 sub r1, r1, r3, lsl #4 |
1141 sub r1, r1, r3, lsl #2 | |
1142 add r1, r1, #8 | |
1143 sub r0, r0, r3, lsl #4 | |
1144 add r0, r0, #8 | |
10616 | 1145 bl \type\()_h264_qpel8_hv_lowpass_l2_neon |
8338 | 1146 sub r1, r1, r3, lsl #2 |
1147 mov lr, r9 | |
10616 | 1148 b \type\()_h264_qpel8_hv_lowpass_l2_neon |
8338 | 1149 .endfunc |
10616 | 1150 .endm |
8338 | 1151 |
10616 | 1152 h264_qpel16_hv put |
1153 h264_qpel16_hv avg | |
1154 | |
1155 .macro h264_qpel8 type | |
1156 function ff_\type\()_h264_qpel8_mc10_neon, export=1 | |
8338 | 1157 lowpass_const r3 |
1158 mov r3, r1 | |
1159 sub r1, r1, #2 | |
1160 mov ip, #8 | |
10616 | 1161 b \type\()_h264_qpel8_h_lowpass_l2_neon |
8338 | 1162 .endfunc |
1163 | |
10616 | 1164 function ff_\type\()_h264_qpel8_mc20_neon, export=1 |
8338 | 1165 lowpass_const r3 |
1166 sub r1, r1, #2 | |
1167 mov r3, r2 | |
1168 mov ip, #8 | |
10616 | 1169 b \type\()_h264_qpel8_h_lowpass_neon |
8338 | 1170 .endfunc |
1171 | |
10616 | 1172 function ff_\type\()_h264_qpel8_mc30_neon, export=1 |
8338 | 1173 lowpass_const r3 |
1174 add r3, r1, #1 | |
1175 sub r1, r1, #2 | |
1176 mov ip, #8 | |
10616 | 1177 b \type\()_h264_qpel8_h_lowpass_l2_neon |
8338 | 1178 .endfunc |
1179 | |
10616 | 1180 function ff_\type\()_h264_qpel8_mc01_neon, export=1 |
8338 | 1181 push {lr} |
1182 mov ip, r1 | |
10616 | 1183 \type\()_h264_qpel8_mc01: |
8338 | 1184 lowpass_const r3 |
1185 mov r3, r2 | |
1186 sub r1, r1, r2, lsl #1 | |
1187 vpush {d8-d15} | |
10616 | 1188 bl \type\()_h264_qpel8_v_lowpass_l2_neon |
8338 | 1189 vpop {d8-d15} |
1190 pop {pc} | |
1191 .endfunc | |
1192 | |
10616 | 1193 function ff_\type\()_h264_qpel8_mc11_neon, export=1 |
10385 | 1194 push {r0, r1, r11, lr} |
10616 | 1195 \type\()_h264_qpel8_mc11: |
8338 | 1196 lowpass_const r3 |
10385 | 1197 mov r11, sp |
1198 bic sp, sp, #15 | |
8338 | 1199 sub sp, sp, #64 |
1200 mov r0, sp | |
1201 sub r1, r1, #2 | |
1202 mov r3, #8 | |
1203 mov ip, #8 | |
1204 vpush {d8-d15} | |
1205 bl put_h264_qpel8_h_lowpass_neon | |
10385 | 1206 ldrd r0, [r11] |
8338 | 1207 mov r3, r2 |
1208 add ip, sp, #64 | |
1209 sub r1, r1, r2, lsl #1 | |
1210 mov r2, #8 | |
10616 | 1211 bl \type\()_h264_qpel8_v_lowpass_l2_neon |
8338 | 1212 vpop {d8-d15} |
10385 | 1213 add sp, r11, #8 |
1214 pop {r11, pc} | |
8338 | 1215 .endfunc |
1216 | |
10616 | 1217 function ff_\type\()_h264_qpel8_mc21_neon, export=1 |
8338 | 1218 push {r0, r1, r4, r10, r11, lr} |
10616 | 1219 \type\()_h264_qpel8_mc21: |
8338 | 1220 lowpass_const r3 |
1221 mov r11, sp | |
1222 bic sp, sp, #15 | |
1223 sub sp, sp, #(8*8+16*12) | |
1224 sub r1, r1, #2 | |
1225 mov r3, #8 | |
1226 mov r0, sp | |
1227 mov ip, #8 | |
1228 vpush {d8-d15} | |
1229 bl put_h264_qpel8_h_lowpass_neon | |
1230 mov r4, r0 | |
1231 ldrd r0, [r11] | |
1232 sub r1, r1, r2, lsl #1 | |
1233 sub r1, r1, #2 | |
1234 mov r3, r2 | |
1235 sub r2, r4, #64 | |
10616 | 1236 bl \type\()_h264_qpel8_hv_lowpass_l2_neon |
8338 | 1237 vpop {d8-d15} |
1238 add sp, r11, #8 | |
1239 pop {r4, r10, r11, pc} | |
1240 .endfunc | |
1241 | |
10616 | 1242 function ff_\type\()_h264_qpel8_mc31_neon, export=1 |
8338 | 1243 add r1, r1, #1 |
10385 | 1244 push {r0, r1, r11, lr} |
8338 | 1245 sub r1, r1, #1 |
10616 | 1246 b \type\()_h264_qpel8_mc11 |
8338 | 1247 .endfunc |
1248 | |
10616 | 1249 function ff_\type\()_h264_qpel8_mc02_neon, export=1 |
8338 | 1250 push {lr} |
1251 lowpass_const r3 | |
1252 sub r1, r1, r2, lsl #1 | |
1253 mov r3, r2 | |
1254 vpush {d8-d15} | |
10616 | 1255 bl \type\()_h264_qpel8_v_lowpass_neon |
8338 | 1256 vpop {d8-d15} |
1257 pop {pc} | |
1258 .endfunc | |
1259 | |
10616 | 1260 function ff_\type\()_h264_qpel8_mc12_neon, export=1 |
8338 | 1261 push {r0, r1, r4, r10, r11, lr} |
10616 | 1262 \type\()_h264_qpel8_mc12: |
8338 | 1263 lowpass_const r3 |
1264 mov r11, sp | |
1265 bic sp, sp, #15 | |
1266 sub sp, sp, #(8*8+16*12) | |
1267 sub r1, r1, r2, lsl #1 | |
1268 mov r3, r2 | |
1269 mov r2, #8 | |
1270 mov r0, sp | |
1271 vpush {d8-d15} | |
1272 bl put_h264_qpel8_v_lowpass_neon | |
1273 mov r4, r0 | |
1274 ldrd r0, [r11] | |
1275 sub r1, r1, r3, lsl #1 | |
1276 sub r1, r1, #2 | |
1277 sub r2, r4, #64 | |
10616 | 1278 bl \type\()_h264_qpel8_hv_lowpass_l2_neon |
8338 | 1279 vpop {d8-d15} |
1280 add sp, r11, #8 | |
1281 pop {r4, r10, r11, pc} | |
1282 .endfunc | |
1283 | |
10616 | 1284 function ff_\type\()_h264_qpel8_mc22_neon, export=1 |
8338 | 1285 push {r4, r10, r11, lr} |
1286 mov r11, sp | |
1287 bic sp, sp, #15 | |
1288 sub r1, r1, r2, lsl #1 | |
1289 sub r1, r1, #2 | |
1290 mov r3, r2 | |
1291 sub sp, sp, #(16*12) | |
1292 mov r4, sp | |
1293 vpush {d8-d15} | |
10616 | 1294 bl \type\()_h264_qpel8_hv_lowpass_neon |
8338 | 1295 vpop {d8-d15} |
1296 mov sp, r11 | |
1297 pop {r4, r10, r11, pc} | |
1298 .endfunc | |
1299 | |
10616 | 1300 function ff_\type\()_h264_qpel8_mc32_neon, export=1 |
8338 | 1301 push {r0, r1, r4, r10, r11, lr} |
1302 add r1, r1, #1 | |
10616 | 1303 b \type\()_h264_qpel8_mc12 |
8338 | 1304 .endfunc |
1305 | |
10616 | 1306 function ff_\type\()_h264_qpel8_mc03_neon, export=1 |
8338 | 1307 push {lr} |
1308 add ip, r1, r2 | |
10616 | 1309 b \type\()_h264_qpel8_mc01 |
8338 | 1310 .endfunc |
1311 | |
10616 | 1312 function ff_\type\()_h264_qpel8_mc13_neon, export=1 |
10385 | 1313 push {r0, r1, r11, lr} |
8338 | 1314 add r1, r1, r2 |
10616 | 1315 b \type\()_h264_qpel8_mc11 |
8338 | 1316 .endfunc |
1317 | |
10616 | 1318 function ff_\type\()_h264_qpel8_mc23_neon, export=1 |
8338 | 1319 push {r0, r1, r4, r10, r11, lr} |
1320 add r1, r1, r2 | |
10616 | 1321 b \type\()_h264_qpel8_mc21 |
8338 | 1322 .endfunc |
1323 | |
10616 | 1324 function ff_\type\()_h264_qpel8_mc33_neon, export=1 |
8338 | 1325 add r1, r1, #1 |
10385 | 1326 push {r0, r1, r11, lr} |
8338 | 1327 add r1, r1, r2 |
1328 sub r1, r1, #1 | |
10616 | 1329 b \type\()_h264_qpel8_mc11 |
8338 | 1330 .endfunc |
10616 | 1331 .endm |
8338 | 1332 |
10616 | 1333 h264_qpel8 put |
1334 h264_qpel8 avg | |
1335 | |
1336 .macro h264_qpel16 type | |
1337 function ff_\type\()_h264_qpel16_mc10_neon, export=1 | |
8338 | 1338 lowpass_const r3 |
1339 mov r3, r1 | |
1340 sub r1, r1, #2 | |
10616 | 1341 b \type\()_h264_qpel16_h_lowpass_l2_neon |
8338 | 1342 .endfunc |
1343 | |
10616 | 1344 function ff_\type\()_h264_qpel16_mc20_neon, export=1 |
8338 | 1345 lowpass_const r3 |
1346 sub r1, r1, #2 | |
1347 mov r3, r2 | |
10616 | 1348 b \type\()_h264_qpel16_h_lowpass_neon |
8338 | 1349 .endfunc |
1350 | |
10616 | 1351 function ff_\type\()_h264_qpel16_mc30_neon, export=1 |
8338 | 1352 lowpass_const r3 |
1353 add r3, r1, #1 | |
1354 sub r1, r1, #2 | |
10616 | 1355 b \type\()_h264_qpel16_h_lowpass_l2_neon |
8338 | 1356 .endfunc |
1357 | |
10616 | 1358 function ff_\type\()_h264_qpel16_mc01_neon, export=1 |
8338 | 1359 push {r4, lr} |
1360 mov ip, r1 | |
10616 | 1361 \type\()_h264_qpel16_mc01: |
8338 | 1362 lowpass_const r3 |
1363 mov r3, r2 | |
1364 sub r1, r1, r2, lsl #1 | |
1365 vpush {d8-d15} | |
10616 | 1366 bl \type\()_h264_qpel16_v_lowpass_l2_neon |
8338 | 1367 vpop {d8-d15} |
1368 pop {r4, pc} | |
1369 .endfunc | |
1370 | |
10616 | 1371 function ff_\type\()_h264_qpel16_mc11_neon, export=1 |
10385 | 1372 push {r0, r1, r4, r11, lr} |
10616 | 1373 \type\()_h264_qpel16_mc11: |
8338 | 1374 lowpass_const r3 |
10385 | 1375 mov r11, sp |
1376 bic sp, sp, #15 | |
8338 | 1377 sub sp, sp, #256 |
1378 mov r0, sp | |
1379 sub r1, r1, #2 | |
1380 mov r3, #16 | |
1381 vpush {d8-d15} | |
1382 bl put_h264_qpel16_h_lowpass_neon | |
10385 | 1383 ldrd r0, [r11] |
8338 | 1384 mov r3, r2 |
1385 add ip, sp, #64 | |
1386 sub r1, r1, r2, lsl #1 | |
1387 mov r2, #16 | |
10616 | 1388 bl \type\()_h264_qpel16_v_lowpass_l2_neon |
8338 | 1389 vpop {d8-d15} |
10385 | 1390 add sp, r11, #8 |
1391 pop {r4, r11, pc} | |
8338 | 1392 .endfunc |
1393 | |
10616 | 1394 function ff_\type\()_h264_qpel16_mc21_neon, export=1 |
8338 | 1395 push {r0, r1, r4-r5, r9-r11, lr} |
10616 | 1396 \type\()_h264_qpel16_mc21: |
8338 | 1397 lowpass_const r3 |
1398 mov r11, sp | |
1399 bic sp, sp, #15 | |
1400 sub sp, sp, #(16*16+16*12) | |
1401 sub r1, r1, #2 | |
1402 mov r0, sp | |
1403 vpush {d8-d15} | |
1404 bl put_h264_qpel16_h_lowpass_neon_packed | |
1405 mov r4, r0 | |
1406 ldrd r0, [r11] | |
1407 sub r1, r1, r2, lsl #1 | |
1408 sub r1, r1, #2 | |
1409 mov r3, r2 | |
10616 | 1410 bl \type\()_h264_qpel16_hv_lowpass_l2_neon |
8338 | 1411 vpop {d8-d15} |
1412 add sp, r11, #8 | |
1413 pop {r4-r5, r9-r11, pc} | |
1414 .endfunc | |
1415 | |
10616 | 1416 function ff_\type\()_h264_qpel16_mc31_neon, export=1 |
8338 | 1417 add r1, r1, #1 |
10385 | 1418 push {r0, r1, r4, r11, lr} |
8338 | 1419 sub r1, r1, #1 |
10616 | 1420 b \type\()_h264_qpel16_mc11 |
8338 | 1421 .endfunc |
1422 | |
10616 | 1423 function ff_\type\()_h264_qpel16_mc02_neon, export=1 |
8338 | 1424 push {r4, lr} |
1425 lowpass_const r3 | |
1426 sub r1, r1, r2, lsl #1 | |
1427 mov r3, r2 | |
1428 vpush {d8-d15} | |
10616 | 1429 bl \type\()_h264_qpel16_v_lowpass_neon |
8338 | 1430 vpop {d8-d15} |
1431 pop {r4, pc} | |
1432 .endfunc | |
1433 | |
10616 | 1434 function ff_\type\()_h264_qpel16_mc12_neon, export=1 |
8338 | 1435 push {r0, r1, r4-r5, r9-r11, lr} |
10616 | 1436 \type\()_h264_qpel16_mc12: |
8338 | 1437 lowpass_const r3 |
1438 mov r11, sp | |
1439 bic sp, sp, #15 | |
1440 sub sp, sp, #(16*16+16*12) | |
1441 sub r1, r1, r2, lsl #1 | |
1442 mov r0, sp | |
1443 mov r3, r2 | |
1444 vpush {d8-d15} | |
1445 bl put_h264_qpel16_v_lowpass_neon_packed | |
1446 mov r4, r0 | |
1447 ldrd r0, [r11] | |
1448 sub r1, r1, r3, lsl #1 | |
1449 sub r1, r1, #2 | |
1450 mov r2, r3 | |
10616 | 1451 bl \type\()_h264_qpel16_hv_lowpass_l2_neon |
8338 | 1452 vpop {d8-d15} |
1453 add sp, r11, #8 | |
1454 pop {r4-r5, r9-r11, pc} | |
1455 .endfunc | |
1456 | |
10616 | 1457 function ff_\type\()_h264_qpel16_mc22_neon, export=1 |
8338 | 1458 push {r4, r9-r11, lr} |
1459 lowpass_const r3 | |
1460 mov r11, sp | |
1461 bic sp, sp, #15 | |
1462 sub r1, r1, r2, lsl #1 | |
1463 sub r1, r1, #2 | |
1464 mov r3, r2 | |
1465 sub sp, sp, #(16*12) | |
1466 mov r4, sp | |
1467 vpush {d8-d15} | |
10616 | 1468 bl \type\()_h264_qpel16_hv_lowpass_neon |
8338 | 1469 vpop {d8-d15} |
1470 mov sp, r11 | |
1471 pop {r4, r9-r11, pc} | |
1472 .endfunc | |
1473 | |
10616 | 1474 function ff_\type\()_h264_qpel16_mc32_neon, export=1 |
8338 | 1475 push {r0, r1, r4-r5, r9-r11, lr} |
1476 add r1, r1, #1 | |
10616 | 1477 b \type\()_h264_qpel16_mc12 |
8338 | 1478 .endfunc |
1479 | |
10616 | 1480 function ff_\type\()_h264_qpel16_mc03_neon, export=1 |
8338 | 1481 push {r4, lr} |
1482 add ip, r1, r2 | |
10616 | 1483 b \type\()_h264_qpel16_mc01 |
8338 | 1484 .endfunc |
1485 | |
10616 | 1486 function ff_\type\()_h264_qpel16_mc13_neon, export=1 |
10385 | 1487 push {r0, r1, r4, r11, lr} |
8338 | 1488 add r1, r1, r2 |
10616 | 1489 b \type\()_h264_qpel16_mc11 |
8338 | 1490 .endfunc |
1491 | |
10616 | 1492 function ff_\type\()_h264_qpel16_mc23_neon, export=1 |
8338 | 1493 push {r0, r1, r4-r5, r9-r11, lr} |
1494 add r1, r1, r2 | |
10616 | 1495 b \type\()_h264_qpel16_mc21 |
8338 | 1496 .endfunc |
1497 | |
10616 | 1498 function ff_\type\()_h264_qpel16_mc33_neon, export=1 |
8338 | 1499 add r1, r1, #1 |
10385 | 1500 push {r0, r1, r4, r11, lr} |
8338 | 1501 add r1, r1, r2 |
1502 sub r1, r1, #1 | |
10616 | 1503 b \type\()_h264_qpel16_mc11 |
8338 | 1504 .endfunc |
10616 | 1505 .endm |
1506 | |
1507 h264_qpel16 put | |
1508 h264_qpel16 avg | |
8663 | 1509 |
1510 @ Biweighted prediction | |
1511 | |
1512 .macro biweight_16 macs, macd | |
1513 vdup.8 d0, r4 | |
1514 vdup.8 d1, r5 | |
1515 vmov q2, q8 | |
1516 vmov q3, q8 | |
1517 1: subs ip, ip, #2 | |
1518 vld1.8 {d20-d21},[r0,:128], r2 | |
1519 \macd q2, d0, d20 | |
1520 pld [r0] | |
1521 \macd q3, d0, d21 | |
1522 vld1.8 {d22-d23},[r1,:128], r2 | |
1523 \macs q2, d1, d22 | |
1524 pld [r1] | |
1525 \macs q3, d1, d23 | |
1526 vmov q12, q8 | |
1527 vld1.8 {d28-d29},[r0,:128], r2 | |
1528 vmov q13, q8 | |
1529 \macd q12, d0, d28 | |
1530 pld [r0] | |
1531 \macd q13, d0, d29 | |
1532 vld1.8 {d30-d31},[r1,:128], r2 | |
1533 \macs q12, d1, d30 | |
1534 pld [r1] | |
1535 \macs q13, d1, d31 | |
1536 vshl.s16 q2, q2, q9 | |
1537 vshl.s16 q3, q3, q9 | |
1538 vqmovun.s16 d4, q2 | |
1539 vqmovun.s16 d5, q3 | |
1540 vshl.s16 q12, q12, q9 | |
1541 vshl.s16 q13, q13, q9 | |
1542 vqmovun.s16 d24, q12 | |
1543 vqmovun.s16 d25, q13 | |
1544 vmov q3, q8 | |
1545 vst1.8 {d4- d5}, [r6,:128], r2 | |
1546 vmov q2, q8 | |
1547 vst1.8 {d24-d25},[r6,:128], r2 | |
1548 bne 1b | |
1549 pop {r4-r6, pc} | |
1550 .endm | |
1551 | |
1552 .macro biweight_8 macs, macd | |
1553 vdup.8 d0, r4 | |
1554 vdup.8 d1, r5 | |
1555 vmov q1, q8 | |
1556 vmov q10, q8 | |
1557 1: subs ip, ip, #2 | |
1558 vld1.8 {d4},[r0,:64], r2 | |
1559 \macd q1, d0, d4 | |
1560 pld [r0] | |
1561 vld1.8 {d5},[r1,:64], r2 | |
1562 \macs q1, d1, d5 | |
1563 pld [r1] | |
1564 vld1.8 {d6},[r0,:64], r2 | |
1565 \macd q10, d0, d6 | |
1566 pld [r0] | |
1567 vld1.8 {d7},[r1,:64], r2 | |
1568 \macs q10, d1, d7 | |
1569 pld [r1] | |
1570 vshl.s16 q1, q1, q9 | |
1571 vqmovun.s16 d2, q1 | |
1572 vshl.s16 q10, q10, q9 | |
1573 vqmovun.s16 d4, q10 | |
1574 vmov q10, q8 | |
1575 vst1.8 {d2},[r6,:64], r2 | |
1576 vmov q1, q8 | |
1577 vst1.8 {d4},[r6,:64], r2 | |
1578 bne 1b | |
1579 pop {r4-r6, pc} | |
1580 .endm | |
1581 | |
1582 .macro biweight_4 macs, macd | |
1583 vdup.8 d0, r4 | |
1584 vdup.8 d1, r5 | |
1585 vmov q1, q8 | |
1586 vmov q10, q8 | |
1587 1: subs ip, ip, #4 | |
1588 vld1.32 {d4[0]},[r0,:32], r2 | |
1589 vld1.32 {d4[1]},[r0,:32], r2 | |
1590 \macd q1, d0, d4 | |
1591 pld [r0] | |
1592 vld1.32 {d5[0]},[r1,:32], r2 | |
1593 vld1.32 {d5[1]},[r1,:32], r2 | |
1594 \macs q1, d1, d5 | |
1595 pld [r1] | |
1596 blt 2f | |
1597 vld1.32 {d6[0]},[r0,:32], r2 | |
1598 vld1.32 {d6[1]},[r0,:32], r2 | |
1599 \macd q10, d0, d6 | |
1600 pld [r0] | |
1601 vld1.32 {d7[0]},[r1,:32], r2 | |
1602 vld1.32 {d7[1]},[r1,:32], r2 | |
1603 \macs q10, d1, d7 | |
1604 pld [r1] | |
1605 vshl.s16 q1, q1, q9 | |
1606 vqmovun.s16 d2, q1 | |
1607 vshl.s16 q10, q10, q9 | |
1608 vqmovun.s16 d4, q10 | |
1609 vmov q10, q8 | |
1610 vst1.32 {d2[0]},[r6,:32], r2 | |
1611 vst1.32 {d2[1]},[r6,:32], r2 | |
1612 vmov q1, q8 | |
1613 vst1.32 {d4[0]},[r6,:32], r2 | |
1614 vst1.32 {d4[1]},[r6,:32], r2 | |
1615 bne 1b | |
1616 pop {r4-r6, pc} | |
1617 2: vshl.s16 q1, q1, q9 | |
1618 vqmovun.s16 d2, q1 | |
1619 vst1.32 {d2[0]},[r6,:32], r2 | |
1620 vst1.32 {d2[1]},[r6,:32], r2 | |
1621 pop {r4-r6, pc} | |
1622 .endm | |
1623 | |
1624 .macro biweight_func w | |
1625 function biweight_h264_pixels_\w\()_neon | |
1626 push {r4-r6, lr} | |
1627 add r4, sp, #16 | |
1628 ldm r4, {r4-r6} | |
1629 lsr lr, r4, #31 | |
1630 add r6, r6, #1 | |
1631 eors lr, lr, r5, lsr #30 | |
1632 orr r6, r6, #1 | |
1633 vdup.16 q9, r3 | |
1634 lsl r6, r6, r3 | |
1635 vmvn q9, q9 | |
1636 vdup.16 q8, r6 | |
1637 mov r6, r0 | |
1638 beq 10f | |
1639 subs lr, lr, #1 | |
1640 beq 20f | |
1641 subs lr, lr, #1 | |
1642 beq 30f | |
1643 b 40f | |
1644 10: biweight_\w vmlal.u8, vmlal.u8 | |
1645 20: rsb r4, r4, #0 | |
1646 biweight_\w vmlal.u8, vmlsl.u8 | |
1647 30: rsb r4, r4, #0 | |
1648 rsb r5, r5, #0 | |
1649 biweight_\w vmlsl.u8, vmlsl.u8 | |
1650 40: rsb r5, r5, #0 | |
1651 biweight_\w vmlsl.u8, vmlal.u8 | |
1652 .endfunc | |
1653 .endm | |
1654 | |
1655 .macro biweight_entry w, h, b=1 | |
1656 function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1 | |
1657 mov ip, #\h | |
1658 .if \b | |
1659 b biweight_h264_pixels_\w\()_neon | |
1660 .endif | |
1661 .endfunc | |
1662 .endm | |
1663 | |
1664 biweight_entry 16, 8 | |
1665 biweight_entry 16, 16, b=0 | |
1666 biweight_func 16 | |
1667 | |
1668 biweight_entry 8, 16 | |
1669 biweight_entry 8, 4 | |
1670 biweight_entry 8, 8, b=0 | |
1671 biweight_func 8 | |
1672 | |
1673 biweight_entry 4, 8 | |
1674 biweight_entry 4, 2 | |
1675 biweight_entry 4, 4, b=0 | |
1676 biweight_func 4 | |
8664 | 1677 |
1678 @ Weighted prediction | |
1679 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1680 .macro weight_16 add |
8664 | 1681 vdup.8 d0, r3 |
1682 1: subs ip, ip, #2 | |
1683 vld1.8 {d20-d21},[r0,:128], r1 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1684 vmull.u8 q2, d0, d20 |
8664 | 1685 pld [r0] |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1686 vmull.u8 q3, d0, d21 |
8664 | 1687 vld1.8 {d28-d29},[r0,:128], r1 |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1688 vmull.u8 q12, d0, d28 |
8664 | 1689 pld [r0] |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1690 vmull.u8 q13, d0, d29 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1691 \add q2, q8, q2 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1692 vrshl.s16 q2, q2, q9 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1693 \add q3, q8, q3 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1694 vrshl.s16 q3, q3, q9 |
8664 | 1695 vqmovun.s16 d4, q2 |
1696 vqmovun.s16 d5, q3 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1697 \add q12, q8, q12 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1698 vrshl.s16 q12, q12, q9 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1699 \add q13, q8, q13 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1700 vrshl.s16 q13, q13, q9 |
8664 | 1701 vqmovun.s16 d24, q12 |
1702 vqmovun.s16 d25, q13 | |
1703 vst1.8 {d4- d5}, [r4,:128], r1 | |
1704 vst1.8 {d24-d25},[r4,:128], r1 | |
1705 bne 1b | |
1706 pop {r4, pc} | |
1707 .endm | |
1708 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1709 .macro weight_8 add |
8664 | 1710 vdup.8 d0, r3 |
1711 1: subs ip, ip, #2 | |
1712 vld1.8 {d4},[r0,:64], r1 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1713 vmull.u8 q1, d0, d4 |
8664 | 1714 pld [r0] |
1715 vld1.8 {d6},[r0,:64], r1 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1716 vmull.u8 q10, d0, d6 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1717 \add q1, q8, q1 |
8664 | 1718 pld [r0] |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1719 vrshl.s16 q1, q1, q9 |
8664 | 1720 vqmovun.s16 d2, q1 |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1721 \add q10, q8, q10 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1722 vrshl.s16 q10, q10, q9 |
8664 | 1723 vqmovun.s16 d4, q10 |
1724 vst1.8 {d2},[r4,:64], r1 | |
1725 vst1.8 {d4},[r4,:64], r1 | |
1726 bne 1b | |
1727 pop {r4, pc} | |
1728 .endm | |
1729 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1730 .macro weight_4 add |
8664 | 1731 vdup.8 d0, r3 |
1732 vmov q1, q8 | |
1733 vmov q10, q8 | |
1734 1: subs ip, ip, #4 | |
1735 vld1.32 {d4[0]},[r0,:32], r1 | |
1736 vld1.32 {d4[1]},[r0,:32], r1 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1737 vmull.u8 q1, d0, d4 |
8664 | 1738 pld [r0] |
1739 blt 2f | |
1740 vld1.32 {d6[0]},[r0,:32], r1 | |
1741 vld1.32 {d6[1]},[r0,:32], r1 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1742 vmull.u8 q10, d0, d6 |
8664 | 1743 pld [r0] |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1744 \add q1, q8, q1 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1745 vrshl.s16 q1, q1, q9 |
8664 | 1746 vqmovun.s16 d2, q1 |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1747 \add q10, q8, q10 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1748 vrshl.s16 q10, q10, q9 |
8664 | 1749 vqmovun.s16 d4, q10 |
1750 vmov q10, q8 | |
1751 vst1.32 {d2[0]},[r4,:32], r1 | |
1752 vst1.32 {d2[1]},[r4,:32], r1 | |
1753 vmov q1, q8 | |
1754 vst1.32 {d4[0]},[r4,:32], r1 | |
1755 vst1.32 {d4[1]},[r4,:32], r1 | |
1756 bne 1b | |
1757 pop {r4, pc} | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1758 2: \add q1, q8, q1 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1759 vrshl.s16 q1, q1, q9 |
8664 | 1760 vqmovun.s16 d2, q1 |
1761 vst1.32 {d2[0]},[r4,:32], r1 | |
1762 vst1.32 {d2[1]},[r4,:32], r1 | |
1763 pop {r4, pc} | |
1764 .endm | |
1765 | |
1766 .macro weight_func w | |
1767 function weight_h264_pixels_\w\()_neon | |
1768 push {r4, lr} | |
1769 ldr r4, [sp, #8] | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1770 cmp r2, #1 |
8664 | 1771 lsl r4, r4, r2 |
1772 vdup.16 q8, r4 | |
1773 mov r4, r0 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1774 ble 20f |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1775 rsb lr, r2, #1 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1776 vdup.16 q9, lr |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1777 cmp r3, #0 |
8664 | 1778 blt 10f |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1779 weight_\w vhadd.s16 |
8664 | 1780 10: rsb r3, r3, #0 |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1781 weight_\w vhsub.s16 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1782 20: rsb lr, r2, #0 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1783 vdup.16 q9, lr |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1784 cmp r3, #0 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1785 blt 10f |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1786 weight_\w vadd.s16 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1787 10: rsb r3, r3, #0 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1788 weight_\w vsub.s16 |
8664 | 1789 .endfunc |
1790 .endm | |
1791 | |
1792 .macro weight_entry w, h, b=1 | |
1793 function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1 | |
1794 mov ip, #\h | |
1795 .if \b | |
1796 b weight_h264_pixels_\w\()_neon | |
1797 .endif | |
1798 .endfunc | |
1799 .endm | |
1800 | |
1801 weight_entry 16, 8 | |
1802 weight_entry 16, 16, b=0 | |
1803 weight_func 16 | |
1804 | |
1805 weight_entry 8, 16 | |
1806 weight_entry 8, 4 | |
1807 weight_entry 8, 8, b=0 | |
1808 weight_func 8 | |
1809 | |
1810 weight_entry 4, 8 | |
1811 weight_entry 4, 2 | |
1812 weight_entry 4, 4, b=0 | |
1813 weight_func 4 |