Mercurial > libavcodec.hg
annotate arm/h264dsp_neon.S @ 10334:71cf44ecaa70 libavcodec
Make MS RLE decoder produce both bottom-up and top-down pictures
author | kostya |
---|---|
date | Thu, 01 Oct 2009 05:42:55 +0000 |
parents | f5ffd813dc7f |
children | be725249ea67 |
rev | line source |
---|---|
8336 | 1 /* |
2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> | |
3 * | |
4 * This file is part of FFmpeg. | |
5 * | |
6 * FFmpeg is free software; you can redistribute it and/or | |
7 * modify it under the terms of the GNU Lesser General Public | |
8 * License as published by the Free Software Foundation; either | |
9 * version 2.1 of the License, or (at your option) any later version. | |
10 * | |
11 * FFmpeg is distributed in the hope that it will be useful, | |
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 * Lesser General Public License for more details. | |
15 * | |
16 * You should have received a copy of the GNU Lesser General Public | |
17 * License along with FFmpeg; if not, write to the Free Software | |
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
19 */ | |
20 | |
21 #include "asm.S" | |
22 | |
23 .fpu neon | |
24 | |
/*
 * transpose_8x8: three rounds of pairwise VTRN (32-, 16-, then 8-bit
 * elements) over eight registers, in place.
 * With D-register operands this transposes an 8x8 byte matrix held one row
 * per register.  This file also invokes it with Q registers; VTRN then
 * treats the low and high 64-bit halves identically, so two 8x8 byte
 * matrices are transposed side by side.
 */
8338 | 25 .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7 |
26 vtrn.32 \r0, \r4 | |
27 vtrn.32 \r1, \r5 | |
28 vtrn.32 \r2, \r6 | |
29 vtrn.32 \r3, \r7 | |
30 vtrn.16 \r0, \r2 | |
31 vtrn.16 \r1, \r3 | |
32 vtrn.16 \r4, \r6 | |
33 vtrn.16 \r5, \r7 | |
34 vtrn.8 \r0, \r1 | |
35 vtrn.8 \r2, \r3 | |
36 vtrn.8 \r4, \r5 | |
37 vtrn.8 \r6, \r7 | |
38 .endm | |
39 | |
/*
 * transpose_4x4: two rounds of VTRN (16-bit, then 8-bit) over four
 * registers, in place.  For every 32-bit lane position, the four bytes in
 * each of the four registers form a 4x4 byte tile that ends up transposed.
 */
9864 | 40 .macro transpose_4x4 r0 r1 r2 r3 |
41 vtrn.16 \r0, \r2 | |
42 vtrn.16 \r1, \r3 | |
43 vtrn.8 \r0, \r1 | |
44 vtrn.8 \r2, \r3 | |
45 .endm | |
46 | |
/*
 * swap4: exchange the contents of four register pairs:
 * r0<->r4, r1<->r5, r2<->r6, r3<->r7.
 */
8338 | 47 .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7 |
48 vswp \r0, \r4 | |
49 vswp \r1, \r5 | |
50 vswp \r2, \r6 | |
51 vswp \r3, \r7 | |
52 .endm | |
53 | |
/*
 * transpose16_4x4: a 32-bit VTRN round followed by a 16-bit VTRN round,
 * applied to two independent register groups (r0-r3 and r4-r7).  Within a
 * group this transposes the 4x4 tiles of 16-bit elements, in place.
 */
54 .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7 | |
55 vtrn.32 \r0, \r2 | |
56 vtrn.32 \r1, \r3 | |
57 vtrn.32 \r4, \r6 | |
58 vtrn.32 \r5, \r7 | |
59 vtrn.16 \r0, \r1 | |
60 vtrn.16 \r2, \r3 | |
61 vtrn.16 \r4, \r5 | |
62 vtrn.16 \r6, \r7 | |
63 .endm | |
64 | |
8336 | 65 /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
/*
 * h264_chroma_mc8: emits ff_<type>_h264_chroma_mc8_neon, where \type is
 * "put" or "avg" (both are instantiated later in this file).
 *
 * Register use, following the prototype comment above:
 *   r0 = dst, r1 = src, r2 = stride, r3 = h (rows; two are produced per
 *   loop iteration), x and y are fetched from the stack into r4/r5.
 * For "avg", lr keeps a second copy of dst used to read the existing
 * pixels, which are combined with the result by a rounding average (vrhadd).
 *
 * Bilinear weights computed from x and y:
 *   r4 = (8-x)*(8-y), ip = x*(8-y), r6 = (8-x)*y, r7 = x*y
 * Three code paths: both x and y non-zero (label 1), x == 0 (label 3,
 * vertical taps only), y == 0 (label 5, horizontal taps only).
 */
66 .macro h264_chroma_mc8 type |
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
67 function ff_\type\()_h264_chroma_mc8_neon, export=1 |
8336 | 68 push {r4-r7, lr} |
/* 5 registers (20 bytes) were pushed, so [sp, #20] is the first stack argument: r4 = x, r5 = y */
69 ldrd r4, [sp, #20] | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
70 .ifc \type,avg |
8336 | 71 mov lr, r0 |
72 .endif | |
73 pld [r1] | |
74 pld [r1, r2] | |
75 | |
/* muls sets Z when x*y == 0; the following rsb/sub/add leave the flags alone for "beq 2f" */
76 muls r7, r4, r5 | |
77 rsb r6, r7, r5, lsl #3 | |
78 rsb ip, r7, r4, lsl #3 | |
79 sub r4, r7, r4, lsl #3 | |
80 sub r4, r4, r5, lsl #3 | |
81 add r4, r4, #64 | |
82 | |
83 beq 2f | |
84 | |
/* General case: r1 walks the even source rows, r5 the odd ones, both stepping by 2*stride (r4) */
85 add r5, r1, r2 | |
86 | |
87 vdup.8 d0, r4 | |
88 lsl r4, r2, #1 | |
89 vdup.8 d1, ip | |
90 vld1.64 {d4, d5}, [r1], r4 | |
91 vdup.8 d2, r6 | |
92 vld1.64 {d6, d7}, [r5], r4 | |
93 vdup.8 d3, r7 | |
94 | |
/* d5/d7 become the same rows shifted left by one pixel (the x+1 taps) */
95 vext.8 d5, d4, d5, #1 | |
96 vext.8 d7, d6, d7, #1 | |
97 | |
/* Loop 1: q8 and q9 accumulate the four weighted taps for two output rows; vrshrn #6 rounds and narrows */
98 1: pld [r5] | |
99 vmull.u8 q8, d4, d0 | |
100 vmlal.u8 q8, d5, d1 | |
101 vld1.64 {d4, d5}, [r1], r4 | |
102 vmlal.u8 q8, d6, d2 | |
103 vext.8 d5, d4, d5, #1 | |
104 vmlal.u8 q8, d7, d3 | |
105 vmull.u8 q9, d6, d0 | |
106 subs r3, r3, #2 | |
107 vmlal.u8 q9, d7, d1 | |
108 vmlal.u8 q9, d4, d2 | |
109 vmlal.u8 q9, d5, d3 | |
110 vrshrn.u16 d16, q8, #6 | |
111 vld1.64 {d6, d7}, [r5], r4 | |
112 pld [r1] | |
113 vrshrn.u16 d17, q9, #6 | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
114 .ifc \type,avg |
8336 | 115 vld1.64 {d20}, [lr,:64], r2 |
116 vld1.64 {d21}, [lr,:64], r2 | |
117 vrhadd.u8 q8, q8, q10 | |
118 .endif | |
119 vext.8 d7, d6, d7, #1 | |
120 vst1.64 {d16}, [r0,:64], r2 | |
121 vst1.64 {d17}, [r0,:64], r2 | |
/* flags are still those of "subs r3, r3, #2" above (the NEON ops in between do not touch them) */
122 bgt 1b | |
123 | |
124 pop {r4-r7, pc} | |
125 | |
/*
 * x*y == 0: only two taps are needed.  r6 = (8-x)*y is zero exactly when
 * y == 0 here; ip + r6 is the weight of the second tap.  The add/vdup
 * below do not alter the flags set by tst, so "beq 4f" selects the
 * horizontal-only path when y == 0.
 */
126 2: tst r6, r6 | |
127 add ip, ip, r6 | |
128 vdup.8 d0, r4 | |
129 vdup.8 d1, ip | |
130 | |
131 beq 4f | |
132 | |
/* Vertical-only path (x == 0): each output row blends a row with the row below it */
133 add r5, r1, r2 | |
134 lsl r4, r2, #1 | |
135 vld1.64 {d4}, [r1], r4 | |
136 vld1.64 {d6}, [r5], r4 | |
137 | |
138 3: pld [r5] | |
139 vmull.u8 q8, d4, d0 | |
140 vmlal.u8 q8, d6, d1 | |
141 vld1.64 {d4}, [r1], r4 | |
142 vmull.u8 q9, d6, d0 | |
143 vmlal.u8 q9, d4, d1 | |
144 vld1.64 {d6}, [r5], r4 | |
145 vrshrn.u16 d16, q8, #6 | |
146 vrshrn.u16 d17, q9, #6 | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
147 .ifc \type,avg |
8336 | 148 vld1.64 {d20}, [lr,:64], r2 |
149 vld1.64 {d21}, [lr,:64], r2 | |
150 vrhadd.u8 q8, q8, q10 | |
151 .endif | |
152 subs r3, r3, #2 | |
153 pld [r1] | |
154 vst1.64 {d16}, [r0,:64], r2 | |
155 vst1.64 {d17}, [r0,:64], r2 | |
156 bgt 3b | |
157 | |
158 pop {r4-r7, pc} | |
159 | |
/* Horizontal-only path (y == 0): each output row blends a row with itself shifted by one pixel */
160 4: vld1.64 {d4, d5}, [r1], r2 | |
161 vld1.64 {d6, d7}, [r1], r2 | |
162 vext.8 d5, d4, d5, #1 | |
163 vext.8 d7, d6, d7, #1 | |
164 | |
165 5: pld [r1] | |
166 subs r3, r3, #2 | |
167 vmull.u8 q8, d4, d0 | |
168 vmlal.u8 q8, d5, d1 | |
169 vld1.64 {d4, d5}, [r1], r2 | |
170 vmull.u8 q9, d6, d0 | |
171 vmlal.u8 q9, d7, d1 | |
172 pld [r1] | |
173 vext.8 d5, d4, d5, #1 | |
174 vrshrn.u16 d16, q8, #6 | |
175 vrshrn.u16 d17, q9, #6 | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
176 .ifc \type,avg |
8336 | 177 vld1.64 {d20}, [lr,:64], r2 |
178 vld1.64 {d21}, [lr,:64], r2 | |
179 vrhadd.u8 q8, q8, q10 | |
180 .endif | |
181 vld1.64 {d6, d7}, [r1], r2 | |
182 vext.8 d7, d6, d7, #1 | |
183 vst1.64 {d16}, [r0,:64], r2 | |
184 vst1.64 {d17}, [r0,:64], r2 | |
185 bgt 5b | |
186 | |
187 pop {r4-r7, pc} | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
188 .endfunc |
8336 | 189 .endm |
190 | |
191 /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
/*
 * h264_chroma_mc4: emits ff_<type>_h264_chroma_mc4_neon, where \type is
 * "put" or "avg" (both are instantiated later in this file).
 *
 * Same argument registers and weight computation as the mc8 variant:
 *   r0 = dst, r1 = src, r2 = stride, r3 = h, x/y loaded from the stack;
 *   r4 = (8-x)*(8-y), ip = x*(8-y), r6 = (8-x)*y, r7 = x*y.
 * Rows are only 4 pixels wide, so two 4-pixel groups are packed into one
 * D register (via vtrn.32) and multiplied by a D register holding two
 * different weights; the two 4-lane halves of the 16-bit products are then
 * added together (vadd.i16) before rounding.
 * Two rows are produced per loop iteration; 32-bit lanes are stored per row.
 */
192 .macro h264_chroma_mc4 type |
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
193 function ff_\type\()_h264_chroma_mc4_neon, export=1 |
8336 | 194 push {r4-r7, lr} |
/* 20 bytes were pushed, so [sp, #20] is the first stack argument: r4 = x, r5 = y */
195 ldrd r4, [sp, #20] | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
196 .ifc \type,avg |
8336 | 197 mov lr, r0 |
198 .endif | |
199 pld [r1] | |
200 pld [r1, r2] | |
201 | |
/* muls sets Z when x*y == 0; the flag survives until "beq 2f" */
202 muls r7, r4, r5 | |
203 rsb r6, r7, r5, lsl #3 | |
204 rsb ip, r7, r4, lsl #3 | |
205 sub r4, r7, r4, lsl #3 | |
206 sub r4, r4, r5, lsl #3 | |
207 add r4, r4, #64 | |
208 | |
209 beq 2f | |
210 | |
211 add r5, r1, r2 | |
212 | |
213 vdup.8 d0, r4 | |
214 lsl r4, r2, #1 | |
215 vdup.8 d1, ip | |
216 vld1.64 {d4}, [r1], r4 | |
217 vdup.8 d2, r6 | |
218 vld1.64 {d6}, [r5], r4 | |
219 vdup.8 d3, r7 | |
220 | |
/* After vext + vtrn.32: d4 = { row[0..3], row[1..4] }, likewise d6 for the next row.
 * d5/d7 are not loaded before the vext; only bytes originating from d4/d6 reach the low
 * word that vtrn.32 moves into d4/d6. */
221 vext.8 d5, d4, d5, #1 | |
222 vext.8 d7, d6, d7, #1 | |
223 vtrn.32 d4, d5 | |
224 vtrn.32 d6, d7 | |
225 | |
/* Pack the weights the same way: d0 = { r4 x4, ip x4 }, d2 = { r6 x4, r7 x4 } */
226 vtrn.32 d0, d1 | |
227 vtrn.32 d2, d3 | |
228 | |
229 1: pld [r5] | |
230 vmull.u8 q8, d4, d0 | |
231 vmlal.u8 q8, d6, d2 | |
232 vld1.64 {d4}, [r1], r4 | |
233 vext.8 d5, d4, d5, #1 | |
234 vtrn.32 d4, d5 | |
235 vmull.u8 q9, d6, d0 | |
236 vmlal.u8 q9, d4, d2 | |
237 vld1.64 {d6}, [r5], r4 | |
/* fold the "pixel" and "pixel+1" halves together: d16 = first output row, d17 = second */
238 vadd.i16 d16, d16, d17 | |
239 vadd.i16 d17, d18, d19 | |
240 vrshrn.u16 d16, q8, #6 | |
241 subs r3, r3, #2 | |
242 pld [r1] | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
243 .ifc \type,avg |
8336 | 244 vld1.32 {d20[0]}, [lr,:32], r2 |
245 vld1.32 {d20[1]}, [lr,:32], r2 | |
246 vrhadd.u8 d16, d16, d20 | |
247 .endif | |
248 vext.8 d7, d6, d7, #1 | |
249 vtrn.32 d6, d7 | |
250 vst1.32 {d16[0]}, [r0,:32], r2 | |
251 vst1.32 {d16[1]}, [r0,:32], r2 | |
252 bgt 1b | |
253 | |
254 pop {r4-r7, pc} | |
255 | |
/*
 * x*y == 0: two taps only.  tst r6 sets Z when y == 0 (r6 = (8-x)*y);
 * ip + r6 is the second tap weight.  After vtrn.32 both d0 and d1 hold
 * { r4 x4, ip x4 }.  The add/vdup/vtrn do not alter the flags for "beq 4f".
 */
256 2: tst r6, r6 | |
257 add ip, ip, r6 | |
258 vdup.8 d0, r4 | |
259 vdup.8 d1, ip | |
260 vtrn.32 d0, d1 | |
261 | |
262 beq 4f | |
263 | |
/* Vertical-only path: d1 becomes { ip x4, r4 x4 } (weights swapped) for the second row of each pair;
 * d4 holds two consecutive 4-pixel rows, one per 32-bit lane */
264 vext.32 d1, d0, d1, #1 | |
265 add r5, r1, r2 | |
266 lsl r4, r2, #1 | |
267 vld1.32 {d4[0]}, [r1], r4 | |
268 vld1.32 {d4[1]}, [r5], r4 | |
269 | |
270 3: pld [r5] | |
271 vmull.u8 q8, d4, d0 | |
272 vld1.32 {d4[0]}, [r1], r4 | |
273 vmull.u8 q9, d4, d1 | |
274 vld1.32 {d4[1]}, [r5], r4 | |
275 vadd.i16 d16, d16, d17 | |
276 vadd.i16 d17, d18, d19 | |
277 vrshrn.u16 d16, q8, #6 | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
278 .ifc \type,avg |
8336 | 279 vld1.32 {d20[0]}, [lr,:32], r2 |
280 vld1.32 {d20[1]}, [lr,:32], r2 | |
281 vrhadd.u8 d16, d16, d20 | |
282 .endif | |
283 subs r3, r3, #2 | |
284 pld [r1] | |
285 vst1.32 {d16[0]}, [r0,:32], r2 | |
286 vst1.32 {d16[1]}, [r0,:32], r2 | |
287 bgt 3b | |
288 | |
289 pop {r4-r7, pc} | |
290 | |
/* Horizontal-only path (y == 0): d4/d6 = { row[0..3], row[1..4] } for two consecutive rows */
291 4: vld1.64 {d4}, [r1], r2 | |
292 vld1.64 {d6}, [r1], r2 | |
293 vext.8 d5, d4, d5, #1 | |
294 vext.8 d7, d6, d7, #1 | |
295 vtrn.32 d4, d5 | |
296 vtrn.32 d6, d7 | |
297 | |
298 5: vmull.u8 q8, d4, d0 | |
299 vmull.u8 q9, d6, d0 | |
300 subs r3, r3, #2 | |
301 vld1.64 {d4}, [r1], r2 | |
302 vext.8 d5, d4, d5, #1 | |
303 vtrn.32 d4, d5 | |
304 vadd.i16 d16, d16, d17 | |
305 vadd.i16 d17, d18, d19 | |
306 pld [r1] | |
307 vrshrn.u16 d16, q8, #6 | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
308 .ifc \type,avg |
8336 | 309 vld1.32 {d20[0]}, [lr,:32], r2 |
310 vld1.32 {d20[1]}, [lr,:32], r2 | |
311 vrhadd.u8 d16, d16, d20 | |
312 .endif | |
313 vld1.64 {d6}, [r1], r2 | |
314 vext.8 d7, d6, d7, #1 | |
315 vtrn.32 d6, d7 | |
316 pld [r1] | |
317 vst1.32 {d16[0]}, [r0,:32], r2 | |
318 vst1.32 {d16[1]}, [r0,:32], r2 | |
319 bgt 5b | |
320 | |
321 pop {r4-r7, pc} | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
322 .endfunc |
8336 | 323 .endm |
324 | |
/*
 * Start of the code section.  The four macro invocations below emit
 * ff_put/ff_avg_h264_chroma_mc8_neon and ff_put/ff_avg_h264_chroma_mc4_neon.
 */
325 .text | |
326 .align | |
327 | |
8626
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
328 h264_chroma_mc8 put |
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
329 h264_chroma_mc8 avg |
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
330 h264_chroma_mc4 put |
8d425ee85ddb
ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
mru
parents:
8359
diff
changeset
|
331 h264_chroma_mc4 avg |
8337 | 332 |
333 /* H.264 loop filter */ | |
334 | |
/*
 * h264_loop_filter_start: common prologue of the loop-filter entry points.
 *
 * In:   r2 = alpha, r3 = beta (as named by the filter macros below),
 *       [sp] = pointer to a 32-bit word of four per-segment bytes
 *       (presumably the int8_t tc0[4] argument -- confirm against the
 *       C prototype).
 * Out:  d24[0] = the four bytes; ip is clobbered.
 * Returns to the caller (bx lr) without filtering when alpha == 0 or
 * beta == 0, or when all four bytes have their sign bit set.
 */
335 .macro h264_loop_filter_start | |
336 ldr ip, [sp] | |
337 tst r2, r2 | |
338 ldr ip, [ip] | |
/* Z is set if alpha == 0; otherwise re-test so that Z reflects beta == 0 */
339 tstne r3, r3 | |
340 vmov.32 d24[0], ip | |
/* plain "and" keeps the flags from the tst pair for the bxeq below */
341 and ip, ip, ip, lsl #16 | |
342 bxeq lr | |
/* top byte of ip is now b0 & b1 & b2 & b3, so N = AND of the four sign bits */
343 ands ip, ip, ip, lsl #8 | |
/*
 * Test N directly (MI).  The LT condition is N != V, but TST/AND/ANDS never
 * write V, so LT would depend on whatever V value the caller left behind.
 */
344 bxmi lr | |
345 .endm | |
346 | |
/*
 * align_push_regs: save d8-d15 below the stack pointer at a 16-byte
 * aligned address (the stores carry a :128 alignment hint).
 * ip = (sp & 15) + 32 is the first adjustment applied to sp and is left
 * in ip; align_pop_regs adds it back, so ip must not be modified between
 * the two macros.
 */
347 .macro align_push_regs | |
348 and ip, sp, #15 | |
349 add ip, ip, #32 | |
350 sub sp, sp, ip | |
351 vst1.64 {d12-d15}, [sp,:128] | |
352 sub sp, sp, #32 | |
353 vst1.64 {d8-d11}, [sp,:128] | |
354 .endm | |
355 | |
/*
 * align_pop_regs: inverse of align_push_regs.  Reloads d8-d11 (sp advances
 * by 32 through the "!" writeback), then d12-d15, after which sp is
 * post-incremented by ip, the adjustment computed by align_push_regs.
 */
356 .macro align_pop_regs | |
357 vld1.64 {d8-d11}, [sp,:128]! | |
358 vld1.64 {d12-d15}, [sp,:128], ip | |
359 .endm | |
360 | |
/*
 * h264_loop_filter_luma: filter 16 pixel positions across one luma edge.
 *
 * In (one byte lane per position, as named by the inline comments):
 *   q10 = p2, q9 = p1, q8 = p0, q0 = q0, q1 = q1, q2 = q2
 *   r2 = alpha, r3 = beta
 *   d24[0] = four control bytes loaded by h264_loop_filter_start; each one
 *            is expanded to four adjacent lanes of q12 below
 * Out:
 *   q8 = new p0, q0 = new q0
 *   q4 = new p1, q5 = new q1 (lanes not selected keep the original p1/q1)
 * Clobbers q2, q4-q7, q10-q15.  q4-q7 overlap the callee-saved d8-d15,
 * which is why the callers in this file wrap the macro in
 * align_push_regs / align_pop_regs.
 */
361 .macro h264_loop_filter_luma | |
362 vdup.8 q11, r2 @ alpha | |
/* vmovl.u8 + vmovl.u16 + vsli.16 #8 + vsli.32 #16: replicate each of the 4 control bytes into 4 byte lanes */
363 vmovl.u8 q12, d24 | |
364 vabd.u8 q6, q8, q0 @ abs(p0 - q0) | |
365 vmovl.u16 q12, d24 | |
366 vabd.u8 q14, q9, q8 @ abs(p1 - p0) | |
367 vsli.16 q12, q12, #8 | |
368 vabd.u8 q15, q1, q0 @ abs(q1 - q0) | |
369 vsli.32 q12, q12, #16 | |
370 vclt.u8 q6, q6, q11 @ < alpha | |
371 vdup.8 q11, r3 @ beta | |
/* q7 = lanes whose control byte is negative; they are removed from the filter mask q6 by the vbic */
372 vclt.s8 q7, q12, #0 | |
373 vclt.u8 q14, q14, q11 @ < beta | |
374 vclt.u8 q15, q15, q11 @ < beta | |
375 vbic q6, q6, q7 | |
376 vabd.u8 q4, q10, q8 @ abs(p2 - p0) | |
377 vand q6, q6, q14 | |
378 vabd.u8 q5, q2, q0 @ abs(q2 - q0) | |
379 vclt.u8 q4, q4, q11 @ < beta | |
380 vand q6, q6, q15 | |
381 vclt.u8 q5, q5, q11 @ < beta | |
/* q6 = final per-lane filter mask; q4/q5 = masks for additionally updating p1/q1 */
382 vand q4, q4, q6 | |
383 vand q5, q5, q6 | |
384 vand q12, q12, q6 | |
385 vrhadd.u8 q14, q8, q0 | |
/* q4/q5 are all-ones (-1) where set, so the two subtractions add 1 to the clip bound per active side mask */
386 vsub.i8 q6, q12, q4 | |
387 vqadd.u8 q7, q9, q12 | |
388 vhadd.u8 q10, q10, q14 | |
389 vsub.i8 q6, q6, q5 | |
390 vhadd.u8 q14, q2, q14 | |
391 vmin.u8 q7, q7, q10 | |
392 vqsub.u8 q11, q9, q12 | |
393 vqadd.u8 q2, q1, q12 | |
394 vmax.u8 q7, q7, q11 | |
395 vqsub.u8 q11, q1, q12 | |
396 vmin.u8 q14, q2, q14 | |
/* 16-bit delta = (((q0 - p0) << 2) + p1 - q1 + 4) >> 3, low half in q2, high half in q10 */
397 vmovl.u8 q2, d0 | |
398 vmax.u8 q14, q14, q11 | |
399 vmovl.u8 q10, d1 | |
400 vsubw.u8 q2, q2, d16 | |
401 vsubw.u8 q10, q10, d17 | |
402 vshl.i16 q2, q2, #2 | |
403 vshl.i16 q10, q10, #2 | |
404 vaddw.u8 q2, q2, d18 | |
405 vaddw.u8 q10, q10, d19 | |
406 vsubw.u8 q2, q2, d2 | |
407 vsubw.u8 q10, q10, d3 | |
408 vrshrn.i16 d4, q2, #3 | |
409 vrshrn.i16 d5, q10, #3 | |
/* select the clamped p1/q1 candidates (q7/q14) where the side masks are set, else keep p1/q1 */
410 vbsl q4, q7, q9 | |
411 vbsl q5, q14, q1 | |
/* clamp delta to [-q6, q6], then p0 += delta, q0 -= delta with unsigned saturation */
412 vneg.s8 q7, q6 | |
413 vmovl.u8 q14, d16 | |
414 vmin.s8 q2, q2, q6 | |
415 vmovl.u8 q6, d17 | |
416 vmax.s8 q2, q2, q7 | |
417 vmovl.u8 q11, d0 | |
418 vmovl.u8 q12, d1 | |
419 vaddw.s8 q14, q14, d4 | |
420 vaddw.s8 q6, q6, d5 | |
421 vsubw.s8 q11, q11, d4 | |
422 vsubw.s8 q12, q12, d5 | |
423 vqmovun.s16 d16, q14 | |
424 vqmovun.s16 d17, q6 | |
425 vqmovun.s16 d0, q11 | |
426 vqmovun.s16 d1, q12 | |
427 .endm | |
428 | |
/*
 * ff_h264_v_loop_filter_luma_neon: filter a 16-pixel-wide luma edge lying
 * between two rows.
 * r0 = pointer to the first row below the edge (16-byte aligned, :128),
 * r1 = stride, r2 = alpha, r3 = beta; the control-byte pointer is read
 * from [sp] by h264_loop_filter_start, which may also return early.
 */
429 function ff_h264_v_loop_filter_luma_neon, export=1 | |
430 h264_loop_filter_start | |
431 | |
/* rows at r0, r0+stride, r0+2*stride -> q0, q1, q2 */
432 vld1.64 {d0, d1}, [r0,:128], r1 | |
433 vld1.64 {d2, d3}, [r0,:128], r1 | |
434 vld1.64 {d4, d5}, [r0,:128], r1 | |
/* step back 6 rows in total: r0 now points 3 rows above the edge */
435 sub r0, r0, r1, lsl #2 | |
436 sub r0, r0, r1, lsl #1 | |
/* rows -3, -2, -1 -> p2 (q10), p1 (q9), p0 (q8) */
437 vld1.64 {d20,d21}, [r0,:128], r1 | |
438 vld1.64 {d18,d19}, [r0,:128], r1 | |
439 vld1.64 {d16,d17}, [r0,:128], r1 | |
440 | |
441 align_push_regs | |
442 | |
443 h264_loop_filter_luma | |
444 | |
/* write back rows -2 .. +1: new p1 (q4), p0 (q8), q0 (q0), q1 (q5) */
445 sub r0, r0, r1, lsl #1 | |
446 vst1.64 {d8, d9}, [r0,:128], r1 | |
447 vst1.64 {d16,d17}, [r0,:128], r1 | |
448 vst1.64 {d0, d1}, [r0,:128], r1 | |
449 vst1.64 {d10,d11}, [r0,:128] | |
450 | |
451 align_pop_regs | |
452 bx lr | |
453 .endfunc | |
454 | |
/*
 * ff_h264_h_loop_filter_luma_neon: filter a 16-row-tall luma edge lying
 * between two columns.
 * r0 = pointer to the first pixel right of the edge in the top row,
 * r1 = stride, r2 = alpha, r3 = beta; the control-byte pointer is read
 * from [sp] by h264_loop_filter_start, which may also return early.
 * Eight bytes (4 on each side of the edge) are loaded from each of the 16
 * rows and transposed so that the vertical-edge macro can be reused.
 */
455 function ff_h264_h_loop_filter_luma_neon, export=1 | |
456 h264_loop_filter_start | |
457 | |
/* start 4 pixels left of the edge; rows 0-7 fill the low D halves, rows 8-15 the high halves */
458 sub r0, r0, #4 | |
459 vld1.64 {d6}, [r0], r1 | |
460 vld1.64 {d20}, [r0], r1 | |
461 vld1.64 {d18}, [r0], r1 | |
462 vld1.64 {d16}, [r0], r1 | |
463 vld1.64 {d0}, [r0], r1 | |
464 vld1.64 {d2}, [r0], r1 | |
465 vld1.64 {d4}, [r0], r1 | |
466 vld1.64 {d26}, [r0], r1 | |
467 vld1.64 {d7}, [r0], r1 | |
468 vld1.64 {d21}, [r0], r1 | |
469 vld1.64 {d19}, [r0], r1 | |
470 vld1.64 {d17}, [r0], r1 | |
471 vld1.64 {d1}, [r0], r1 | |
472 vld1.64 {d3}, [r0], r1 | |
473 vld1.64 {d5}, [r0], r1 | |
474 vld1.64 {d27}, [r0], r1 | |
475 | |
/* after the transpose each register holds one pixel column: q10 = p2, q9 = p1, q8 = p0, q0..q2 = q0..q2 */
8338 | 476 transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13 |
8337 | 477 |
478 align_push_regs | |
479 | |
480 h264_loop_filter_luma | |
481 | |
/* regroup the four modified columns (p1, p0, q0, q1) so that each 32-bit lane holds the 4 bytes of one row */
9864 | 482 transpose_4x4 q4, q8, q0, q5 |
8337 | 483 |
/* rewind 16 rows and move to 2 pixels left of the edge, then store 4 bytes per row */
484 sub r0, r0, r1, lsl #4 | |
9864 | 485 add r0, r0, #2 |
486 vst1.32 {d8[0]}, [r0], r1 | |
487 vst1.32 {d16[0]}, [r0], r1 | |
488 vst1.32 {d0[0]}, [r0], r1 | |
489 vst1.32 {d10[0]}, [r0], r1 | |
490 vst1.32 {d8[1]}, [r0], r1 | |
491 vst1.32 {d16[1]}, [r0], r1 | |
492 vst1.32 {d0[1]}, [r0], r1 | |
493 vst1.32 {d10[1]}, [r0], r1 | |
494 vst1.32 {d9[0]}, [r0], r1 | |
495 vst1.32 {d17[0]}, [r0], r1 | |
496 vst1.32 {d1[0]}, [r0], r1 | |
497 vst1.32 {d11[0]}, [r0], r1 | |
498 vst1.32 {d9[1]}, [r0], r1 | |
499 vst1.32 {d17[1]}, [r0], r1 | |
500 vst1.32 {d1[1]}, [r0], r1 | |
501 vst1.32 {d11[1]}, [r0], r1 | |
8337 | 502 |
503 align_pop_regs | |
504 bx lr | |
505 .endfunc | |
506 | |
/*
 * h264_loop_filter_chroma: filter 8 pixel positions across one chroma edge.
 *
 * In (one byte lane per position, as named by the inline comments):
 *   d18 = p1, d16 = p0, d0 = q0, d2 = q1
 *   r2 = alpha, r3 = beta
 *   d24[0] = four control bytes loaded by h264_loop_filter_start; each one
 *            is expanded to two adjacent lanes of d24 (vmovl.u8 + vsli.16)
 * Out: d16 = new p0, d0 = new q0.
 * Clobbers q2, q11-q15; no callee-saved NEON register is touched.
 */
507 .macro h264_loop_filter_chroma | |
508 vdup.8 d22, r2 @ alpha | |
509 vmovl.u8 q12, d24 | |
510 vabd.u8 d26, d16, d0 @ abs(p0 - q0) | |
511 vmovl.u8 q2, d0 | |
512 vabd.u8 d28, d18, d16 @ abs(p1 - p0) | |
513 vsubw.u8 q2, q2, d16 | |
514 vsli.16 d24, d24, #8 | |
515 vshl.i16 q2, q2, #2 | |
516 vabd.u8 d30, d2, d0 @ abs(q1 - q0) | |
517 vaddw.u8 q2, q2, d18 | |
518 vclt.u8 d26, d26, d22 @ < alpha | |
519 vsubw.u8 q2, q2, d2 | |
520 vdup.8 d22, r3 @ beta | |
/* d25 = lanes whose control byte is negative; excluded from the filter mask d26 by the vbic */
521 vclt.s8 d25, d24, #0 | |
/* d4 = (((q0 - p0) << 2) + p1 - q1 + 4) >> 3 */
522 vrshrn.i16 d4, q2, #3 | |
523 vclt.u8 d28, d28, d22 @ < beta | |
524 vbic d26, d26, d25 | |
525 vclt.u8 d30, d30, d22 @ < beta | |
526 vand d26, d26, d28 | |
527 vneg.s8 d25, d24 | |
528 vand d26, d26, d30 | |
/* clamp the delta to [-d24, d24]; lanes outside the mask get a zero delta before the lower clamp */
529 vmin.s8 d4, d4, d24 | |
530 vmovl.u8 q14, d16 | |
531 vand d4, d4, d26 | |
532 vmax.s8 d4, d4, d25 | |
533 vmovl.u8 q11, d0 | |
534 vaddw.s8 q14, q14, d4 | |
535 vsubw.s8 q11, q11, d4 | |
536 vqmovun.s16 d16, q14 | |
537 vqmovun.s16 d0, q11 | |
538 .endm | |
539 | |
/*
 * ff_h264_v_loop_filter_chroma_neon: filter an 8-pixel-wide chroma edge
 * lying between two rows.
 * r0 = pointer to the first row below the edge (8-byte aligned, :64),
 * r1 = stride, r2 = alpha, r3 = beta; the control-byte pointer is read
 * from [sp] by h264_loop_filter_start, which may also return early.
 */
540 function ff_h264_v_loop_filter_chroma_neon, export=1 | |
541 h264_loop_filter_start | |
542 | |
/* load rows -2, -1, 0, +1 as p1, p0, q0, q1; the last load does not advance r0 */
543 sub r0, r0, r1, lsl #1 | |
544 vld1.64 {d18}, [r0,:64], r1 | |
545 vld1.64 {d16}, [r0,:64], r1 | |
546 vld1.64 {d0}, [r0,:64], r1 | |
547 vld1.64 {d2}, [r0,:64] | |
548 | |
549 h264_loop_filter_chroma | |
550 | |
/* r0 was left on row +1; go back to row -1 and store the new p0 and q0 */
551 sub r0, r0, r1, lsl #1 | |
552 vst1.64 {d16}, [r0,:64], r1 | |
553 vst1.64 {d0}, [r0,:64], r1 | |
554 | |
555 bx lr | |
556 .endfunc | |
557 | |
/*
 * ff_h264_h_loop_filter_chroma_neon: filter an 8-row-tall chroma edge
 * lying between two columns.
 * r0 = pointer to the first pixel right of the edge in the top row,
 * r1 = stride, r2 = alpha, r3 = beta; the control-byte pointer is read
 * from [sp] by h264_loop_filter_start, which may also return early.
 * Four bytes (2 on each side of the edge) are loaded per row, transposed
 * into pixel columns, filtered, transposed back and stored.
 */
558 function ff_h264_h_loop_filter_chroma_neon, export=1 | |
559 h264_loop_filter_start | |
560 | |
561 sub r0, r0, #2 | |
562 vld1.32 {d18[0]}, [r0], r1 | |
563 vld1.32 {d16[0]}, [r0], r1 | |
564 vld1.32 {d0[0]}, [r0], r1 | |
565 vld1.32 {d2[0]}, [r0], r1 | |
566 vld1.32 {d18[1]}, [r0], r1 | |
567 vld1.32 {d16[1]}, [r0], r1 | |
568 vld1.32 {d0[1]}, [r0], r1 | |
569 vld1.32 {d2[1]}, [r0], r1 | |
570 | |
/* 4x4 byte transposes: afterwards d18 = p1, d16 = p0, d0 = q0, d2 = q1 columns */
571 vtrn.16 d18, d0 | |
572 vtrn.16 d16, d2 | |
573 vtrn.8 d18, d16 | |
574 vtrn.8 d0, d2 | |
575 | |
576 h264_loop_filter_chroma | |
577 | |
/* the same VTRN sequence applied again restores the row-major layout */
578 vtrn.16 d18, d0 | |
579 vtrn.16 d16, d2 | |
580 vtrn.8 d18, d16 | |
581 vtrn.8 d0, d2 | |
582 | |
/* rewind the 8 rows consumed by the loads and write all 4 bytes of each row back */
583 sub r0, r0, r1, lsl #3 | |
584 vst1.32 {d18[0]}, [r0], r1 | |
585 vst1.32 {d16[0]}, [r0], r1 | |
586 vst1.32 {d0[0]}, [r0], r1 | |
587 vst1.32 {d2[0]}, [r0], r1 | |
588 vst1.32 {d18[1]}, [r0], r1 | |
589 vst1.32 {d16[1]}, [r0], r1 | |
590 vst1.32 {d0[1]}, [r0], r1 | |
591 vst1.32 {d2[1]}, [r0], r1 | |
592 | |
593 bx lr | |
594 .endfunc | |
8338 | 595 |
596 /* H.264 qpel MC */ | |
597 | |
/*
 * lowpass_const: load the filter multipliers used by lowpass_8 and
 * lowpass_8_1 into d6.  \r is a scratch core register; it receives
 * 0x00140005, so the 16-bit lanes become d6[0] = 5 and d6[1] = 20.
 */
598 .macro lowpass_const r | |
599 movw \r, #5 | |
600 movt \r, #20 | |
601 vmov.32 d6[0], \r | |
602 .endm | |
603 | |
/*
 * lowpass_8: apply the 6-tap filter
 *     s[0] - 5*s[1] + 20*s[2] + 20*s[3] - 5*s[4] + s[5]
 * horizontally to two rows, producing 8 results per row.
 *
 *   \r0:\r1  first row, 16 consecutive source bytes in two D registers
 *   \r2:\r3  second row, same layout
 *   \d0, \d1 destinations
 *   narrow=1 (default): sums are built in q0 / q8, then rounded, shifted
 *            right by 5 and saturated to unsigned 8 bit into the D
 *            registers \d0 / \d1
 *   narrow=0: \d0 / \d1 are Q registers that receive the raw 16-bit sums
 *
 * Requires d6[0] = 5 and d6[1] = 20 (see lowpass_const).
 * Clobbers q1, q2, q9, q10, q15, plus q0 and q8 when narrow=1.
 */
604 .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1 | |
605 .if \narrow | |
606 t0 .req q0 | |
607 t1 .req q8 | |
608 .else | |
609 t0 .req \d0 | |
610 t1 .req \d1 | |
611 .endif | |
/* first row: q1 = s[2]+s[3], q2 = s[1]+s[4], t0 = s[0]+s[5]; the second row is interleaved below */
612 vext.8 d2, \r0, \r1, #2 | |
613 vext.8 d3, \r0, \r1, #3 | |
614 vaddl.u8 q1, d2, d3 | |
615 vext.8 d4, \r0, \r1, #1 | |
616 vext.8 d5, \r0, \r1, #4 | |
617 vaddl.u8 q2, d4, d5 | |
618 vext.8 d30, \r0, \r1, #5 | |
619 vaddl.u8 t0, \r0, d30 | |
620 vext.8 d18, \r2, \r3, #2 | |
621 vmla.i16 t0, q1, d6[1] | |
622 vext.8 d19, \r2, \r3, #3 | |
623 vaddl.u8 q9, d18, d19 | |
624 vext.8 d20, \r2, \r3, #1 | |
625 vmls.i16 t0, q2, d6[0] | |
626 vext.8 d21, \r2, \r3, #4 | |
627 vaddl.u8 q10, d20, d21 | |
628 vext.8 d31, \r2, \r3, #5 | |
629 vaddl.u8 t1, \r2, d31 | |
630 vmla.i16 t1, q9, d6[1] | |
631 vmls.i16 t1, q10, d6[0] | |
632 .if \narrow | |
633 vqrshrun.s16 \d0, t0, #5 | |
634 vqrshrun.s16 \d1, t1, #5 | |
635 .endif | |
636 .unreq t0 | |
637 .unreq t1 | |
638 .endm | |
639 | |
/*
 * lowpass_8_1: single-row version of lowpass_8.
 *   \r0:\r1  16 consecutive source bytes in two D registers
 *   \d0      destination: a D register receiving 8 rounded, saturated
 *            bytes when narrow=1 (default, sums built in q0), or a Q
 *            register receiving the raw 16-bit sums when narrow=0
 * Requires d6[0] = 5 and d6[1] = 20 (see lowpass_const).
 * Clobbers q1, q2, d30, plus q0 when narrow=1.
 */
640 .macro lowpass_8_1 r0, r1, d0, narrow=1 | |
641 .if \narrow | |
642 t0 .req q0 | |
643 .else | |
644 t0 .req \d0 | |
645 .endif | |
646 vext.8 d2, \r0, \r1, #2 | |
647 vext.8 d3, \r0, \r1, #3 | |
648 vaddl.u8 q1, d2, d3 | |
649 vext.8 d4, \r0, \r1, #1 | |
650 vext.8 d5, \r0, \r1, #4 | |
651 vaddl.u8 q2, d4, d5 | |
652 vext.8 d30, \r0, \r1, #5 | |
653 vaddl.u8 t0, \r0, d30 | |
654 vmla.i16 t0, q1, d6[1] | |
655 vmls.i16 t0, q2, d6[0] | |
656 .if \narrow | |
657 vqrshrun.s16 \d0, t0, #5 | |
658 .endif | |
659 .unreq t0 | |
660 .endm | |
661 | |
/*
 * lowpass_8.16: second pass of the two-dimensional filter.  Applies the
 * same 6-tap kernel (1, -5, 20, 20, -5, 1) to 16-bit intermediate values
 * using 32-bit arithmetic, and produces 8 output bytes.
 *
 *   \r0, \r1  Q registers holding 16 consecutive 16-bit inputs
 *             (\r1 is overwritten with elements 5..12)
 *   \l0, \h0  low / high D halves of \r0
 *   \l1, \h1  low / high D halves of \r1
 *   \d        D register receiving the result
 *
 * The multiplications by 20 and 5 are done with shifts and adds
 * (x*16 + x*4 and x + x*4).  The sum is rounded and shifted right by 10,
 * then saturated to unsigned 8 bit.
 * Clobbers q0-q3, q8-q10, q15 and \r1 (so d6 no longer holds the
 * lowpass_const values afterwards).
 */
662 .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d | |
663 vext.16 q1, \r0, \r1, #2 | |
664 vext.16 q0, \r0, \r1, #3 | |
665 vaddl.s16 q9, d2, d0 | |
666 vext.16 q2, \r0, \r1, #1 | |
667 vaddl.s16 q1, d3, d1 | |
668 vext.16 q3, \r0, \r1, #4 | |
669 vaddl.s16 q10, d4, d6 | |
670 vext.16 \r1, \r0, \r1, #5 | |
671 vaddl.s16 q2, d5, d7 | |
672 vaddl.s16 q0, \h0, \h1 | |
673 vaddl.s16 q8, \l0, \l1 | |
674 | |
/* low four outputs: q9 = 20*(s[2]+s[3]), q10 = 5*(s[1]+s[4]) */
675 vshl.i32 q3, q9, #4 | |
676 vshl.i32 q9, q9, #2 | |
677 vshl.i32 q15, q10, #2 | |
678 vadd.i32 q9, q9, q3 | |
679 vadd.i32 q10, q10, q15 | |
680 | |
/* high four outputs: q1 = 20*(s[2]+s[3]), q2 = 5*(s[1]+s[4]) */
681 vshl.i32 q3, q1, #4 | |
682 vshl.i32 q1, q1, #2 | |
683 vshl.i32 q15, q2, #2 | |
684 vadd.i32 q1, q1, q3 | |
685 vadd.i32 q2, q2, q15 | |
686 | |
687 vadd.i32 q9, q9, q8 | |
688 vsub.i32 q9, q9, q10 | |
689 | |
690 vadd.i32 q1, q1, q0 | |
691 vsub.i32 q1, q1, q2 | |
692 | |
693 vrshrn.s32 d18, q9, #10 | |
694 vrshrn.s32 d19, q1, #10 | |
695 | |
696 vqmovun.s16 \d, q9 | |
697 .endm | |
698 | |
/*
 * put_h264_qpel16_h_lowpass_neon_packed: horizontal 6-tap filter over a
 * 16-column by 16-row area, done as two 8-column passes.
 * In: r0 = dst, r1 = src, r2 = src stride.
 * The destination stride is forced to 8 and r0 is not rewound between the
 * passes, so the output is two consecutive 8x16 blocks (left half, then
 * right half).
 * The return address is kept in r4 (clobbered); r3 and ip are overwritten.
 * The second pass is a tail call.
 */
699 function put_h264_qpel16_h_lowpass_neon_packed | |
700 mov r4, lr | |
701 mov ip, #16 | |
702 mov r3, #8 | |
703 bl put_h264_qpel8_h_lowpass_neon | |
/* back to the first source row, 8 columns to the right */
704 sub r1, r1, r2, lsl #4 | |
705 add r1, r1, #8 | |
706 mov ip, #16 | |
707 mov lr, r4 | |
708 b put_h264_qpel8_h_lowpass_neon | |
709 .endfunc | |
710 | |
/*
 * put_h264_qpel16_h_lowpass_neon: horizontal 6-tap filter over a
 * 16-column by 16-row area.
 * In: r0 = dst, r1 = src, r2 = src stride, r3 = dst stride.
 * The left 8 columns are processed by a call; both pointers are then
 * rewound 16 rows and advanced 8 columns.  There is no return instruction
 * here: execution falls through into put_h264_qpel8_h_lowpass_neon, which
 * immediately follows, for the right 8 columns.
 */
711 function put_h264_qpel16_h_lowpass_neon | |
712 push {lr} | |
713 mov ip, #16 | |
714 bl put_h264_qpel8_h_lowpass_neon | |
715 sub r0, r0, r3, lsl #4 | |
716 sub r1, r1, r2, lsl #4 | |
717 add r0, r0, #8 | |
718 add r1, r1, #8 | |
719 mov ip, #16 | |
720 pop {lr} | |
721 .endfunc | |
722 | |
/*
 * put_h264_qpel8_h_lowpass_neon: horizontal 6-tap filter, 8 output pixels
 * per row, two rows per iteration.
 * In: r0 = dst (8-byte aligned, :64), r1 = src (16 bytes are read per
 *     row), r2 = src stride, r3 = dst stride, ip = number of rows (the
 *     loop subtracts 2 and exits on zero, so it must be a positive even
 *     number), d6 = constants from lowpass_const.
 * Out: r0 and r1 advanced by ip rows; ip = 0.
 */
723 function put_h264_qpel8_h_lowpass_neon | |
724 1: vld1.64 {d0, d1}, [r1], r2 | |
725 vld1.64 {d16,d17}, [r1], r2 | |
726 subs ip, ip, #2 | |
727 lowpass_8 d0, d1, d16, d17, d0, d16 | |
728 vst1.64 {d0}, [r0,:64], r3 | |
729 vst1.64 {d16}, [r0,:64], r3 | |
730 bne 1b | |
731 bx lr | |
732 .endfunc | |
733 | |
/*
 * put_h264_qpel16_h_lowpass_l2_neon: 16x16 version of the "l2" horizontal
 * filter (filter result averaged with a second source).
 * In: r0 = dst, r1 = src, r3 = second source, r2 = stride shared by all
 *     three pointers.
 * After the left half, all three pointers are rewound 16 rows and advanced
 * 8 columns.  There is no return instruction here: execution falls through
 * into put_h264_qpel8_h_lowpass_l2_neon, which immediately follows, for
 * the right half.
 */
734 function put_h264_qpel16_h_lowpass_l2_neon | |
735 push {lr} | |
736 mov ip, #16 | |
737 bl put_h264_qpel8_h_lowpass_l2_neon | |
738 sub r0, r0, r2, lsl #4 | |
739 sub r1, r1, r2, lsl #4 | |
740 sub r3, r3, r2, lsl #4 | |
741 add r0, r0, #8 | |
742 add r1, r1, #8 | |
743 add r3, r3, #8 | |
744 mov ip, #16 | |
745 pop {lr} | |
746 .endfunc | |
747 | |
/*
 * put_h264_qpel8_h_lowpass_l2_neon: horizontal 6-tap filter whose result
 * is combined with a second 8-pixel-wide source by a rounding average
 * (vrhadd).  Two rows per iteration.
 * In: r0 = dst (8-byte aligned, :64), r1 = src, r3 = second source,
 *     r2 = stride used for all three pointers, ip = number of rows
 *     (positive, even), d6 = constants from lowpass_const.
 */
748 function put_h264_qpel8_h_lowpass_l2_neon | |
749 1: vld1.64 {d0, d1}, [r1], r2 | |
750 vld1.64 {d16,d17}, [r1], r2 | |
751 vld1.64 {d28}, [r3], r2 | |
752 vld1.64 {d29}, [r3], r2 | |
753 subs ip, ip, #2 | |
754 lowpass_8 d0, d1, d16, d17, d0, d1 | |
755 vrhadd.u8 q0, q0, q14 | |
756 vst1.64 {d0}, [r0,:64], r2 | |
757 vst1.64 {d1}, [r0,:64], r2 | |
758 bne 1b | |
759 bx lr | |
760 .endfunc | |
761 | |
/*
 * put_h264_qpel16_v_lowpass_neon_packed: vertical 6-tap filter over a
 * 16x16 area as four 8x8 calls (top-left, bottom-left, top-right,
 * bottom-right).
 * In: r0 = dst, r1 = src, r3 = src stride.
 * The destination stride is forced to 8 and r0 is never rewound, so the
 * four 8x8 result blocks are written back to back.
 * Each 8x8 call advances r1 by 12 rows; subtracting 4 rows leaves it
 * 8 rows further down for the next block.
 * The return address is kept in r4 (clobbered); the last call is a tail call.
 */
762 function put_h264_qpel16_v_lowpass_neon_packed | |
763 mov r4, lr | |
764 mov r2, #8 | |
765 bl put_h264_qpel8_v_lowpass_neon | |
766 sub r1, r1, r3, lsl #2 | |
767 bl put_h264_qpel8_v_lowpass_neon | |
/* 12 rows consumed + 8 from the first block = 20 rows back up, then 8 columns right */
768 sub r1, r1, r3, lsl #4 | |
769 sub r1, r1, r3, lsl #2 | |
770 add r1, r1, #8 | |
771 bl put_h264_qpel8_v_lowpass_neon | |
772 sub r1, r1, r3, lsl #2 | |
773 mov lr, r4 | |
774 b put_h264_qpel8_v_lowpass_neon | |
775 .endfunc | |
776 | |
/*
 * put_h264_qpel16_v_lowpass_neon: vertical 6-tap filter over a 16x16 area.
 * In: r0 = dst, r2 = dst stride, r1 = src, r3 = src stride.
 * Three 8x8 blocks are done by calls (top-left, bottom-left, top-right);
 * after lr is restored from r4, execution falls through into
 * put_h264_qpel8_v_lowpass_neon, which immediately follows, for the
 * bottom-right block.  r4 is clobbered.
 */
777 function put_h264_qpel16_v_lowpass_neon | |
778 mov r4, lr | |
779 bl put_h264_qpel8_v_lowpass_neon | |
780 sub r1, r1, r3, lsl #2 | |
781 bl put_h264_qpel8_v_lowpass_neon | |
/* dst: back 16 rows, 8 columns right; src: back 20 rows, 8 columns right */
782 sub r0, r0, r2, lsl #4 | |
783 add r0, r0, #8 | |
784 sub r1, r1, r3, lsl #4 | |
785 sub r1, r1, r3, lsl #2 | |
786 add r1, r1, #8 | |
787 bl put_h264_qpel8_v_lowpass_neon | |
788 sub r1, r1, r3, lsl #2 | |
789 mov lr, r4 | |
790 .endfunc | |
791 | |
/*
 * put_h264_qpel8_v_lowpass_neon: vertical 6-tap filter producing an 8x8
 * block.
 * In: r0 = dst (8-byte aligned, :64), r2 = dst stride,
 *     r1 = first of the 13 source rows read, r3 = src stride,
 *     d6 = constants from lowpass_const.
 * The 13 rows are transposed so that each pixel column lies along a
 * register pair, the horizontal lowpass_8 macro is applied, and the
 * result is transposed back before being stored.
 * Out: r1 advanced by 12 rows (the last load has no writeback), r0 by 8.
 * Uses d8-d15 without saving them; the callers in this file take care of
 * that.
 */
792 function put_h264_qpel8_v_lowpass_neon | |
793 vld1.64 {d8}, [r1], r3 | |
794 vld1.64 {d10}, [r1], r3 | |
795 vld1.64 {d12}, [r1], r3 | |
796 vld1.64 {d14}, [r1], r3 | |
797 vld1.64 {d22}, [r1], r3 | |
798 vld1.64 {d24}, [r1], r3 | |
799 vld1.64 {d26}, [r1], r3 | |
800 vld1.64 {d28}, [r1], r3 | |
801 vld1.64 {d9}, [r1], r3 | |
802 vld1.64 {d11}, [r1], r3 | |
803 vld1.64 {d13}, [r1], r3 | |
804 vld1.64 {d15}, [r1], r3 | |
805 vld1.64 {d23}, [r1] | |
806 | |
/* rows 0-7 sit in the low halves, rows 8-12 in the high halves; d25, d27, d29 were never loaded */
807 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 | |
808 lowpass_8 d8, d9, d10, d11, d8, d10 | |
809 lowpass_8 d12, d13, d14, d15, d12, d14 | |
810 lowpass_8 d22, d23, d24, d25, d22, d24 | |
811 lowpass_8 d26, d27, d28, d29, d26, d28 | |
812 transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28 | |
813 | |
814 vst1.64 {d8}, [r0,:64], r2 | |
815 vst1.64 {d10}, [r0,:64], r2 | |
816 vst1.64 {d12}, [r0,:64], r2 | |
817 vst1.64 {d14}, [r0,:64], r2 | |
818 vst1.64 {d22}, [r0,:64], r2 | |
819 vst1.64 {d24}, [r0,:64], r2 | |
820 vst1.64 {d26}, [r0,:64], r2 | |
821 vst1.64 {d28}, [r0,:64], r2 | |
822 | |
823 bx lr | |
824 .endfunc | |
825 | |
/*
 * put_h264_qpel16_v_lowpass_l2_neon: 16x16 version of the "l2" vertical
 * filter (filter result averaged with a second source).
 * In: r0 = dst and r1 = src, both using stride r3;
 *     ip = second source, using stride r2.
 * Three 8x8 blocks are done by calls; after lr is restored from r4,
 * execution falls through into put_h264_qpel8_v_lowpass_l2_neon, which
 * immediately follows, for the last block.  r4 is clobbered.
 */
826 function put_h264_qpel16_v_lowpass_l2_neon | |
827 mov r4, lr | |
828 bl put_h264_qpel8_v_lowpass_l2_neon | |
829 sub r1, r1, r3, lsl #2 | |
830 bl put_h264_qpel8_v_lowpass_l2_neon | |
/* dst and second source: back 16 rows, 8 columns right; src: back 20 rows, 8 columns right */
831 sub r0, r0, r3, lsl #4 | |
832 sub ip, ip, r2, lsl #4 | |
833 add r0, r0, #8 | |
834 add ip, ip, #8 | |
835 sub r1, r1, r3, lsl #4 | |
836 sub r1, r1, r3, lsl #2 | |
837 add r1, r1, #8 | |
838 bl put_h264_qpel8_v_lowpass_l2_neon | |
839 sub r1, r1, r3, lsl #2 | |
840 mov lr, r4 | |
841 .endfunc | |
842 | |
/*
 * put_h264_qpel8_v_lowpass_l2_neon: vertical 6-tap filter producing an 8x8
 * block, which is then combined with a second 8x8 source by a rounding
 * average (vrhadd).
 * In: r0 = dst (8-byte aligned, :64) and r1 = src (13 rows are read),
 *     both using stride r3; ip = second source, using stride r2;
 *     d6 = constants from lowpass_const.
 * Out: r1 advanced by 12 rows, r0 and ip by 8 rows.
 * Uses d8-d15 without saving them; the callers in this file take care of
 * that.
 */
843 function put_h264_qpel8_v_lowpass_l2_neon | |
844 vld1.64 {d8}, [r1], r3 | |
845 vld1.64 {d10}, [r1], r3 | |
846 vld1.64 {d12}, [r1], r3 | |
847 vld1.64 {d14}, [r1], r3 | |
848 vld1.64 {d22}, [r1], r3 | |
849 vld1.64 {d24}, [r1], r3 | |
850 vld1.64 {d26}, [r1], r3 | |
851 vld1.64 {d28}, [r1], r3 | |
852 vld1.64 {d9}, [r1], r3 | |
853 vld1.64 {d11}, [r1], r3 | |
854 vld1.64 {d13}, [r1], r3 | |
855 vld1.64 {d15}, [r1] | |
856 vld1.64 {d23}, [r1] | |
857 | |
/* results are written to adjacent D registers so that whole Q registers can be averaged below */
858 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 | |
859 lowpass_8 d8, d9, d10, d11, d8, d9 | |
860 lowpass_8 d12, d13, d14, d15, d12, d13 | |
861 lowpass_8 d22, d23, d24, d25, d22, d23 | |
862 lowpass_8 d26, d27, d28, d29, d26, d27 | |
863 transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27 | |
864 | |
865 vld1.64 {d0}, [ip], r2 | |
866 vld1.64 {d1}, [ip], r2 | |
867 vld1.64 {d2}, [ip], r2 | |
868 vld1.64 {d3}, [ip], r2 | |
869 vld1.64 {d4}, [ip], r2 | |
870 vrhadd.u8 q0, q0, q4 | |
871 vld1.64 {d5}, [ip], r2 | |
872 vrhadd.u8 q1, q1, q6 | |
/* q4 (d8/d9) has been consumed by the first vrhadd, so q5 is free to take the last two rows */
873 vld1.64 {d10}, [ip], r2 | |
874 vrhadd.u8 q2, q2, q11 | |
875 vld1.64 {d11}, [ip], r2 | |
876 | |
877 vst1.64 {d0}, [r0,:64], r3 | |
878 vst1.64 {d1}, [r0,:64], r3 | |
879 vrhadd.u8 q5, q5, q13 | |
880 vst1.64 {d2}, [r0,:64], r3 | |
881 vst1.64 {d3}, [r0,:64], r3 | |
882 vst1.64 {d4}, [r0,:64], r3 | |
883 vst1.64 {d5}, [r0,:64], r3 | |
884 vst1.64 {d10}, [r0,:64], r3 | |
885 vst1.64 {d11}, [r0,:64], r3 | |
886 | |
887 bx lr | |
888 .endfunc | |
889 | |
/*
 * put_h264_qpel8_hv_lowpass_neon_top: shared core of the two-dimensional
 * (horizontal then vertical) 6-tap filter for one 8x8 block.
 * In: r1 = src (13 rows of 16 bytes are read), r3 = src stride,
 *     r4 = scratch buffer, 16-byte aligned (:128); the stores below touch
 *     the 12*16 bytes starting at r4.
 * Out: eight 8-byte result rows in d12, d13, d14, d15, d8, d9, d10, d11
 *      (the order in which the wrappers in this file store them);
 *      r1 advanced by 12 rows.
 * Step 1 filters 13 rows horizontally, keeping 16-bit results (12 rows go
 * to the scratch buffer, the 13th stays in q12).  Step 2 reloads them,
 * transposes, and runs lowpass_8.16 along the former columns.
 * Clobbers ip and q0-q15 (including the callee-saved d8-d15); r4 ends up
 * pointing into the scratch buffer, not at its start.
 */
890 function put_h264_qpel8_hv_lowpass_neon_top | |
/* ip is first used as scratch for the constant load, then as the row counter */
891 lowpass_const ip | |
892 mov ip, #12 | |
893 1: vld1.64 {d0, d1}, [r1], r3 | |
894 vld1.64 {d16,d17}, [r1], r3 | |
895 subs ip, ip, #2 | |
896 lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0 | |
897 vst1.64 {d22-d25}, [r4,:128]! | |
898 bne 1b | |
899 | |
/* 13th row: filtered result is kept in q12 only */
900 vld1.64 {d0, d1}, [r1] | |
901 lowpass_8_1 d0, d1, q12, narrow=0 | |
902 | |
/* read the 12 stored rows back, last row first (ip = -16 is the step) */
903 mov ip, #-16 | |
904 add r4, r4, ip | |
905 vld1.64 {d30,d31}, [r4,:128], ip | |
906 vld1.64 {d20,d21}, [r4,:128], ip | |
907 vld1.64 {d18,d19}, [r4,:128], ip | |
908 vld1.64 {d16,d17}, [r4,:128], ip | |
909 vld1.64 {d14,d15}, [r4,:128], ip | |
910 vld1.64 {d12,d13}, [r4,:128], ip | |
911 vld1.64 {d10,d11}, [r4,:128], ip | |
912 vld1.64 {d8, d9}, [r4,:128], ip | |
913 vld1.64 {d6, d7}, [r4,:128], ip | |
914 vld1.64 {d4, d5}, [r4,:128], ip | |
915 vld1.64 {d2, d3}, [r4,:128], ip | |
916 vld1.64 {d0, d1}, [r4,:128] | |
917 | |
918 swap4 d1, d3, d5, d7, d8, d10, d12, d14 | |
919 transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7 | |
920 | |
921 swap4 d17, d19, d21, d31, d24, d26, d28, d22 | |
922 transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11 | |
923 | |
/* spill eight of the transposed registers to the scratch buffer; they are reloaded pairwise below */
924 vst1.64 {d30,d31}, [r4,:128]! | |
925 vst1.64 {d6, d7}, [r4,:128]! | |
926 vst1.64 {d20,d21}, [r4,:128]! | |
927 vst1.64 {d4, d5}, [r4,:128]! | |
928 vst1.64 {d18,d19}, [r4,:128]! | |
929 vst1.64 {d2, d3}, [r4,:128]! | |
930 vst1.64 {d16,d17}, [r4,:128]! | |
931 vst1.64 {d0, d1}, [r4,:128] | |
932 | |
933 lowpass_8.16 q4, q12, d8, d9, d24, d25, d8 | |
934 lowpass_8.16 q5, q13, d10, d11, d26, d27, d9 | |
935 lowpass_8.16 q6, q14, d12, d13, d28, d29, d10 | |
936 lowpass_8.16 q7, q11, d14, d15, d22, d23, d11 | |
937 | |
/* ip is still -16: walk backwards through the spilled registers */
938 vld1.64 {d16,d17}, [r4,:128], ip | |
939 vld1.64 {d30,d31}, [r4,:128], ip | |
940 lowpass_8.16 q8, q15, d16, d17, d30, d31, d12 | |
941 vld1.64 {d16,d17}, [r4,:128], ip | |
942 vld1.64 {d30,d31}, [r4,:128], ip | |
943 lowpass_8.16 q8, q15, d16, d17, d30, d31, d13 | |
944 vld1.64 {d16,d17}, [r4,:128], ip | |
945 vld1.64 {d30,d31}, [r4,:128], ip | |
946 lowpass_8.16 q8, q15, d16, d17, d30, d31, d14 | |
947 vld1.64 {d16,d17}, [r4,:128], ip | |
948 vld1.64 {d30,d31}, [r4,:128] | |
949 lowpass_8.16 q8, q15, d16, d17, d30, d31, d15 | |
950 | |
951 transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11 | |
952 | |
953 bx lr | |
954 .endfunc | |
955 | |
/*
 * put_h264_qpel8_hv_lowpass_neon: two-dimensional 6-tap filter for one 8x8
 * block; stores the result of put_h264_qpel8_hv_lowpass_neon_top.
 * In: r0 = dst (8-byte aligned, :64), r2 = dst stride; r1, r3 and r4 as
 *     required by put_h264_qpel8_hv_lowpass_neon_top.
 * The return address is kept in r10, which is clobbered.
 */
956 function put_h264_qpel8_hv_lowpass_neon | |
957 mov r10, lr | |
958 bl put_h264_qpel8_hv_lowpass_neon_top | |
959 vst1.64 {d12}, [r0,:64], r2 | |
960 vst1.64 {d13}, [r0,:64], r2 | |
961 vst1.64 {d14}, [r0,:64], r2 | |
962 vst1.64 {d15}, [r0,:64], r2 | |
963 vst1.64 {d8}, [r0,:64], r2 | |
964 vst1.64 {d9}, [r0,:64], r2 | |
965 vst1.64 {d10}, [r0,:64], r2 | |
966 vst1.64 {d11}, [r0,:64], r2 | |
967 | |
968 mov lr, r10 | |
969 bx lr | |
970 .endfunc | |
971 | |
/*
 * put_h264_qpel8_hv_lowpass_l2_neon: two-dimensional 6-tap filter for one
 * 8x8 block, combined with a second source by a rounding average (vrhadd).
 * In: r0 = dst (8-byte aligned, :64), r3 = stride for both src and dst,
 *     r2 = second source: 64 contiguous bytes (eight 8-byte rows), 16-byte
 *          aligned (:128); advanced by 64 on return,
 *     r1 and r4 as required by put_h264_qpel8_hv_lowpass_neon_top.
 * The return address is kept in r10, which is clobbered.
 */
972 function put_h264_qpel8_hv_lowpass_l2_neon | |
973 mov r10, lr | |
974 bl put_h264_qpel8_hv_lowpass_neon_top | |
975 | |
/* result rows 0-3 are in q6/q7 (d12-d15), rows 4-7 in q4/q5 (d8-d11) */
976 vld1.64 {d0, d1}, [r2,:128]! | |
977 vld1.64 {d2, d3}, [r2,:128]! | |
978 vrhadd.u8 q0, q0, q6 | |
979 vld1.64 {d4, d5}, [r2,:128]! | |
980 vrhadd.u8 q1, q1, q7 | |
981 vld1.64 {d6, d7}, [r2,:128]! | |
982 vrhadd.u8 q2, q2, q4 | |
983 | |
984 vst1.64 {d0}, [r0,:64], r3 | |
985 vrhadd.u8 q3, q3, q5 | |
986 vst1.64 {d1}, [r0,:64], r3 | |
987 vst1.64 {d2}, [r0,:64], r3 | |
988 vst1.64 {d3}, [r0,:64], r3 | |
989 vst1.64 {d4}, [r0,:64], r3 | |
990 vst1.64 {d5}, [r0,:64], r3 | |
991 vst1.64 {d6}, [r0,:64], r3 | |
992 vst1.64 {d7}, [r0,:64], r3 | |
993 | |
994 mov lr, r10 | |
995 bx lr | |
996 .endfunc | |
997 | |
/*
 * put_h264_qpel16_hv_lowpass_neon: two-dimensional 6-tap filter over a
 * 16x16 area as four 8x8 blocks (top-left, bottom-left, top-right,
 * bottom-right).
 * In: r0 = dst, r2 = dst stride, r1 = src, r3 = src stride, r4 = scratch
 *     buffer as required by the 8x8 routine.
 * Each 8x8 call advances r1 by 12 rows; subtracting 4 rows leaves it
 * 8 rows further down.  The return address is kept in r9 (clobbered; the
 * callee uses r10 for its own); the last block is a tail call.
 */
998 function put_h264_qpel16_hv_lowpass_neon | |
999 mov r9, lr | |
1000 bl put_h264_qpel8_hv_lowpass_neon | |
1001 sub r1, r1, r3, lsl #2 | |
1002 bl put_h264_qpel8_hv_lowpass_neon | |
/* src: back 20 rows, 8 columns right; dst: back 16 rows, 8 columns right */
1003 sub r1, r1, r3, lsl #4 | |
1004 sub r1, r1, r3, lsl #2 | |
1005 add r1, r1, #8 | |
1006 sub r0, r0, r2, lsl #4 | |
1007 add r0, r0, #8 | |
1008 bl put_h264_qpel8_hv_lowpass_neon | |
1009 sub r1, r1, r3, lsl #2 | |
1010 mov lr, r9 | |
1011 b put_h264_qpel8_hv_lowpass_neon | |
1012 .endfunc | |
1013 | |
/*
 * put_h264_qpel16_hv_lowpass_l2_neon: 16x16 version of the "l2"
 * two-dimensional filter, done as four 8x8 blocks.
 * In: r0 = dst, r1 = src, r3 = stride for both, r4 = scratch buffer.
 * The second source is taken from the 256 bytes immediately below r4
 * (r2 = r4 - 256); the 8x8 routine consumes it 64 bytes at a time, so it
 * is expected to hold the four 8x8 blocks back to back -- presumably
 * written by one of the *_packed routines; confirm against the callers.
 * The return address is kept in r9 (clobbered); the last block is a tail
 * call.
 */
1014 function put_h264_qpel16_hv_lowpass_l2_neon | |
1015 mov r9, lr | |
1016 sub r2, r4, #256 | |
1017 bl put_h264_qpel8_hv_lowpass_l2_neon | |
1018 sub r1, r1, r3, lsl #2 | |
1019 bl put_h264_qpel8_hv_lowpass_l2_neon | |
/* src: back 20 rows, 8 columns right; dst: back 16 rows, 8 columns right */
1020 sub r1, r1, r3, lsl #4 | |
1021 sub r1, r1, r3, lsl #2 | |
1022 add r1, r1, #8 | |
1023 sub r0, r0, r3, lsl #4 | |
1024 add r0, r0, #8 | |
1025 bl put_h264_qpel8_hv_lowpass_l2_neon | |
1026 sub r1, r1, r3, lsl #2 | |
1027 mov lr, r9 | |
1028 b put_h264_qpel8_hv_lowpass_l2_neon | |
1029 .endfunc | |
1030 | |
/*
 * ff_put_h264_qpel8_mc10_neon: 8x8 block, horizontal filter result
 * averaged with the unshifted source pixels.
 * In: r0 = dst, r1 = src, r2 = stride.
 * r3 is first used as scratch for the constant load, then set to src (the
 * second source); the filter input starts 2 pixels to the left of src.
 * Tail-calls the 8-row "l2" horizontal routine.
 */
1031 function ff_put_h264_qpel8_mc10_neon, export=1 | |
1032 lowpass_const r3 | |
1033 mov r3, r1 | |
1034 sub r1, r1, #2 | |
1035 mov ip, #8 | |
1036 b put_h264_qpel8_h_lowpass_l2_neon | |
1037 .endfunc | |
1038 | |
/*
 * ff_put_h264_qpel8_mc20_neon: 8x8 block, horizontal filter only.
 * In: r0 = dst, r1 = src, r2 = stride.
 * The filter input starts 2 pixels to the left of src; r3 (scratch for
 * the constant load) then becomes the dst stride, equal to r2.
 * Tail-calls the 8-row horizontal routine.
 */
1039 function ff_put_h264_qpel8_mc20_neon, export=1 | |
1040 lowpass_const r3 | |
1041 sub r1, r1, #2 | |
1042 mov r3, r2 | |
1043 mov ip, #8 | |
1044 b put_h264_qpel8_h_lowpass_neon | |
1045 .endfunc | |
1046 | |
/*
 * ff_put_h264_qpel8_mc30_neon: 8x8 block, horizontal filter result
 * averaged with the source pixels one column to the right.
 * In: r0 = dst, r1 = src, r2 = stride.
 * r3 = src + 1 is the second source; the filter input starts 2 pixels to
 * the left of src.  Tail-calls the 8-row "l2" horizontal routine.
 */
1047 function ff_put_h264_qpel8_mc30_neon, export=1 | |
1048 lowpass_const r3 | |
1049 add r3, r1, #1 | |
1050 sub r1, r1, #2 | |
1051 mov ip, #8 | |
1052 b put_h264_qpel8_h_lowpass_l2_neon | |
1053 .endfunc | |
1054 | |
@ ff_put_h264_qpel8_mc01_neon (exported)
@ Calls put_h264_qpel8_v_lowpass_l2_neon with ip = incoming r1, r3 = r2 and
@ r1 moved up by two rows (r1 -= 2*r2).
@ The label put_h264_qpel8_mc01 is a shared body: the mc03 entry point
@ branches here after pushing lr and setting ip = r1 + r2 itself.
@ d8-d15 are saved and restored around the call.
1055 function ff_put_h264_qpel8_mc01_neon, export=1 | |
1056 push {lr} | |
1057 mov ip, r1 | |
1058 put_h264_qpel8_mc01: | |
1059 lowpass_const r3 | |
1060 mov r3, r2 | |
1061 sub r1, r1, r2, lsl #1 | |
1062 vpush {d8-d15} | |
1063 bl put_h264_qpel8_v_lowpass_l2_neon | |
1064 vpop {d8-d15} | |
1065 pop {pc} | |
1066 .endfunc | |
1067 | |
@ ff_put_h264_qpel8_mc11_neon (exported)
@ Two-pass routine using a 64-byte stack temporary:
@   1. put_h264_qpel8_h_lowpass_neon writes to the temporary
@      (r0 = sp, r3 = 8, ip = 8, r1 -= 2);
@   2. put_h264_qpel8_v_lowpass_l2_neon is called with the saved r0/r1
@      reloaded, ip pointing at the temporary and r2 = 8.
@ Stack layout after vpush, low to high: 64 bytes d8-d15, 64 bytes
@ temporary, then the pushed r0, r1, r2, lr.
@ The label put_h264_qpel8_mc11 is shared with mc31/mc13/mc33, which push
@ the same register list themselves and then branch here with a modified r1
@ and/or a modified saved r1.
@ r2 is read after the first call without being reloaded, so the code
@ relies on put_h264_qpel8_h_lowpass_neon leaving r2 intact.
1068 function ff_put_h264_qpel8_mc11_neon, export=1 | |
1069 push {r0, r1, r2, lr} | |
1070 put_h264_qpel8_mc11: | |
1071 lowpass_const r3 | |
1072 sub sp, sp, #64 | |
1073 mov r0, sp | |
1074 sub r1, r1, #2 | |
1075 mov r3, #8 | |
1076 mov ip, #8 | |
1077 vpush {d8-d15} | |
1078 bl put_h264_qpel8_h_lowpass_neon | |
@ sp + 128 = past the 64-byte vpush area and the 64-byte temporary: saved r0/r1
1079 ldrd r0, [sp, #128] | |
1080 mov r3, r2 | |
@ ip = start of the temporary written by the first pass
1081 add ip, sp, #64 | |
1082 sub r1, r1, r2, lsl #1 | |
1083 mov r2, #8 | |
1084 bl put_h264_qpel8_v_lowpass_l2_neon | |
1085 vpop {d8-d15} | |
@ drop the 64-byte temporary plus the three saved words r0, r1, r2 (12 bytes)
1086 add sp, sp, #76 | |
1087 pop {pc} | |
1088 .endfunc | |
1089 | |
@ ff_put_h264_qpel8_mc21_neon (exported)
@ Two-pass routine on a 16-byte-aligned stack area of 8*8 + 16*12 bytes:
@   1. put_h264_qpel8_h_lowpass_neon writes to the start of that area
@      (r0 = sp, r3 = 8, ip = 8, r1 -= 2);
@   2. put_h264_qpel8_hv_lowpass_l2_neon is called with the saved r0/r1
@      reloaded, r1 moved up two rows and left two bytes, r3 = r2, r4 = value
@      of r0 after the first pass and r2 = r4 - 64.
@ r11 holds the unaligned sp so the frame can be unwound.
@ NOTE(review): presumably the first pass leaves r0 64 bytes past the start
@ of its output, making r2 the start of that output and r4 the scratch area
@ behind it - confirm against the callee.
@ The label put_h264_qpel8_mc21 is shared with the mc23 entry point.
1090 function ff_put_h264_qpel8_mc21_neon, export=1 | |
1091 push {r0, r1, r4, r10, r11, lr} | |
1092 put_h264_qpel8_mc21: | |
1093 lowpass_const r3 | |
1094 mov r11, sp | |
1095 bic sp, sp, #15 | |
1096 sub sp, sp, #(8*8+16*12) | |
1097 sub r1, r1, #2 | |
1098 mov r3, #8 | |
1099 mov r0, sp | |
1100 mov ip, #8 | |
1101 vpush {d8-d15} | |
1102 bl put_h264_qpel8_h_lowpass_neon | |
1103 mov r4, r0 | |
@ r11 still points at the pushed r0, r1
1104 ldrd r0, [r11] | |
1105 sub r1, r1, r2, lsl #1 | |
1106 sub r1, r1, #2 | |
1107 mov r3, r2 | |
1108 sub r2, r4, #64 | |
1109 bl put_h264_qpel8_hv_lowpass_l2_neon | |
1110 vpop {d8-d15} | |
@ discard the frame and skip the saved r0, r1 (8 bytes)
1111 add sp, r11, #8 | |
1112 pop {r4, r10, r11, pc} | |
1113 .endfunc | |
1114 | |
@ ff_put_h264_qpel8_mc31_neon (exported)
@ Enters the shared put_h264_qpel8_mc11 body with the *saved* r1 one byte
@ to the right: r1 is incremented before the push and decremented again
@ afterwards, so the live r1 is unchanged while the stacked copy is r1 + 1.
@ The register list pushed here must match the one at the mc11 entry point.
1115 function ff_put_h264_qpel8_mc31_neon, export=1 | |
1116 add r1, r1, #1 | |
1117 push {r0, r1, r2, lr} | |
1118 sub r1, r1, #1 | |
1119 b put_h264_qpel8_mc11 | |
1120 .endfunc | |
1121 | |
@ ff_put_h264_qpel8_mc02_neon (exported)
@ Calls put_h264_qpel8_v_lowpass_neon with r1 moved up two rows
@ (r1 -= 2*r2) and r3 = r2.  d8-d15 are saved and restored around the call.
1122 function ff_put_h264_qpel8_mc02_neon, export=1 | |
1123 push {lr} | |
1124 lowpass_const r3 | |
1125 sub r1, r1, r2, lsl #1 | |
1126 mov r3, r2 | |
1127 vpush {d8-d15} | |
1128 bl put_h264_qpel8_v_lowpass_neon | |
1129 vpop {d8-d15} | |
1130 pop {pc} | |
1131 .endfunc | |
1132 | |
@ ff_put_h264_qpel8_mc12_neon (exported)
@ Two-pass routine on a 16-byte-aligned stack area of 8*8 + 16*12 bytes:
@   1. put_h264_qpel8_v_lowpass_neon writes to the start of that area
@      (r0 = sp, r2 = 8, r3 = incoming r2, r1 -= 2*r2);
@   2. put_h264_qpel8_hv_lowpass_l2_neon is called with the saved r0/r1
@      reloaded, r1 moved up two rows and left two bytes, r4 = value of r0
@      after the first pass and r2 = r4 - 64.
@ r3 is read after the first call without being reloaded, so the code relies
@ on put_h264_qpel8_v_lowpass_neon leaving r3 intact.
@ The label put_h264_qpel8_mc12 is shared with the mc32 entry point.
1133 function ff_put_h264_qpel8_mc12_neon, export=1 | |
1134 push {r0, r1, r4, r10, r11, lr} | |
1135 put_h264_qpel8_mc12: | |
1136 lowpass_const r3 | |
1137 mov r11, sp | |
1138 bic sp, sp, #15 | |
1139 sub sp, sp, #(8*8+16*12) | |
1140 sub r1, r1, r2, lsl #1 | |
1141 mov r3, r2 | |
1142 mov r2, #8 | |
1143 mov r0, sp | |
1144 vpush {d8-d15} | |
1145 bl put_h264_qpel8_v_lowpass_neon | |
1146 mov r4, r0 | |
@ r11 still points at the pushed r0, r1
1147 ldrd r0, [r11] | |
1148 sub r1, r1, r3, lsl #1 | |
1149 sub r1, r1, #2 | |
1150 sub r2, r4, #64 | |
1151 bl put_h264_qpel8_hv_lowpass_l2_neon | |
1152 vpop {d8-d15} | |
@ discard the frame and skip the saved r0, r1 (8 bytes)
1153 add sp, r11, #8 | |
1154 pop {r4, r10, r11, pc} | |
1155 .endfunc | |
1156 | |
@ ff_put_h264_qpel8_mc22_neon (exported)
@ Calls put_h264_qpel8_hv_lowpass_neon with r1 moved up two rows and left
@ two bytes, r3 = r2, and r4 pointing at a 16-byte-aligned 16*12-byte stack
@ scratch area.  r11 keeps the original sp for the epilogue.
@ No lowpass_const is issued here, unlike the other entry points.
@ NOTE(review): presumably the callee loads the constant itself - confirm.
1157 function ff_put_h264_qpel8_mc22_neon, export=1 | |
1158 push {r4, r10, r11, lr} | |
1159 mov r11, sp | |
1160 bic sp, sp, #15 | |
1161 sub r1, r1, r2, lsl #1 | |
1162 sub r1, r1, #2 | |
1163 mov r3, r2 | |
1164 sub sp, sp, #(16*12) | |
1165 mov r4, sp | |
1166 vpush {d8-d15} | |
1167 bl put_h264_qpel8_hv_lowpass_neon | |
1168 vpop {d8-d15} | |
1169 mov sp, r11 | |
1170 pop {r4, r10, r11, pc} | |
1171 .endfunc | |
1172 | |
@ ff_put_h264_qpel8_mc32_neon (exported)
@ Enters the shared put_h264_qpel8_mc12 body with the live r1 one byte to
@ the right; the stacked r1 is the unmodified incoming value because the
@ push happens first.  The register list must match the mc12 entry point.
1173 function ff_put_h264_qpel8_mc32_neon, export=1 | |
1174 push {r0, r1, r4, r10, r11, lr} | |
1175 add r1, r1, #1 | |
1176 b put_h264_qpel8_mc12 | |
1177 .endfunc | |
1178 | |
@ ff_put_h264_qpel8_mc03_neon (exported)
@ Enters the shared put_h264_qpel8_mc01 body with ip = r1 + r2 (one row
@ below r1) instead of ip = r1.  The push must match the mc01 entry point.
1179 function ff_put_h264_qpel8_mc03_neon, export=1 | |
1180 push {lr} | |
1181 add ip, r1, r2 | |
1182 b put_h264_qpel8_mc01 | |
1183 .endfunc | |
1184 | |
@ ff_put_h264_qpel8_mc13_neon (exported)
@ Enters the shared put_h264_qpel8_mc11 body with the live r1 advanced by
@ one row (r1 += r2); the stacked r1 is the unmodified incoming value.
1185 function ff_put_h264_qpel8_mc13_neon, export=1 | |
1186 push {r0, r1, r2, lr} | |
1187 add r1, r1, r2 | |
1188 b put_h264_qpel8_mc11 | |
1189 .endfunc | |
1190 | |
@ ff_put_h264_qpel8_mc23_neon (exported)
@ Enters the shared put_h264_qpel8_mc21 body with the live r1 advanced by
@ one row (r1 += r2); the stacked r1 is the unmodified incoming value.
1191 function ff_put_h264_qpel8_mc23_neon, export=1 | |
1192 push {r0, r1, r4, r10, r11, lr} | |
1193 add r1, r1, r2 | |
1194 b put_h264_qpel8_mc21 | |
1195 .endfunc | |
1196 | |
@ ff_put_h264_qpel8_mc33_neon (exported)
@ Enters the shared put_h264_qpel8_mc11 body with
@   stacked r1 = incoming r1 + 1   (incremented before the push)
@   live r1    = incoming r1 + r2  (row added, then the +1 undone).
1197 function ff_put_h264_qpel8_mc33_neon, export=1 | |
1198 add r1, r1, #1 | |
1199 push {r0, r1, r2, lr} | |
1200 add r1, r1, r2 | |
1201 sub r1, r1, #1 | |
1202 b put_h264_qpel8_mc11 | |
1203 .endfunc | |
1204 | |
@ ff_put_h264_qpel16_mc10_neon (exported)
@ Register setup followed by a tail call to put_h264_qpel16_h_lowpass_l2_neon:
@   r3 = incoming r1, r1 -= 2.
@ Unlike the 8x8 entry point, ip is not set here.
1205 function ff_put_h264_qpel16_mc10_neon, export=1 | |
1206 lowpass_const r3 | |
1207 mov r3, r1 | |
1208 sub r1, r1, #2 | |
1209 b put_h264_qpel16_h_lowpass_l2_neon | |
1210 .endfunc | |
1211 | |
@ ff_put_h264_qpel16_mc20_neon (exported)
@ Register setup followed by a tail call to put_h264_qpel16_h_lowpass_neon:
@   r1 -= 2, r3 = r2.
1212 function ff_put_h264_qpel16_mc20_neon, export=1 | |
1213 lowpass_const r3 | |
1214 sub r1, r1, #2 | |
1215 mov r3, r2 | |
1216 b put_h264_qpel16_h_lowpass_neon | |
1217 .endfunc | |
1218 | |
@ ff_put_h264_qpel16_mc30_neon (exported)
@ Same shape as the 16x16 mc10 entry point, except that r3 is set to the
@ incoming r1 plus one byte:  r3 = r1 + 1, r1 -= 2, then a tail call to
@ put_h264_qpel16_h_lowpass_l2_neon.
1219 function ff_put_h264_qpel16_mc30_neon, export=1 | |
1220 lowpass_const r3 | |
1221 add r3, r1, #1 | |
1222 sub r1, r1, #2 | |
1223 b put_h264_qpel16_h_lowpass_l2_neon | |
1224 .endfunc | |
1225 | |
@ ff_put_h264_qpel16_mc01_neon (exported)
@ Calls put_h264_qpel16_v_lowpass_l2_neon with ip = incoming r1, r3 = r2
@ and r1 moved up two rows (r1 -= 2*r2).  r4 is saved here.
@ NOTE(review): presumably r4 is saved because the callee uses it - confirm.
@ The label put_h264_qpel16_mc01 is shared with the mc03 entry point, which
@ pushes the same registers and sets ip = r1 + r2.
1226 function ff_put_h264_qpel16_mc01_neon, export=1 | |
1227 push {r4, lr} | |
1228 mov ip, r1 | |
1229 put_h264_qpel16_mc01: | |
1230 lowpass_const r3 | |
1231 mov r3, r2 | |
1232 sub r1, r1, r2, lsl #1 | |
1233 vpush {d8-d15} | |
1234 bl put_h264_qpel16_v_lowpass_l2_neon | |
1235 vpop {d8-d15} | |
1236 pop {r4, pc} | |
1237 .endfunc | |
1238 | |
@ ff_put_h264_qpel16_mc11_neon (exported)
@ Two-pass routine using a 256-byte stack temporary:
@   1. put_h264_qpel16_h_lowpass_neon writes to the temporary
@      (r0 = sp, r3 = 16, r1 -= 2);
@   2. put_h264_qpel16_v_lowpass_l2_neon is called with the saved r0/r1
@      reloaded, ip pointing at the temporary and r2 = 16.
@ Stack layout after vpush, low to high: 64 bytes d8-d15, 256 bytes
@ temporary, then the pushed r0, r1, r4, lr.
@ The label put_h264_qpel16_mc11 is shared with mc31/mc13/mc33.
@ r2 is read after the first call without being reloaded, so the code
@ relies on put_h264_qpel16_h_lowpass_neon leaving r2 intact.
1239 function ff_put_h264_qpel16_mc11_neon, export=1 | |
1240 push {r0, r1, r4, lr} | |
1241 put_h264_qpel16_mc11: | |
1242 lowpass_const r3 | |
1243 sub sp, sp, #256 | |
1244 mov r0, sp | |
1245 sub r1, r1, #2 | |
1246 mov r3, #16 | |
1247 vpush {d8-d15} | |
1248 bl put_h264_qpel16_h_lowpass_neon | |
@ saved r0/r1 live at sp + 64 + 256; the address is formed in two steps,
@ presumably because ldrd cannot encode an offset that large
1249 add r0, sp, #256 | |
1250 ldrd r0, [r0, #64] | |
1251 mov r3, r2 | |
@ ip = start of the temporary written by the first pass
1252 add ip, sp, #64 | |
1253 sub r1, r1, r2, lsl #1 | |
1254 mov r2, #16 | |
1255 bl put_h264_qpel16_v_lowpass_l2_neon | |
1256 vpop {d8-d15} | |
@ drop the 256-byte temporary plus the saved r0, r1 (8 bytes)
1257 add sp, sp, #(256+8) | |
1258 pop {r4, pc} | |
1259 .endfunc | |
1260 | |
@ ff_put_h264_qpel16_mc21_neon (exported)
@ Two-pass routine on a 16-byte-aligned stack area of 16*16 + 16*12 bytes:
@   1. put_h264_qpel16_h_lowpass_neon_packed writes to the start of that
@      area (r0 = sp, r1 -= 2);
@   2. put_h264_qpel16_hv_lowpass_l2_neon is called with the saved r0/r1
@      reloaded, r1 moved up two rows and left two bytes, r3 = r2 and
@      r4 = value of r0 after the first pass (the callee derives its r2
@      from r4 - 256).
@ r11 holds the unaligned sp; r9 is saved because the 16x16 hv wrapper uses
@ it to hold the return address.
@ The label put_h264_qpel16_mc21 is shared with the mc23 entry point.
1261 function ff_put_h264_qpel16_mc21_neon, export=1 | |
1262 push {r0, r1, r4-r5, r9-r11, lr} | |
1263 put_h264_qpel16_mc21: | |
1264 lowpass_const r3 | |
1265 mov r11, sp | |
1266 bic sp, sp, #15 | |
1267 sub sp, sp, #(16*16+16*12) | |
1268 sub r1, r1, #2 | |
1269 mov r0, sp | |
1270 vpush {d8-d15} | |
1271 bl put_h264_qpel16_h_lowpass_neon_packed | |
1272 mov r4, r0 | |
@ r11 still points at the pushed r0, r1
1273 ldrd r0, [r11] | |
1274 sub r1, r1, r2, lsl #1 | |
1275 sub r1, r1, #2 | |
1276 mov r3, r2 | |
1277 bl put_h264_qpel16_hv_lowpass_l2_neon | |
1278 vpop {d8-d15} | |
@ discard the frame and skip the saved r0, r1 (8 bytes)
1279 add sp, r11, #8 | |
1280 pop {r4-r5, r9-r11, pc} | |
1281 .endfunc | |
1282 | |
@ ff_put_h264_qpel16_mc31_neon (exported)
@ Enters the shared put_h264_qpel16_mc11 body with the *saved* r1 one byte
@ to the right: r1 is incremented before the push and decremented again
@ afterwards, so the live r1 is unchanged while the stacked copy is r1 + 1.
1283 function ff_put_h264_qpel16_mc31_neon, export=1 | |
1284 add r1, r1, #1 | |
1285 push {r0, r1, r4, lr} | |
1286 sub r1, r1, #1 | |
1287 b put_h264_qpel16_mc11 | |
1288 .endfunc | |
1289 | |
@ ff_put_h264_qpel16_mc02_neon (exported)
@ Calls put_h264_qpel16_v_lowpass_neon with r1 moved up two rows
@ (r1 -= 2*r2) and r3 = r2.  r4 and d8-d15 are saved and restored.
1290 function ff_put_h264_qpel16_mc02_neon, export=1 | |
1291 push {r4, lr} | |
1292 lowpass_const r3 | |
1293 sub r1, r1, r2, lsl #1 | |
1294 mov r3, r2 | |
1295 vpush {d8-d15} | |
1296 bl put_h264_qpel16_v_lowpass_neon | |
1297 vpop {d8-d15} | |
1298 pop {r4, pc} | |
1299 .endfunc | |
1300 | |
@ ff_put_h264_qpel16_mc12_neon (exported)
@ Two-pass routine on a 16-byte-aligned stack area of 16*16 + 16*12 bytes:
@   1. put_h264_qpel16_v_lowpass_neon_packed writes to the start of that
@      area (r0 = sp, r3 = incoming r2, r1 -= 2*r2);
@   2. put_h264_qpel16_hv_lowpass_l2_neon is called with the saved r0/r1
@      reloaded, r1 moved up two rows and left two bytes, r2 = r3 and
@      r4 = value of r0 after the first pass.
@ r3 is read after the first call without being reloaded, so the code relies
@ on the packed v_lowpass routine leaving r3 intact.
@ The label put_h264_qpel16_mc12 is shared with the mc32 entry point.
1301 function ff_put_h264_qpel16_mc12_neon, export=1 | |
1302 push {r0, r1, r4-r5, r9-r11, lr} | |
1303 put_h264_qpel16_mc12: | |
1304 lowpass_const r3 | |
1305 mov r11, sp | |
1306 bic sp, sp, #15 | |
1307 sub sp, sp, #(16*16+16*12) | |
1308 sub r1, r1, r2, lsl #1 | |
1309 mov r0, sp | |
1310 mov r3, r2 | |
1311 vpush {d8-d15} | |
1312 bl put_h264_qpel16_v_lowpass_neon_packed | |
1313 mov r4, r0 | |
@ r11 still points at the pushed r0, r1
1314 ldrd r0, [r11] | |
1315 sub r1, r1, r3, lsl #1 | |
1316 sub r1, r1, #2 | |
1317 mov r2, r3 | |
1318 bl put_h264_qpel16_hv_lowpass_l2_neon | |
1319 vpop {d8-d15} | |
@ discard the frame and skip the saved r0, r1 (8 bytes)
1320 add sp, r11, #8 | |
1321 pop {r4-r5, r9-r11, pc} | |
1322 .endfunc | |
1323 | |
@ ff_put_h264_qpel16_mc22_neon (exported)
@ Calls put_h264_qpel16_hv_lowpass_neon with r1 moved up two rows and left
@ two bytes, r3 = r2, and r4 pointing at a 16-byte-aligned 16*12-byte stack
@ scratch area.  r11 keeps the original sp for the epilogue; r9 is saved
@ because the 16x16 hv wrapper uses it to hold the return address.
1324 function ff_put_h264_qpel16_mc22_neon, export=1 | |
1325 push {r4, r9-r11, lr} | |
1326 lowpass_const r3 | |
1327 mov r11, sp | |
1328 bic sp, sp, #15 | |
1329 sub r1, r1, r2, lsl #1 | |
1330 sub r1, r1, #2 | |
1331 mov r3, r2 | |
1332 sub sp, sp, #(16*12) | |
1333 mov r4, sp | |
1334 vpush {d8-d15} | |
1335 bl put_h264_qpel16_hv_lowpass_neon | |
1336 vpop {d8-d15} | |
1337 mov sp, r11 | |
1338 pop {r4, r9-r11, pc} | |
1339 .endfunc | |
1340 | |
@ ff_put_h264_qpel16_mc32_neon (exported)
@ Enters the shared put_h264_qpel16_mc12 body with the live r1 one byte to
@ the right; the stacked r1 is the unmodified incoming value because the
@ push happens first.
1341 function ff_put_h264_qpel16_mc32_neon, export=1 | |
1342 push {r0, r1, r4-r5, r9-r11, lr} | |
1343 add r1, r1, #1 | |
1344 b put_h264_qpel16_mc12 | |
1345 .endfunc | |
1346 | |
@ ff_put_h264_qpel16_mc03_neon (exported)
@ Enters the shared put_h264_qpel16_mc01 body with ip = r1 + r2 (one row
@ below r1) instead of ip = r1.
1347 function ff_put_h264_qpel16_mc03_neon, export=1 | |
1348 push {r4, lr} | |
1349 add ip, r1, r2 | |
1350 b put_h264_qpel16_mc01 | |
1351 .endfunc | |
1352 | |
@ ff_put_h264_qpel16_mc13_neon (exported)
@ Enters the shared put_h264_qpel16_mc11 body with the live r1 advanced by
@ one row (r1 += r2); the stacked r1 is the unmodified incoming value.
1353 function ff_put_h264_qpel16_mc13_neon, export=1 | |
1354 push {r0, r1, r4, lr} | |
1355 add r1, r1, r2 | |
1356 b put_h264_qpel16_mc11 | |
1357 .endfunc | |
1358 | |
@ ff_put_h264_qpel16_mc23_neon (exported)
@ Enters the shared put_h264_qpel16_mc21 body with the live r1 advanced by
@ one row (r1 += r2); the stacked r1 is the unmodified incoming value.
1359 function ff_put_h264_qpel16_mc23_neon, export=1 | |
1360 push {r0, r1, r4-r5, r9-r11, lr} | |
1361 add r1, r1, r2 | |
1362 b put_h264_qpel16_mc21 | |
1363 .endfunc | |
1364 | |
@ ff_put_h264_qpel16_mc33_neon (exported)
@ Enters the shared put_h264_qpel16_mc11 body with
@   stacked r1 = incoming r1 + 1   (incremented before the push)
@   live r1    = incoming r1 + r2  (row added, then the +1 undone).
1365 function ff_put_h264_qpel16_mc33_neon, export=1 | |
1366 add r1, r1, #1 | |
1367 push {r0, r1, r4, lr} | |
1368 add r1, r1, r2 | |
1369 sub r1, r1, #1 | |
1370 b put_h264_qpel16_mc11 | |
1371 .endfunc | |
8663 | 1372 |
1373 @ Biweighted prediction | |
1374 | |
@ biweight_16 macs, macd
@ Loop body and epilogue for 16-byte-wide bi-weighted rows; expanded inside
@ biweight_func, which prepares the registers:
@   r0, r1  row pointers of the two inputs, both stepped by r2 per row
@   r6      output pointer, stepped by r2 per row
@   r4, r5  byte weights, broadcast into d0 and d1
@   q8      16-bit initial value of every accumulator
@   q9      per-lane shift count applied with vshl.s16
@   ip      row counter, decremented by 2 per iteration
@ \macd is the widening multiply-accumulate applied to r0 data with d0,
@ \macs the one applied to r1 data with d1.  Two rows are processed per
@ iteration; results are narrowed with unsigned saturation before storing.
@ The macro ends by popping {r4-r6, pc}, i.e. it returns from the function.
1375 .macro biweight_16 macs, macd | |
1376 vdup.8 d0, r4 | |
1377 vdup.8 d1, r5 | |
1378 vmov q2, q8 | |
1379 vmov q3, q8 | |
1380 1: subs ip, ip, #2 | |
1381 vld1.8 {d20-d21},[r0,:128], r2 | |
1382 \macd q2, d0, d20 | |
1383 pld [r0] | |
1384 \macd q3, d0, d21 | |
1385 vld1.8 {d22-d23},[r1,:128], r2 | |
1386 \macs q2, d1, d22 | |
1387 pld [r1] | |
1388 \macs q3, d1, d23 | |
1389 vmov q12, q8 | |
1390 vld1.8 {d28-d29},[r0,:128], r2 | |
1391 vmov q13, q8 | |
1392 \macd q12, d0, d28 | |
1393 pld [r0] | |
1394 \macd q13, d0, d29 | |
1395 vld1.8 {d30-d31},[r1,:128], r2 | |
1396 \macs q12, d1, d30 | |
1397 pld [r1] | |
1398 \macs q13, d1, d31 | |
1399 vshl.s16 q2, q2, q9 | |
1400 vshl.s16 q3, q3, q9 | |
1401 vqmovun.s16 d4, q2 | |
1402 vqmovun.s16 d5, q3 | |
1403 vshl.s16 q12, q12, q9 | |
1404 vshl.s16 q13, q13, q9 | |
1405 vqmovun.s16 d24, q12 | |
1406 vqmovun.s16 d25, q13 | |
@ q3/q2 are re-seeded for the next iteration; q2 only after d4-d5 is stored
1407 vmov q3, q8 | |
1408 vst1.8 {d4- d5}, [r6,:128], r2 | |
1409 vmov q2, q8 | |
1410 vst1.8 {d24-d25},[r6,:128], r2 | |
@ flags still come from the subs at label 1; nothing in between sets them
1411 bne 1b | |
1412 pop {r4-r6, pc} | |
1413 .endm | |
1414 | |
@ biweight_8 macs, macd
@ 8-byte-wide counterpart of biweight_16, with the same register contract:
@ r0/r1 inputs stepped by r2, r6 output, r4/r5 weights broadcast into
@ d0/d1, q8 accumulator seed, q9 shift counts, ip row counter decremented
@ by 2 per iteration.  \macd is applied to r0 data, \macs to r1 data.
@ Ends by popping {r4-r6, pc}, i.e. it returns from the function.
1415 .macro biweight_8 macs, macd | |
1416 vdup.8 d0, r4 | |
1417 vdup.8 d1, r5 | |
1418 vmov q1, q8 | |
1419 vmov q10, q8 | |
1420 1: subs ip, ip, #2 | |
1421 vld1.8 {d4},[r0,:64], r2 | |
1422 \macd q1, d0, d4 | |
1423 pld [r0] | |
1424 vld1.8 {d5},[r1,:64], r2 | |
1425 \macs q1, d1, d5 | |
1426 pld [r1] | |
1427 vld1.8 {d6},[r0,:64], r2 | |
1428 \macd q10, d0, d6 | |
1429 pld [r0] | |
1430 vld1.8 {d7},[r1,:64], r2 | |
1431 \macs q10, d1, d7 | |
1432 pld [r1] | |
1433 vshl.s16 q1, q1, q9 | |
1434 vqmovun.s16 d2, q1 | |
1435 vshl.s16 q10, q10, q9 | |
1436 vqmovun.s16 d4, q10 | |
@ q10 can be re-seeded at once (its result is already in d4); q1 aliases
@ d2-d3 and is re-seeded only after d2 has been stored
1437 vmov q10, q8 | |
1438 vst1.8 {d2},[r6,:64], r2 | |
1439 vmov q1, q8 | |
1440 vst1.8 {d4},[r6,:64], r2 | |
@ flags still come from the subs at label 1
1441 bne 1b | |
1442 pop {r4-r6, pc} | |
1443 .endm | |
1444 | |
@ biweight_4 macs, macd
@ 4-byte-wide counterpart of biweight_16/biweight_8, with the same register
@ contract.  Two 4-byte rows are packed into one d register, so an
@ iteration handles four rows and ip is decremented by 4.
@ If the subtraction goes negative (fewer than four rows were left) the code
@ branches to label 2 after the first row pair and stores only two rows.
@ Both exits pop {r4-r6, pc}, i.e. they return from the function.
1445 .macro biweight_4 macs, macd | |
1446 vdup.8 d0, r4 | |
1447 vdup.8 d1, r5 | |
1448 vmov q1, q8 | |
1449 vmov q10, q8 | |
1450 1: subs ip, ip, #4 | |
1451 vld1.32 {d4[0]},[r0,:32], r2 | |
1452 vld1.32 {d4[1]},[r0,:32], r2 | |
1453 \macd q1, d0, d4 | |
1454 pld [r0] | |
1455 vld1.32 {d5[0]},[r1,:32], r2 | |
1456 vld1.32 {d5[1]},[r1,:32], r2 | |
1457 \macs q1, d1, d5 | |
1458 pld [r1] | |
@ flags from the subs above: negative means only one row pair remained
1459 blt 2f | |
1460 vld1.32 {d6[0]},[r0,:32], r2 | |
1461 vld1.32 {d6[1]},[r0,:32], r2 | |
1462 \macd q10, d0, d6 | |
1463 pld [r0] | |
1464 vld1.32 {d7[0]},[r1,:32], r2 | |
1465 vld1.32 {d7[1]},[r1,:32], r2 | |
1466 \macs q10, d1, d7 | |
1467 pld [r1] | |
1468 vshl.s16 q1, q1, q9 | |
1469 vqmovun.s16 d2, q1 | |
1470 vshl.s16 q10, q10, q9 | |
1471 vqmovun.s16 d4, q10 | |
1472 vmov q10, q8 | |
1473 vst1.32 {d2[0]},[r6,:32], r2 | |
1474 vst1.32 {d2[1]},[r6,:32], r2 | |
1475 vmov q1, q8 | |
1476 vst1.32 {d4[0]},[r6,:32], r2 | |
1477 vst1.32 {d4[1]},[r6,:32], r2 | |
1478 bne 1b | |
1479 pop {r4-r6, pc} | |
@ short tail: finish and store the single row pair accumulated in q1
1480 2: vshl.s16 q1, q1, q9 | |
1481 vqmovun.s16 d2, q1 | |
1482 vst1.32 {d2[0]},[r6,:32], r2 | |
1483 vst1.32 {d2[1]},[r6,:32], r2 | |
1484 pop {r4-r6, pc} | |
1485 .endm | |
1486 | |
@ biweight_func w
@ Emits biweight_h264_pixels_<w>_neon, the shared body behind the
@ ff_biweight_h264_pixels_<w>x<h>_neon entry points (which set ip = h).
@ Inputs as used below:
@   r0, r1   the two pixel blocks; r0 is also copied to r6 as the output
@   r2       row stride
@   r3       shift amount
@   [sp..]   three stack words, loaded into r4, r5, r6
@ NOTE(review): presumably r3 = log2_denom and the stack words are weightd,
@ weights and offset - confirm against the C prototype.
@ Setup:
@   q8 = ((r6 + 1) | 1) << r3           accumulator seed
@   q9 = ~r3 = -(r3 + 1) in every lane  => vshl.s16 shifts right by r3 + 1
@ The NEON multiplies are unsigned, so negative weights are negated and the
@ accumulate is turned into a subtract.  lr = (r4 >> 31) ^ (r5 >> 30) picks
@ one of four expansions of biweight_<w>:
@   0 (10:) both weights used as-is
@   1 (20:) r4 negated, its term subtracted
@   2 (30:) both negated, both terms subtracted
@   3 (40:) r5 negated, its term subtracted
@ This mapping assumes the top two bits of r5 are equal.
@ Each expansion ends in its own pop {r4-r6, pc}, so nothing falls through
@ from one case into the next.
1487 .macro biweight_func w | |
1488 function biweight_h264_pixels_\w\()_neon | |
1489 push {r4-r6, lr} | |
@ sp + 16 skips the four registers just pushed
1490 add r4, sp, #16 | |
1491 ldm r4, {r4-r6} | |
1492 lsr lr, r4, #31 | |
1493 add r6, r6, #1 | |
@ the flags set here are consumed by the beq at 10f below; none of the
@ instructions in between set flags
1494 eors lr, lr, r5, lsr #30 | |
1495 orr r6, r6, #1 | |
1496 vdup.16 q9, r3 | |
1497 lsl r6, r6, r3 | |
1498 vmvn q9, q9 | |
1499 vdup.16 q8, r6 | |
1500 mov r6, r0 | |
1501 beq 10f | |
1502 subs lr, lr, #1 | |
1503 beq 20f | |
1504 subs lr, lr, #1 | |
1505 beq 30f | |
1506 b 40f | |
1507 10: biweight_\w vmlal.u8, vmlal.u8 | |
1508 20: rsb r4, r4, #0 | |
1509 biweight_\w vmlal.u8, vmlsl.u8 | |
1510 30: rsb r4, r4, #0 | |
1511 rsb r5, r5, #0 | |
1512 biweight_\w vmlsl.u8, vmlsl.u8 | |
1513 40: rsb r5, r5, #0 | |
1514 biweight_\w vmlsl.u8, vmlal.u8 | |
1515 .endfunc | |
1516 .endm | |
1517 | |
@ biweight_entry w, h, b=1
@ Emits the exported entry point ff_biweight_h264_pixels_<w>x<h>_neon, which
@ loads the row count h into ip.  With b non-zero (the default) it then
@ branches to biweight_h264_pixels_<w>_neon.  With b=0 no branch is emitted,
@ so the entry point runs into whatever is assembled next: it must be
@ instantiated directly before the matching biweight_func.
1518 .macro biweight_entry w, h, b=1 | |
1519 function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1 | |
1520 mov ip, #\h | |
1521 .if \b | |
1522 b biweight_h264_pixels_\w\()_neon | |
1523 .endif | |
1524 .endfunc | |
1525 .endm | |
1526 | |
@ Instantiate the bi-weighted prediction entry points.  Order matters: in
@ each group the b=0 entry emits no branch and therefore has to sit directly
@ before the biweight_func expansion it falls through into.
1527 biweight_entry 16, 8 | |
1528 biweight_entry 16, 16, b=0 | |
1529 biweight_func 16 | |
1530 | |
1531 biweight_entry 8, 16 | |
1532 biweight_entry 8, 4 | |
1533 biweight_entry 8, 8, b=0 | |
1534 biweight_func 8 | |
1535 | |
1536 biweight_entry 4, 8 | |
1537 biweight_entry 4, 2 | |
1538 biweight_entry 4, 4, b=0 | |
1539 biweight_func 4 | |
8664 | 1540 |
1541 @ Weighted prediction | |
1542 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
@ weight_16 add
@ Loop body and epilogue for 16-byte-wide weighted rows; expanded inside
@ weight_func, which prepares the registers:
@   r0   input pointer, stepped by r1 per row
@   r4   output pointer (a copy of the original r0), stepped by r1
@   r3   byte weight, broadcast into d0
@   q8   16-bit value combined with every product
@   q9   per-lane shift count for the rounding shift vrshl.s16
@   ip   row counter, decremented by 2 per iteration
@ \add is the 16-bit instruction combining q8 with each product, with q8 as
@ the first operand (weight_func passes vadd/vsub/vhadd/vhsub).  Each
@ product is a fresh vmull.u8, so no accumulator seeding is needed.
@ Results are narrowed with unsigned saturation before storing.
@ The macro ends by popping {r4, pc}, i.e. it returns from the function.
1543 .macro weight_16 add |
8664 | 1544 vdup.8 d0, r3 |
1545 1: subs ip, ip, #2 | |
1546 vld1.8 {d20-d21},[r0,:128], r1 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1547 vmull.u8 q2, d0, d20 |
8664 | 1548 pld [r0] |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1549 vmull.u8 q3, d0, d21 |
8664 | 1550 vld1.8 {d28-d29},[r0,:128], r1 |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1551 vmull.u8 q12, d0, d28 |
8664 | 1552 pld [r0] |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1553 vmull.u8 q13, d0, d29 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1554 \add q2, q8, q2 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1555 vrshl.s16 q2, q2, q9 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1556 \add q3, q8, q3 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1557 vrshl.s16 q3, q3, q9 |
8664 | 1558 vqmovun.s16 d4, q2 |
1559 vqmovun.s16 d5, q3 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1560 \add q12, q8, q12 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1561 vrshl.s16 q12, q12, q9 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1562 \add q13, q8, q13 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1563 vrshl.s16 q13, q13, q9 |
8664 | 1564 vqmovun.s16 d24, q12 |
1565 vqmovun.s16 d25, q13 | |
1566 vst1.8 {d4- d5}, [r4,:128], r1 | |
1567 vst1.8 {d24-d25},[r4,:128], r1 | |
1568 bne 1b | |
1569 pop {r4, pc} | |
1570 .endm | |
1571 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
@ weight_8 add
@ 8-byte-wide counterpart of weight_16, with the same register contract:
@ r0 input and r4 output stepped by r1, r3 weight broadcast into d0,
@ q8 combined with every product through \add (q8 is the first operand),
@ q9 the vrshl.s16 shift count, ip the row counter decremented by 2.
@ Ends by popping {r4, pc}, i.e. it returns from the function.
1572 .macro weight_8 add |
8664 | 1573 vdup.8 d0, r3 |
1574 1: subs ip, ip, #2 | |
1575 vld1.8 {d4},[r0,:64], r1 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1576 vmull.u8 q1, d0, d4 |
8664 | 1577 pld [r0] |
1578 vld1.8 {d6},[r0,:64], r1 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1579 vmull.u8 q10, d0, d6 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1580 \add q1, q8, q1 |
8664 | 1581 pld [r0] |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1582 vrshl.s16 q1, q1, q9 |
8664 | 1583 vqmovun.s16 d2, q1 |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1584 \add q10, q8, q10 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1585 vrshl.s16 q10, q10, q9 |
8664 | 1586 vqmovun.s16 d4, q10 |
1587 vst1.8 {d2},[r4,:64], r1 | |
1588 vst1.8 {d4},[r4,:64], r1 | |
1589 bne 1b | |
1590 pop {r4, pc} | |
1591 .endm | |
1592 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
@ weight_4 add
@ 4-byte-wide counterpart of weight_16/weight_8, with the same register
@ contract.  Two 4-byte rows are packed into one d register, so an
@ iteration handles four rows and ip is decremented by 4.
@ If the subtraction goes negative (fewer than four rows were left) the code
@ branches to label 2 after the first row pair and stores only two rows.
@ Both exits pop {r4, pc}, i.e. they return from the function.
@ NOTE(review): the "vmov q1, q8" / "vmov q10, q8" instructions (before the
@ loop and again near its end) look like dead stores: q1 and q10 are fully
@ overwritten by vmull.u8 before being read.  Confirm and consider
@ dropping them.
1593 .macro weight_4 add |
8664 | 1594 vdup.8 d0, r3 |
1595 vmov q1, q8 | |
1596 vmov q10, q8 | |
1597 1: subs ip, ip, #4 | |
1598 vld1.32 {d4[0]},[r0,:32], r1 | |
1599 vld1.32 {d4[1]},[r0,:32], r1 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1600 vmull.u8 q1, d0, d4 |
8664 | 1601 pld [r0] |
1602 blt 2f | |
1603 vld1.32 {d6[0]},[r0,:32], r1 | |
1604 vld1.32 {d6[1]},[r0,:32], r1 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1605 vmull.u8 q10, d0, d6 |
8664 | 1606 pld [r0] |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1607 \add q1, q8, q1 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1608 vrshl.s16 q1, q1, q9 |
8664 | 1609 vqmovun.s16 d2, q1 |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1610 \add q10, q8, q10 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1611 vrshl.s16 q10, q10, q9 |
8664 | 1612 vqmovun.s16 d4, q10 |
1613 vmov q10, q8 | |
1614 vst1.32 {d2[0]},[r4,:32], r1 | |
1615 vst1.32 {d2[1]},[r4,:32], r1 | |
1616 vmov q1, q8 | |
1617 vst1.32 {d4[0]},[r4,:32], r1 | |
1618 vst1.32 {d4[1]},[r4,:32], r1 | |
1619 bne 1b | |
1620 pop {r4, pc} | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1621 2: \add q1, q8, q1 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1622 vrshl.s16 q1, q1, q9 |
8664 | 1623 vqmovun.s16 d2, q1 |
1624 vst1.32 {d2[0]},[r4,:32], r1 | |
1625 vst1.32 {d2[1]},[r4,:32], r1 | |
1626 pop {r4, pc} | |
1627 .endm | |
1628 | |
@ weight_func w
@ Emits weight_h264_pixels_<w>_neon, the shared body behind the
@ ff_weight_h264_pixels_<w>x<h>_neon entry points (which set ip = h).
@ Inputs as used below:
@   r0      pixel block, read and written in place (copied to r4 as output)
@   r1      row stride
@   r2      shift amount
@   r3      signed weight
@   [sp]    one stack word, shifted left by r2 and broadcast into q8
@ NOTE(review): presumably r2 = log2_denom and the stack word is the
@ offset - confirm against the C prototype.
@ Four expansions of weight_<w> are selected by r2 and the sign of r3:
@   r2 > 1,  r3 >= 0 : vhadd, q9 = 1 - r2
@   r2 > 1,  r3 <  0 : r3 negated, vhsub, q9 = 1 - r2
@   r2 <= 1, r3 >= 0 : vadd,  q9 = -r2
@   r2 <= 1, r3 <  0 : r3 negated, vsub,  q9 = -r2
@ For r2 > 1 the halving add/sub performs one bit of the right shift inside
@ the addition and the rounding shift does the remaining r2 - 1 bits; per
@ the changeset note in this listing this fixes a corner-case overflow.
@ The flags from "cmp r2, #1" are consumed by "ble 20f"; the lsl, vdup and
@ mov in between do not set flags.  Each expansion ends in its own
@ pop {r4, pc}, so nothing falls through from one case into the next.
@ Label 10 is defined twice; each "blt 10f" binds to the next one after it.
1629 .macro weight_func w | |
1630 function weight_h264_pixels_\w\()_neon | |
1631 push {r4, lr} | |
1632 ldr r4, [sp, #8] | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1633 cmp r2, #1 |
8664 | 1634 lsl r4, r4, r2 |
1635 vdup.16 q8, r4 | |
1636 mov r4, r0 | |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1637 ble 20f |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1638 rsb lr, r2, #1 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1639 vdup.16 q9, lr |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1640 cmp r3, #0 |
8664 | 1641 blt 10f |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1642 weight_\w vhadd.s16 |
8664 | 1643 10: rsb r3, r3, #0 |
9072
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1644 weight_\w vhsub.s16 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1645 20: rsb lr, r2, #0 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1646 vdup.16 q9, lr |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1647 cmp r3, #0 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1648 blt 10f |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1649 weight_\w vadd.s16 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1650 10: rsb r3, r3, #0 |
d56b711c6c5d
ARM: fix corner-case overflow in H.264 weighted prediction
mru
parents:
8664
diff
changeset
|
1651 weight_\w vsub.s16 |
8664 | 1652 .endfunc |
1653 .endm | |
1654 | |
@ weight_entry w, h, b=1
@ Emits the exported entry point ff_weight_h264_pixels_<w>x<h>_neon, which
@ loads the row count h into ip.  With b non-zero (the default) it then
@ branches to weight_h264_pixels_<w>_neon.  With b=0 no branch is emitted,
@ so the entry point runs into whatever is assembled next: it must be
@ instantiated directly before the matching weight_func.
1655 .macro weight_entry w, h, b=1 | |
1656 function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1 | |
1657 mov ip, #\h | |
1658 .if \b | |
1659 b weight_h264_pixels_\w\()_neon | |
1660 .endif | |
1661 .endfunc | |
1662 .endm | |
1663 | |
@ Instantiate the weighted prediction entry points.  Order matters: in each
@ group the b=0 entry emits no branch and therefore has to sit directly
@ before the weight_func expansion it falls through into.
1664 weight_entry 16, 8 | |
1665 weight_entry 16, 16, b=0 | |
1666 weight_func 16 | |
1667 | |
1668 weight_entry 8, 16 | |
1669 weight_entry 8, 4 | |
1670 weight_entry 8, 8, b=0 | |
1671 weight_func 8 | |
1672 | |
1673 weight_entry 4, 8 | |
1674 weight_entry 4, 2 | |
1675 weight_entry 4, 4, b=0 | |
1676 weight_func 4 |