Mercurial > libavcodec.hg
annotate arm/h264idct_neon.S @ 12483:0159a19bfff7 libavcodec
aacdec: Rework channel mapping compatibility hacks.
For a PCE based configuration map the channels solely based on tags.
For an indexed configuration map the channels solely based on position.
This works with all known exotic samples including al17, elem_id0, bad_concat,
and lfe_is_sce.
author | alexc |
---|---|
date | Fri, 10 Sep 2010 18:01:48 +0000 |
parents | ba14e3adeccd |
children | 8e7fd2d2193f |
rev | line source |
---|---|
/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        preserve8               @ object preserves 8-byte stack alignment (EABI attribute)
        .text
25 | |
@ void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride)
@ 4x4 inverse transform of the residual in block[], result added to dst
@ with unsigned saturation.
@ In: r0 = dst, r1 = block (16 x int16, 16-byte aligned), r2 = stride
function ff_h264_idct_add_neon, export=1
        vld1.64         {d0-d3},  [r1,:128]     @ load all 16 coefficients

        @ First 1-D pass (rows, two rows per d register).
        vswp            d1,  d2                 @ pair rows 0/2 and 1/3
        vadd.i16        d4,  d0,  d1            @ z0 + z2
        vshr.s16        q8,  q1,  #1            @ the ">>1" odd-part terms
        vsub.i16        d5,  d0,  d1            @ z0 - z2
        vadd.i16        d6,  d2,  d17           @ z1 + (z3 >> 1)
        vsub.i16        d7,  d16, d3            @ (z1 >> 1) - z3
        vadd.i16        q0,  q2,  q3            @ even/odd butterfly: sums
        vsub.i16        q1,  q2,  q3            @ even/odd butterfly: differences

        @ Transpose the 4x4 so the second pass operates on columns.
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2
        vtrn.32         d0,  d3
        vtrn.32         d1,  d2

        @ Second 1-D pass, interleaved with loading the four destination
        @ rows (one 32-bit word each) to hide memory latency.
        vadd.i16        d4,  d0,  d3
        vld1.32         {d18[0]}, [r0,:32], r2
        vswp            d1,  d3
        vshr.s16        q8,  q1,  #1
        vld1.32         {d19[1]}, [r0,:32], r2
        vsub.i16        d5,  d0,  d1
        vld1.32         {d18[1]}, [r0,:32], r2
        vadd.i16        d6,  d16, d3
        vld1.32         {d19[0]}, [r0,:32], r2
        vsub.i16        d7,  d2,  d17
        sub             r0,  r0,  r2,  lsl #2   @ rewind dst to the first row
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vrshr.s16       q0,  q0,  #6            @ rounding shift: (x + 32) >> 6
        vrshr.s16       q1,  q1,  #6

        vaddw.u8        q0,  q0,  d18           @ add widened destination pixels
        vaddw.u8        q1,  q1,  d19

        vqmovun.s16     d0,  q0                 @ saturate back to u8
        vqmovun.s16     d1,  q1

        @ Rows are stored in the same permuted order they were loaded in.
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2

        bx              lr
endfunc
8340 | 73 |
@ void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride)
@ DC-only fast path: block[0] holds the single coefficient; the rounded
@ value (dc + 32) >> 6 is added to all 16 pixels of the 4x4 block.
@ In: r0 = dst, r1 = block, r2 = stride
function ff_h264_idct_dc_add_neon, export=1
        vld1.16         {d2[],d3[]}, [r1,:16]   @ splat DC coefficient across q1
        vrshr.s16       q1,  q1,  #6            @ rounded scale: (dc + 32) >> 6
        vld1.32         {d0[0]}, [r0,:32], r2   @ gather the four dst rows
        vld1.32         {d0[1]}, [r0,:32], r2
        vaddw.u8        q2,  q1,  d0            @ widen pixels and add dc
        vld1.32         {d1[0]}, [r0,:32], r2
        vld1.32         {d1[1]}, [r0,:32], r2
        vaddw.u8        q1,  q1,  d1
        vqmovun.s16     d0,  q2                 @ saturate back to u8
        vqmovun.s16     d1,  q1
        sub             r0,  r0,  r2,  lsl #2   @ rewind dst to the first row
        vst1.32         {d0[0]}, [r0,:32], r2
        vst1.32         {d0[1]}, [r0,:32], r2
        vst1.32         {d1[0]}, [r0,:32], r2
        vst1.32         {d1[1]}, [r0,:32], r2
        bx              lr
endfunc
8462 | 92 |
@ void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
@                              int16_t *block, int stride, const uint8_t nnz[])
@ Applies the 4x4 idct_add to each of 16 luma blocks that have coded
@ coefficients, dispatching to the DC-only version when exactly one
@ coefficient is coded and it is a nonzero DC.
@ NOTE(review): the 5th argument indexed through scan8 appears to be the
@ per-block nonzero-coefficient-count table - confirm against the C caller.
function ff_h264_idct_add16_neon, export=1
        push            {r4-r8,lr}
        mov             r4,  r0                 @ r4 = dst base
        mov             r5,  r1                 @ r5 = block_offset[]
        mov             r1,  r2                 @ r1 = coefficient blocks
        mov             r2,  r3                 @ r2 = stride
        ldr             r6,  [sp, #24]          @ 5th arg (6 regs pushed = 24 bytes)
        movrel          r7,  scan8
        mov             ip,  #16                @ 16 4x4 blocks
1:      ldrb            r8,  [r7], #1           @ scan8 position of this block
        ldr             r0,  [r5], #4           @ offset of this block in dst
        ldrb            r8,  [r6, r8]           @ r8 = coded-coefficient count
        subs            r8,  r8,  #1
        blt             2f                      @ count == 0: nothing coded, skip
        ldrsh           lr,  [r1]               @ lr = DC coefficient
        add             r0,  r0,  r4
        movne           lr,  #0                 @ count > 1: force full-idct path
        cmp             lr,  #0
        adrne           lr,  ff_h264_idct_dc_add_neon @ single coeff, DC != 0
        adreq           lr,  ff_h264_idct_add_neon    @ otherwise full transform
        blx             lr
2:      subs            ip,  ip,  #1
        add             r1,  r1,  #32           @ next block: 16 coeffs * 2 bytes
        bne             1b
        pop             {r4-r8,pc}
endfunc
8462 | 119 |
@ void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset,
@                                   int16_t *block, int stride,
@                                   const uint8_t nnz[])
@ Intra variant: when a block has AC coefficients the full idct_add runs;
@ otherwise the DC-only add runs, but only if the DC itself is nonzero.
function ff_h264_idct_add16intra_neon, export=1
        push            {r4-r8,lr}
        mov             r4,  r0                 @ r4 = dst base
        mov             r5,  r1                 @ r5 = block_offset[]
        mov             r1,  r2                 @ r1 = coefficient blocks
        mov             r2,  r3                 @ r2 = stride
        ldr             r6,  [sp, #24]          @ 5th arg (6 regs pushed = 24 bytes)
        movrel          r7,  scan8
        mov             ip,  #16                @ 16 4x4 blocks
1:      ldrb            r8,  [r7], #1           @ scan8 position of this block
        ldr             r0,  [r5], #4           @ offset of this block in dst
        ldrb            r8,  [r6, r8]           @ r8 = coded-coefficient count
        add             r0,  r0,  r4
        cmp             r8,  #0                 @ flags live across the reload below
        ldrsh           r8,  [r1]               @ r8 = DC coefficient (ldrsh keeps flags)
        adrne           lr,  ff_h264_idct_add_neon    @ coeffs coded: full transform
        adreq           lr,  ff_h264_idct_dc_add_neon @ else DC-only candidate
        cmpeq           r8,  #0                 @ ... skipped entirely if DC == 0 too
        blxne           lr
        subs            ip,  ip,  #1
        add             r1,  r1,  #32           @ next block: 16 coeffs * 2 bytes
        bne             1b
        pop             {r4-r8,pc}
endfunc
8462 | 144 |
@ void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
@                             int16_t *block, int stride, const uint8_t nnz[])
@ Chroma variant: dest points at a pair of plane pointers (loaded into
@ r4/r9); processes the 8 chroma blocks starting at index 16, using the
@ scan8 entries past the 16 luma positions.
function ff_h264_idct_add8_neon, export=1
        push            {r4-r10,lr}
        ldm             r0,  {r4,r9}            @ r4 = dest[0], r9 = dest[1]
        add             r5,  r1,  #16*4         @ skip the 16 luma block offsets
        add             r1,  r2,  #16*32        @ coefficients of block 16
        mov             r2,  r3                 @ r2 = stride
        ldr             r6,  [sp, #32]          @ 5th arg (8 regs pushed = 32 bytes)
        movrel          r7,  scan8+16           @ chroma part of the scan8 table
        mov             ip,  #8                 @ 8 chroma 4x4 blocks
1:      ldrb            r8,  [r7], #1           @ scan8 position of this block
        ldr             r0,  [r5], #4           @ offset of this block in its plane
        ldrb            r8,  [r6, r8]           @ r8 = coded-coefficient count
        tst             ip,  #4                 @ pick the plane from the counter
        addeq           r0,  r0,  r4            @ bit 2 clear -> first plane pointer
        addne           r0,  r0,  r9            @ bit 2 set   -> second plane pointer
        cmp             r8,  #0                 @ flags live across the reload below
        ldrsh           r8,  [r1]               @ r8 = DC coefficient
        adrne           lr,  ff_h264_idct_add_neon    @ coeffs coded: full transform
        adreq           lr,  ff_h264_idct_dc_add_neon @ else DC-only candidate
        cmpeq           r8,  #0                 @ ... skipped entirely if DC == 0 too
        blxne           lr
        subs            ip,  ip,  #1
        add             r1,  r1,  #32           @ next block: 16 coeffs * 2 bytes
        bne             1b
        pop             {r4-r10,pc}
endfunc
8462 | 171 |
@ idct8x8_cols pass
@ One 1-D pass of the 8x8 inverse transform over the coefficient matrix
@ held in q8-q15 (with q2 as extra working storage).
@   pass == 0: operates on the data as loaded by the caller and finishes
@              loading q14-q15 itself (advancing r1).
@   pass == 1: first transposes the 8x8 matrix (vtrn/vswp sequence), then
@              repeats the transform on the other dimension.
@ qa/qb alias whichever register's role swaps between the two passes
@ (q2 <-> q14), so the arithmetic below is written once for both.
.macro  idct8x8_cols pass
  .if \pass == 0
        qa      .req    q2
        qb      .req    q14
        vshr.s16        q2,  q10, #1
        vadd.i16        q0,  q8,  q12
        vld1.16         {q14-q15},[r1,:128]!    @ complete the coefficient load
        vsub.i16        q1,  q8,  q12
        vshr.s16        q3,  q14, #1
        vsub.i16        q2,  q2,  q14
        vadd.i16        q3,  q3,  q10
  .else
        qa      .req    q14
        qb      .req    q2
        @ 8x8 transpose, interleaved with the first arithmetic steps.
        vtrn.32         q8,  q10
        vtrn.16         q12, q13
        vtrn.32         q9,  q11
        vtrn.32         q12, q2
        vtrn.32         q13, q15
        vswp            d21, d4
        vshr.s16        q14, q10, #1
        vswp            d17, d24
        vshr.s16        q3,  q2,  #1
        vswp            d19, d26
        vadd.i16        q0,  q8,  q12
        vswp            d23, d30
        vsub.i16        q1,  q8,  q12
        vsub.i16        q14, q14, q2
        vadd.i16        q3,  q3,  q10
  .endif
        @ Even-part butterflies.
        vadd.i16        q10, q1,  qa
        vsub.i16        q12, q1,  qa
        vadd.i16        q8,  q0,  q3
        vsub.i16        qb,  q0,  q3
        @ Odd part: sums/differences of the odd coefficients...
        vsub.i16        q0,  q13, q11
        vadd.i16        q1,  q15, q9
        vsub.i16        qa,  q15, q9
        vadd.i16        q3,  q13, q11
        vsub.i16        q0,  q0,  q15
        vsub.i16        q1,  q1,  q11
        vadd.i16        qa,  qa,  q13
        vadd.i16        q3,  q3,  q9
        @ ... plus the ">>1" correction terms ...
        vshr.s16        q9,  q9,  #1
        vshr.s16        q11, q11, #1
        vshr.s16        q13, q13, #1
        vshr.s16        q15, q15, #1
        vsub.i16        q0,  q0,  q15
        vsub.i16        q1,  q1,  q11
        vadd.i16        qa,  qa,  q13
        vadd.i16        q3,  q3,  q9
        @ ... and the ">>2" cross terms.
        vshr.s16        q9,  q0,  #2
        vshr.s16        q11, q1,  #2
        vshr.s16        q13, qa,  #2
        vshr.s16        q15, q3,  #2
        vsub.i16        q3,  q3,  q9
        vsub.i16        qa,  q11, qa
        vadd.i16        q1,  q1,  q13
        vadd.i16        q0,  q0,  q15
  .if \pass == 0
        @ Final even/odd combine; vtrn.16 starts the transpose early so
        @ pass 1 has less to do.
        vsub.i16        q15, q8,  q3
        vadd.i16        q8,  q8,  q3
        vadd.i16        q9,  q10, q2
        vsub.i16        q2,  q10, q2
        vtrn.16         q8,  q9
        vadd.i16        q10, q12, q1
        vtrn.16         q2,  q15
        vadd.i16        q11, q14, q0
        vsub.i16        q13, q12, q1
        vtrn.16         q10, q11
        vsub.i16        q12, q14, q0
  .else
        @ Final even/odd combine; results left in q8-q15 for the caller.
        vsub.i16        q15, q8,  q3
        vadd.i16        q8,  q8,  q3
        vadd.i16        q9,  q10, q14
        vsub.i16        q14, q10, q14
        vadd.i16        q10, q12, q1
        vsub.i16        q13, q12, q1
        vadd.i16        q11, q2,  q0
        vsub.i16        q12, q2,  q0
  .endif
        .unreq          qa
        .unreq          qb
.endm
255 | |
@ void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride)
@ Full 8x8 inverse transform; the result is added to dst with unsigned
@ saturation.  The two 1-D passes are done by the idct8x8_cols macro
@ (pass 0 loads the remaining coefficients, advancing r1 by 32 bytes on
@ top of the 96 loaded here - hence the "sub r1, r1, #128" at the end).
@ In: r0 = dst, r1 = block (64 x int16, 16-byte aligned), r2 = stride
function ff_h264_idct8_add_neon, export=1
        vld1.16         {q8-q9},  [r1,:128]!    @ rows 0-1
        vld1.16         {q10-q11},[r1,:128]!    @ rows 2-3
        vld1.16         {q12-q13},[r1,:128]!    @ rows 4-5 (6-7 loaded in pass 0)

        idct8x8_cols    0
        idct8x8_cols    1

        @ Round, widen-add the eight destination rows, saturate, store.
        @ Loads/arithmetic/stores are interleaved to hide latency;
        @ r3 keeps the original dst while r0 walks the load pointer.
        mov             r3,  r0
        vrshr.s16       q8,  q8,  #6            @ rounding shift: (x + 32) >> 6
        vld1.8          {d0},  [r0,:64], r2
        vrshr.s16       q9,  q9,  #6
        vld1.8          {d1},  [r0,:64], r2
        vrshr.s16       q10, q10, #6
        vld1.8          {d2},  [r0,:64], r2
        vrshr.s16       q11, q11, #6
        vld1.8          {d3},  [r0,:64], r2
        vrshr.s16       q12, q12, #6
        vld1.8          {d4},  [r0,:64], r2
        vrshr.s16       q13, q13, #6
        vld1.8          {d5},  [r0,:64], r2
        vrshr.s16       q14, q14, #6
        vld1.8          {d6},  [r0,:64], r2
        vrshr.s16       q15, q15, #6
        vld1.8          {d7},  [r0,:64], r2
        vaddw.u8        q8,  q8,  d0
        vaddw.u8        q9,  q9,  d1
        vaddw.u8        q10, q10, d2
        vqmovun.s16     d0,  q8
        vaddw.u8        q11, q11, d3
        vqmovun.s16     d1,  q9
        vaddw.u8        q12, q12, d4
        vqmovun.s16     d2,  q10
        vst1.8          {d0},  [r3,:64], r2
        vaddw.u8        q13, q13, d5
        vqmovun.s16     d3,  q11
        vst1.8          {d1},  [r3,:64], r2
        vaddw.u8        q14, q14, d6
        vqmovun.s16     d4,  q12
        vst1.8          {d2},  [r3,:64], r2
        vaddw.u8        q15, q15, d7
        vqmovun.s16     d5,  q13
        vst1.8          {d3},  [r3,:64], r2
        vqmovun.s16     d6,  q14
        vqmovun.s16     d7,  q15
        vst1.8          {d4},  [r3,:64], r2
        vst1.8          {d5},  [r3,:64], r2
        vst1.8          {d6},  [r3,:64], r2
        vst1.8          {d7},  [r3,:64], r2

        sub             r1,  r1,  #128          @ restore the block pointer
        bx              lr
endfunc
309 | |
@ void ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride)
@ DC-only fast path for the 8x8 transform: block[0] is the single
@ coefficient; the rounded value (dc + 32) >> 6 is added to all 64
@ pixels of the 8x8 destination block.
@ In: r0 = dst, r1 = block, r2 = stride
function ff_h264_idct8_dc_add_neon, export=1
        vld1.16         {d30[],d31[]},[r1,:16]  @ splat DC coefficient across q15
        vld1.32         {d0},  [r0,:64], r2     @ gather the eight dst rows,
        vrshr.s16       q15, q15, #6            @ interleaved with (dc + 32) >> 6
        vld1.32         {d1},  [r0,:64], r2
        vld1.32         {d2},  [r0,:64], r2
        vaddw.u8        q8,  q15, d0            @ widen each row and add dc
        vld1.32         {d3},  [r0,:64], r2
        vaddw.u8        q9,  q15, d1
        vld1.32         {d4},  [r0,:64], r2
        vaddw.u8        q10, q15, d2
        vld1.32         {d5},  [r0,:64], r2
        vaddw.u8        q11, q15, d3
        vld1.32         {d6},  [r0,:64], r2
        vaddw.u8        q12, q15, d4
        vld1.32         {d7},  [r0,:64], r2
        vaddw.u8        q13, q15, d5
        vaddw.u8        q14, q15, d6
        vaddw.u8        q15, q15, d7            @ q15 reused: dc no longer needed
        vqmovun.s16     d0,  q8                 @ saturate back to u8
        vqmovun.s16     d1,  q9
        vqmovun.s16     d2,  q10
        vqmovun.s16     d3,  q11
        sub             r0,  r0,  r2,  lsl #3   @ rewind dst to the first row
        vst1.32         {d0},  [r0,:64], r2
        vqmovun.s16     d4,  q12
        vst1.32         {d1},  [r0,:64], r2
        vqmovun.s16     d5,  q13
        vst1.32         {d2},  [r0,:64], r2
        vqmovun.s16     d6,  q14
        vst1.32         {d3},  [r0,:64], r2
        vqmovun.s16     d7,  q15
        vst1.32         {d4},  [r0,:64], r2
        vst1.32         {d5},  [r0,:64], r2
        vst1.32         {d6},  [r0,:64], r2
        vst1.32         {d7},  [r0,:64], r2
        bx              lr
endfunc
348 | |
@ void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
@                              int16_t *block, int stride, const uint8_t nnz[])
@ Applies the 8x8 idct_add to each of the four 8x8 blocks that have coded
@ coefficients, dispatching to the DC-only version when exactly one
@ coefficient is coded and it is a nonzero DC.  Mirrors the dispatch
@ logic of ff_h264_idct_add16_neon, stepping scan8 by 4 entries and
@ block_offset by 4 words per 8x8 block.
@ Consistency fix: use the standard alias `ip` for r12, matching the
@ loop counters in the other add16/add8 functions in this file
@ (identical register, identical encoding).
function ff_h264_idct8_add4_neon, export=1
        push            {r4-r8,lr}
        mov             r4,  r0                 @ r4 = dst base
        mov             r5,  r1                 @ r5 = block_offset[]
        mov             r1,  r2                 @ r1 = coefficient blocks
        mov             r2,  r3                 @ r2 = stride
        ldr             r6,  [sp, #24]          @ 5th arg (6 regs pushed = 24 bytes)
        movrel          r7,  scan8
        mov             ip,  #16                @ counts down 4 per 8x8 block
1:      ldrb            r8,  [r7], #4           @ one 8x8 block = 4 scan8 entries
        ldr             r0,  [r5], #16          @ one 8x8 block = 4 offset words
        ldrb            r8,  [r6, r8]           @ r8 = coded-coefficient count
        subs            r8,  r8,  #1
        blt             2f                      @ count == 0: nothing coded, skip
        ldrsh           lr,  [r1]               @ lr = DC coefficient
        add             r0,  r0,  r4
        movne           lr,  #0                 @ count > 1: force full-idct path
        cmp             lr,  #0
        adrne           lr,  ff_h264_idct8_dc_add_neon @ single coeff, DC != 0
        adreq           lr,  ff_h264_idct8_add_neon    @ otherwise full transform
        blx             lr
2:      subs            ip,  ip,  #4
        add             r1,  r1,  #128          @ next block: 64 coeffs * 2 bytes
        bne             1b
        pop             {r4-r8,pc}
endfunc
375 | |
        .section .rodata
@ scan8: per-block positions (encoded as x + y*8) used above to index the
@ 5th function argument.  First 16 entries cover the luma 4x4 blocks,
@ the last 8 the chroma blocks.  NOTE(review): presumably mirrors the
@ C-side scan8[] table - keep in sync with it.
scan8:  .byte           4+1*8, 5+1*8, 4+2*8, 5+2*8
        .byte           6+1*8, 7+1*8, 6+2*8, 7+2*8
        .byte           4+3*8, 5+3*8, 4+4*8, 5+4*8
        .byte           6+3*8, 7+3*8, 6+4*8, 7+4*8
        .byte           1+1*8, 2+1*8
        .byte           1+2*8, 2+2*8
        .byte           1+4*8, 2+4*8
        .byte           1+5*8, 2+5*8