Mercurial repository: libavcodec.hg
Comparison view of arm/h264pred_neon.S at changeset 10623:f52d07b169b4 (libavcodec branch)
Commit message: ARM: NEON optimised H.264 16x16 and 8x8 prediction
author   | mru |
---------|-----|
date     | Wed, 02 Dec 2009 14:56:45 +0000 |
parents  | (none — file added in this changeset) |
children | 361a5fcb4393 |
Comparison legend: equal / deleted / inserted / replaced
Comparing 10622:2474aceea736 | against | 10623:f52d07b169b4
---|---|
1 /* | |
2 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> | |
3 * | |
4 * This file is part of FFmpeg. | |
5 * | |
6 * FFmpeg is free software; you can redistribute it and/or | |
7 * modify it under the terms of the GNU Lesser General Public | |
8 * License as published by the Free Software Foundation; either | |
9 * version 2.1 of the License, or (at your option) any later version. | |
10 * | |
11 * FFmpeg is distributed in the hope that it will be useful, | |
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 * Lesser General Public License for more details. | |
15 * | |
16 * You should have received a copy of the GNU Lesser General Public | |
17 * License along with FFmpeg; if not, write to the Free Software | |
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
19 */ | |
20 | |
21 #include "asm.S" | |
22 | |
23 .macro ldcol.8 rd, rs, rt, n=8, hi=0 | |
24 .if \n == 8 || \hi == 0 | |
25 vld1.8 {\rd[0]}, [\rs], \rt | |
26 vld1.8 {\rd[1]}, [\rs], \rt | |
27 vld1.8 {\rd[2]}, [\rs], \rt | |
28 vld1.8 {\rd[3]}, [\rs], \rt | |
29 .endif | |
30 .if \n == 8 || \hi == 1 | |
31 vld1.8 {\rd[4]}, [\rs], \rt | |
32 vld1.8 {\rd[5]}, [\rs], \rt | |
33 vld1.8 {\rd[6]}, [\rs], \rt | |
34 vld1.8 {\rd[7]}, [\rs], \rt | |
35 .endif | |
36 .endm | |
37 | |
38 .macro add16x8 dq, dl, dh, rl, rh | |
39 vaddl.u8 \dq, \rl, \rh | |
40 vadd.u16 \dl, \dl, \dh | |
41 vpadd.u16 \dl, \dl, \dl | |
42 vpadd.u16 \dl, \dl, \dl | |
43 .endm | |
44 | |
45 function ff_pred16x16_128_dc_neon, export=1 | |
46 vmov.i8 q0, #128 | |
47 b .L_pred16x16_dc_end | |
48 .endfunc | |
49 | |
50 function ff_pred16x16_top_dc_neon, export=1 | |
51 sub r2, r0, r1 | |
52 vld1.8 {q0}, [r2,:128] | |
53 add16x8 q0, d0, d1, d0, d1 | |
54 vrshrn.u16 d0, q0, #4 | |
55 vdup.8 q0, d0[0] | |
56 b .L_pred16x16_dc_end | |
57 .endfunc | |
58 | |
59 function ff_pred16x16_left_dc_neon, export=1 | |
60 sub r2, r0, #1 | |
61 ldcol.8 d0, r2, r1 | |
62 ldcol.8 d1, r2, r1 | |
63 add16x8 q0, d0, d1, d0, d1 | |
64 vrshrn.u16 d0, q0, #4 | |
65 vdup.8 q0, d0[0] | |
66 b .L_pred16x16_dc_end | |
67 .endfunc | |
68 | |
69 function ff_pred16x16_dc_neon, export=1 | |
70 sub r2, r0, r1 | |
71 vld1.8 {q0}, [r2,:128] | |
72 sub r2, r0, #1 | |
73 ldcol.8 d2, r2, r1 | |
74 ldcol.8 d3, r2, r1 | |
75 vaddl.u8 q0, d0, d1 | |
76 vaddl.u8 q1, d2, d3 | |
77 vadd.u16 q0, q0, q1 | |
78 vadd.u16 d0, d0, d1 | |
79 vpadd.u16 d0, d0, d0 | |
80 vpadd.u16 d0, d0, d0 | |
81 vrshrn.u16 d0, q0, #5 | |
82 vdup.8 q0, d0[0] | |
83 .L_pred16x16_dc_end: | |
84 mov r3, #8 | |
85 6: vst1.8 {q0}, [r0,:128], r1 | |
86 vst1.8 {q0}, [r0,:128], r1 | |
87 subs r3, r3, #1 | |
88 bne 6b | |
89 bx lr | |
90 .endfunc | |
91 | |
92 function ff_pred16x16_hor_neon, export=1 | |
93 sub r2, r0, #1 | |
94 mov r3, #16 | |
95 1: vld1.8 {d0[],d1[]},[r2], r1 | |
96 vst1.8 {q0}, [r0,:128], r1 | |
97 subs r3, r3, #1 | |
98 bne 1b | |
99 bx lr | |
100 .endfunc | |
101 | |
102 function ff_pred16x16_vert_neon, export=1 | |
103 sub r0, r0, r1 | |
104 vld1.8 {q0}, [r0,:128], r1 | |
105 mov r3, #8 | |
106 1: vst1.8 {q0}, [r0,:128], r1 | |
107 vst1.8 {q0}, [r0,:128], r1 | |
108 subs r3, r3, #1 | |
109 bne 1b | |
110 bx lr | |
111 .endfunc | |
112 | |
113 function ff_pred16x16_plane_neon, export=1 | |
114 sub r3, r0, r1 | |
115 add r2, r3, #8 | |
116 sub r3, r3, #1 | |
117 vld1.8 {d0}, [r3] | |
118 vld1.8 {d2}, [r2,:64], r1 | |
119 ldcol.8 d1, r3, r1 | |
120 add r3, r3, r1 | |
121 ldcol.8 d3, r3, r1 | |
122 vrev64.8 q0, q0 | |
123 vaddl.u8 q8, d2, d3 | |
124 vsubl.u8 q2, d2, d0 | |
125 vsubl.u8 q3, d3, d1 | |
126 movrel r3, p16weight | |
127 vld1.8 {q0}, [r3,:128] | |
128 vmul.s16 q2, q2, q0 | |
129 vmul.s16 q3, q3, q0 | |
130 vadd.i16 d4, d4, d5 | |
131 vadd.i16 d5, d6, d7 | |
132 vpadd.i16 d4, d4, d5 | |
133 vpadd.i16 d4, d4, d4 | |
134 vshl.i16 d5, d4, #2 | |
135 vaddl.s16 q2, d4, d5 | |
136 vrshrn.s32 d4, q2, #6 | |
137 mov r3, #0 | |
138 vtrn.16 d4, d5 | |
139 vadd.i16 d2, d4, d5 | |
140 vshl.i16 d3, d2, #3 | |
141 vrev64.16 d16, d17 | |
142 vsub.i16 d3, d3, d2 | |
143 vadd.i16 d16, d16, d0 | |
144 vshl.i16 d2, d16, #4 | |
145 vsub.i16 d2, d2, d3 | |
146 vshl.i16 d3, d4, #4 | |
147 vext.16 q0, q0, q0, #7 | |
148 vsub.i16 d6, d5, d3 | |
149 vmov.16 d0[0], r3 | |
150 vmul.i16 q0, q0, d4[0] | |
151 vdup.16 q1, d2[0] | |
152 vdup.16 q2, d4[0] | |
153 vdup.16 q3, d6[0] | |
154 vshl.i16 q2, q2, #3 | |
155 vadd.i16 q1, q1, q0 | |
156 vadd.i16 q3, q3, q2 | |
157 mov r3, #16 | |
158 1: | |
159 vqshrun.s16 d0, q1, #5 | |
160 vadd.i16 q1, q1, q2 | |
161 vqshrun.s16 d1, q1, #5 | |
162 vadd.i16 q1, q1, q3 | |
163 vst1.8 {q0}, [r0,:128], r1 | |
164 subs r3, r3, #1 | |
165 bne 1b | |
166 bx lr | |
167 .endfunc | |
168 | |
169 .section .rodata | |
170 .align 4 | |
171 p16weight: | |
172 .short 1,2,3,4,5,6,7,8 | |
173 | |
174 .text | |
175 | |
176 function ff_pred8x8_hor_neon, export=1 | |
177 sub r2, r0, #1 | |
178 mov r3, #8 | |
179 1: vld1.8 {d0[]}, [r2], r1 | |
180 vst1.8 {d0}, [r0,:64], r1 | |
181 subs r3, r3, #1 | |
182 bne 1b | |
183 bx lr | |
184 .endfunc | |
185 | |
186 function ff_pred8x8_vert_neon, export=1 | |
187 sub r0, r0, r1 | |
188 vld1.8 {d0}, [r0,:64], r1 | |
189 mov r3, #4 | |
190 1: vst1.8 {d0}, [r0,:64], r1 | |
191 vst1.8 {d0}, [r0,:64], r1 | |
192 subs r3, r3, #1 | |
193 bne 1b | |
194 bx lr | |
195 .endfunc | |
196 | |
197 function ff_pred8x8_plane_neon, export=1 | |
198 sub r3, r0, r1 | |
199 add r2, r3, #4 | |
200 sub r3, r3, #1 | |
201 vld1.32 {d0[0]}, [r3] | |
202 vld1.32 {d2[0]}, [r2,:32], r1 | |
203 ldcol.8 d0, r3, r1, 4, hi=1 | |
204 add r3, r3, r1 | |
205 ldcol.8 d3, r3, r1, 4 | |
206 vaddl.u8 q8, d2, d3 | |
207 vrev32.8 d0, d0 | |
208 vtrn.32 d2, d3 | |
209 vsubl.u8 q2, d2, d0 | |
210 movrel r3, p16weight | |
211 vld1.16 {q0}, [r3,:128] | |
212 vmul.s16 d4, d4, d0 | |
213 vmul.s16 d5, d5, d0 | |
214 vpadd.i16 d4, d4, d5 | |
215 vpaddl.s16 d4, d4 | |
216 vshl.i32 d5, d4, #4 | |
217 vadd.s32 d4, d4, d5 | |
218 vrshrn.s32 d4, q2, #5 | |
219 mov r3, #0 | |
220 vtrn.16 d4, d5 | |
221 vadd.i16 d2, d4, d5 | |
222 vshl.i16 d3, d2, #2 | |
223 vrev64.16 d16, d16 | |
224 vsub.i16 d3, d3, d2 | |
225 vadd.i16 d16, d16, d0 | |
226 vshl.i16 d2, d16, #4 | |
227 vsub.i16 d2, d2, d3 | |
228 vshl.i16 d3, d4, #3 | |
229 vext.16 q0, q0, q0, #7 | |
230 vsub.i16 d6, d5, d3 | |
231 vmov.16 d0[0], r3 | |
232 vmul.i16 q0, q0, d4[0] | |
233 vdup.16 q1, d2[0] | |
234 vdup.16 q2, d4[0] | |
235 vdup.16 q3, d6[0] | |
236 vshl.i16 q2, q2, #3 | |
237 vadd.i16 q1, q1, q0 | |
238 vadd.i16 q3, q3, q2 | |
239 mov r3, #8 | |
240 1: | |
241 vqshrun.s16 d0, q1, #5 | |
242 vadd.i16 q1, q1, q3 | |
243 vst1.8 {d0}, [r0,:64], r1 | |
244 subs r3, r3, #1 | |
245 bne 1b | |
246 bx lr | |
247 .endfunc | |
248 | |
249 function ff_pred8x8_128_dc_neon, export=1 | |
250 vmov.i8 q0, #128 | |
251 b .L_pred8x8_dc_end | |
252 .endfunc | |
253 | |
254 function ff_pred8x8_top_dc_neon, export=1 | |
255 sub r2, r0, r1 | |
256 vld1.8 {d0}, [r2,:64] | |
257 vpaddl.u8 d0, d0 | |
258 vpadd.u16 d0, d0, d0 | |
259 vrshrn.u16 d0, q0, #2 | |
260 vdup.8 d1, d0[1] | |
261 vdup.8 d0, d0[0] | |
262 vtrn.32 d0, d1 | |
263 b .L_pred8x8_dc_end | |
264 .endfunc | |
265 | |
266 function ff_pred8x8_left_dc_neon, export=1 | |
267 sub r2, r0, #1 | |
268 ldcol.8 d0, r2, r1 | |
269 vpaddl.u8 d0, d0 | |
270 vpadd.u16 d0, d0, d0 | |
271 vrshrn.u16 d0, q0, #2 | |
272 vdup.8 d1, d0[1] | |
273 vdup.8 d0, d0[0] | |
274 b .L_pred8x8_dc_end | |
275 .endfunc | |
276 | |
277 function ff_pred8x8_dc_neon, export=1 | |
278 sub r2, r0, r1 | |
279 vld1.8 {d0}, [r2,:64] | |
280 sub r2, r0, #1 | |
281 ldcol.8 d1, r2, r1 | |
282 vtrn.32 d0, d1 | |
283 vpaddl.u8 q0, q0 | |
284 vpadd.u16 d0, d0, d1 | |
285 vpadd.u16 d1, d0, d0 | |
286 vrshrn.u16 d2, q0, #3 | |
287 vrshrn.u16 d3, q0, #2 | |
288 vdup.8 d0, d2[4] | |
289 vdup.8 d1, d3[3] | |
290 vdup.8 d4, d3[2] | |
291 vdup.8 d5, d2[5] | |
292 vtrn.32 q0, q2 | |
293 .L_pred8x8_dc_end: | |
294 mov r3, #4 | |
295 add r2, r0, r1, lsl #2 | |
296 6: vst1.8 {d0}, [r0,:64], r1 | |
297 vst1.8 {d1}, [r2,:64], r1 | |
298 subs r3, r3, #1 | |
299 bne 6b | |
300 bx lr | |
301 .endfunc | |
302 | |
303 function ff_pred8x8_l0t_dc_neon, export=1 | |
304 sub r2, r0, r1 | |
305 vld1.8 {d0}, [r2,:64] | |
306 sub r2, r0, #1 | |
307 ldcol.8 d1, r2, r1, 4 | |
308 vtrn.32 d0, d1 | |
309 vpaddl.u8 q0, q0 | |
310 vpadd.u16 d0, d0, d1 | |
311 vpadd.u16 d1, d0, d0 | |
312 vrshrn.u16 d2, q0, #3 | |
313 vrshrn.u16 d3, q0, #2 | |
314 vdup.8 d0, d2[4] | |
315 vdup.8 d1, d3[0] | |
316 vdup.8 q2, d3[2] | |
317 vtrn.32 q0, q2 | |
318 b .L_pred8x8_dc_end | |
319 .endfunc | |
320 | |
321 function ff_pred8x8_l00_dc_neon, export=1 | |
322 sub r2, r0, #1 | |
323 ldcol.8 d0, r2, r1, 4 | |
324 vpaddl.u8 d0, d0 | |
325 vpadd.u16 d0, d0, d0 | |
326 vrshrn.u16 d0, q0, #2 | |
327 vmov.i8 d1, #128 | |
328 vdup.8 d0, d0[0] | |
329 b .L_pred8x8_dc_end | |
330 .endfunc | |
331 | |
332 function ff_pred8x8_0lt_dc_neon, export=1 | |
333 sub r2, r0, r1 | |
334 vld1.8 {d0}, [r2,:64] | |
335 add r2, r0, r1, lsl #2 | |
336 sub r2, r2, #1 | |
337 ldcol.8 d1, r2, r1, 4, hi=1 | |
338 vtrn.32 d0, d1 | |
339 vpaddl.u8 q0, q0 | |
340 vpadd.u16 d0, d0, d1 | |
341 vpadd.u16 d1, d0, d0 | |
342 vrshrn.u16 d3, q0, #2 | |
343 vrshrn.u16 d2, q0, #3 | |
344 vdup.8 d0, d3[0] | |
345 vdup.8 d1, d3[3] | |
346 vdup.8 d4, d3[2] | |
347 vdup.8 d5, d2[5] | |
348 vtrn.32 q0, q2 | |
349 b .L_pred8x8_dc_end | |
350 .endfunc | |
351 | |
352 function ff_pred8x8_0l0_dc_neon, export=1 | |
353 add r2, r0, r1, lsl #2 | |
354 sub r2, r2, #1 | |
355 ldcol.8 d1, r2, r1, 4 | |
356 vpaddl.u8 d2, d1 | |
357 vpadd.u16 d2, d2, d2 | |
358 vrshrn.u16 d1, q1, #2 | |
359 vmov.i8 d0, #128 | |
360 vdup.8 d1, d1[0] | |
361 b .L_pred8x8_dc_end | |
362 .endfunc |