arm/h264pred_neon.S @ 10623:f52d07b169b4 (libavcodec)

ARM: NEON optimised H264 16x16, 8x8 pred
author    mru
date      Wed, 02 Dec 2009 14:56:45 +0000
children  361a5fcb4393

/*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

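@ ldcol.8: load \n bytes of a column into the lanes of NEON register \rd,
@ reading from \rs with stride \rt.  With n=4, hi selects whether the low
@ or the high half of \rd is filled.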
.macro  ldcol.8         rd,  rs,  rt,  n=8,  hi=0
.if \n == 8 || \hi == 0
        vld1.8          {\rd[0]}, [\rs], \rt
        vld1.8          {\rd[1]}, [\rs], \rt
        vld1.8          {\rd[2]}, [\rs], \rt
        vld1.8          {\rd[3]}, [\rs], \rt
.endif
.if \n == 8 || \hi == 1
        vld1.8          {\rd[4]}, [\rs], \rt
        vld1.8          {\rd[5]}, [\rs], \rt
        vld1.8          {\rd[6]}, [\rs], \rt
        vld1.8          {\rd[7]}, [\rs], \rt
.endif
.endm

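@ add16x8: widen the 16 bytes in \rl/\rh to u16 in \dq and reduce them,
@ leaving the sum of all 16 bytes in the low lane of \dl.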
.macro  add16x8         dq,  dl,  dh,  rl,  rh
        vaddl.u8        \dq, \rl, \rh
        vadd.u16        \dl, \dl, \dh
        vpadd.u16       \dl, \dl, \dl
        vpadd.u16       \dl, \dl, \dl
.endm

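@ 16x16 DC prediction.  Each variant computes one DC value, broadcasts it to
@ all lanes of q0 and branches to the shared store loop at
@ .L_pred16x16_dc_end.  Register usage assumes the usual h264pred calling
@ convention (r0 = block pointer, r1 = line stride); the prototype is not
@ declared in this file.
@ 128_dc: no neighbours available, fill the block with 128.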
function ff_pred16x16_128_dc_neon, export=1
        vmov.i8         q0,  #128
        b               .L_pred16x16_dc_end
        .endfunc

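@ DC from the 16 pixels above the block: (sum + 8) >> 4.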
function ff_pred16x16_top_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {q0},     [r2,:128]
        add16x8         q0,  d0,  d1,  d0,  d1
        vrshrn.u16      d0,  q0,  #4
        vdup.8          q0,  d0[0]
        b               .L_pred16x16_dc_end
        .endfunc

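@ DC from the 16 pixels to the left of the block: (sum + 8) >> 4.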
function ff_pred16x16_left_dc_neon, export=1
        sub             r2,  r0,  #1
        ldcol.8         d0,  r2,  r1
        ldcol.8         d1,  r2,  r1
        add16x8         q0,  d0,  d1,  d0,  d1
        vrshrn.u16      d0,  q0,  #4
        vdup.8          q0,  d0[0]
        b               .L_pred16x16_dc_end
        .endfunc

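@ DC from all 32 neighbours (16 above, 16 to the left): (sum + 16) >> 5.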
function ff_pred16x16_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {q0},     [r2,:128]
        sub             r2,  r0,  #1
        ldcol.8         d2,  r2,  r1
        ldcol.8         d3,  r2,  r1
        vaddl.u8        q0,  d0,  d1
        vaddl.u8        q1,  d2,  d3
        vadd.u16        q0,  q0,  q1
        vadd.u16        d0,  d0,  d1
        vpadd.u16       d0,  d0,  d0
        vpadd.u16       d0,  d0,  d0
        vrshrn.u16      d0,  q0,  #5
        vdup.8          q0,  d0[0]
.L_pred16x16_dc_end:
        mov             r3,  #8
6:      vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             6b
        bx              lr
        .endfunc

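@ Horizontal prediction: each of the 16 rows repeats its left neighbour.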
function ff_pred16x16_hor_neon, export=1
        sub             r2,  r0,  #1
        mov             r3,  #16
1:      vld1.8          {d0[],d1[]}, [r2], r1
        vst1.8          {q0},        [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
        .endfunc

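@ Vertical prediction: the row above the block is copied into all 16 rows.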
function ff_pred16x16_vert_neon, export=1
        sub             r0,  r0,  r1
        vld1.8          {q0},     [r0,:128], r1
        mov             r3,  #8
1:      vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
        .endfunc

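@ Plane prediction (H.264 16x16 plane mode): a gradient is fitted to the top
@ and left neighbours using the weights 1..8 from p16weight, giving
@ b = (5*H + 32) >> 6 and c = (5*V + 32) >> 6; each output row of
@ clip((a + b*(x-7) + c*(y-7) + 16) >> 5) is then produced with running
@ vector adds and saturating narrows in the store loop.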
function ff_pred16x16_plane_neon, export=1
        sub             r3,  r0,  r1
        add             r2,  r3,  #8
        sub             r3,  r3,  #1
        vld1.8          {d0},     [r3]
        vld1.8          {d2},     [r2,:64], r1
        ldcol.8         d1,  r3,  r1
        add             r3,  r3,  r1
        ldcol.8         d3,  r3,  r1
        vrev64.8        q0,  q0
        vaddl.u8        q8,  d2,  d3
        vsubl.u8        q2,  d2,  d0
        vsubl.u8        q3,  d3,  d1
        movrel          r3,  p16weight
        vld1.8          {q0},     [r3,:128]
        vmul.s16        q2,  q2,  q0
        vmul.s16        q3,  q3,  q0
        vadd.i16        d4,  d4,  d5
        vadd.i16        d5,  d6,  d7
        vpadd.i16       d4,  d4,  d5
        vpadd.i16       d4,  d4,  d4
        vshl.i16        d5,  d4,  #2
        vaddl.s16       q2,  d4,  d5
        vrshrn.s32      d4,  q2,  #6
        mov             r3,  #0
        vtrn.16         d4,  d5
        vadd.i16        d2,  d4,  d5
        vshl.i16        d3,  d2,  #3
        vrev64.16       d16, d17
        vsub.i16        d3,  d3,  d2
        vadd.i16        d16, d16, d0
        vshl.i16        d2,  d16, #4
        vsub.i16        d2,  d2,  d3
        vshl.i16        d3,  d4,  #4
        vext.16         q0,  q0,  q0,  #7
        vsub.i16        d6,  d5,  d3
        vmov.16         d0[0], r3
        vmul.i16        q0,  q0,  d4[0]
        vdup.16         q1,  d2[0]
        vdup.16         q2,  d4[0]
        vdup.16         q3,  d6[0]
        vshl.i16        q2,  q2,  #3
        vadd.i16        q1,  q1,  q0
        vadd.i16        q3,  q3,  q2
        mov             r3,  #16
1:
        vqshrun.s16     d0,  q1,  #5
        vadd.i16        q1,  q1,  q2
        vqshrun.s16     d1,  q1,  #5
        vadd.i16        q1,  q1,  q3
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
        .endfunc

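@ Weight table 1..8 shared by both plane functions; .align 4 gives 16-byte
@ alignment, which the :128 aligned loads above require.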
        .section .rodata
        .align 4
p16weight:
        .short 1,2,3,4,5,6,7,8

        .text

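@ 8x8 prediction functions (used for chroma blocks in the H.264 decoder).
@ Horizontal: each of the 8 rows repeats its left neighbour.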
function ff_pred8x8_hor_neon, export=1
        sub             r2,  r0,  #1
        mov             r3,  #8
1:      vld1.8          {d0[]},   [r2], r1
        vst1.8          {d0},     [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
        .endfunc

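@ Vertical: the row above the block is copied into all 8 rows.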
function ff_pred8x8_vert_neon, export=1
        sub             r0,  r0,  r1
        vld1.8          {d0},     [r0,:64], r1
        mov             r3,  #4
1:      vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d0},     [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
        .endfunc

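@ 8x8 plane prediction, same scheme as the 16x16 version but with weights
@ 1..4 and b = (17*H + 16) >> 5, c = (17*V + 16) >> 5, as in the H.264
@ chroma plane mode.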
function ff_pred8x8_plane_neon, export=1
        sub             r3,  r0,  r1
        add             r2,  r3,  #4
        sub             r3,  r3,  #1
        vld1.32         {d0[0]},  [r3]
        vld1.32         {d2[0]},  [r2,:32], r1
        ldcol.8         d0,  r3,  r1,  4,  hi=1
        add             r3,  r3,  r1
        ldcol.8         d3,  r3,  r1,  4
        vaddl.u8        q8,  d2,  d3
        vrev32.8        d0,  d0
        vtrn.32         d2,  d3
        vsubl.u8        q2,  d2,  d0
        movrel          r3,  p16weight
        vld1.16         {q0},     [r3,:128]
        vmul.s16        d4,  d4,  d0
        vmul.s16        d5,  d5,  d0
        vpadd.i16       d4,  d4,  d5
        vpaddl.s16      d4,  d4
        vshl.i32        d5,  d4,  #4
        vadd.s32        d4,  d4,  d5
        vrshrn.s32      d4,  q2,  #5
        mov             r3,  #0
        vtrn.16         d4,  d5
        vadd.i16        d2,  d4,  d5
        vshl.i16        d3,  d2,  #2
        vrev64.16       d16, d16
        vsub.i16        d3,  d3,  d2
        vadd.i16        d16, d16, d0
        vshl.i16        d2,  d16, #4
        vsub.i16        d2,  d2,  d3
        vshl.i16        d3,  d4,  #3
        vext.16         q0,  q0,  q0,  #7
        vsub.i16        d6,  d5,  d3
        vmov.16         d0[0], r3
        vmul.i16        q0,  q0,  d4[0]
        vdup.16         q1,  d2[0]
        vdup.16         q2,  d4[0]
        vdup.16         q3,  d6[0]
        vshl.i16        q2,  q2,  #3
        vadd.i16        q1,  q1,  q0
        vadd.i16        q3,  q3,  q2
        mov             r3,  #8
1:
        vqshrun.s16     d0,  q1,  #5
        vadd.i16        q1,  q1,  q3
        vst1.8          {d0},     [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
        .endfunc

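@ 8x8 DC prediction.  Unlike the 16x16 case, the block is split into four
@ 4x4 quadrants, each with its own DC value; d0 holds the two DCs for the
@ top four rows and d1 the two for the bottom four, and all variants share
@ the store loop at .L_pred8x8_dc_end.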
function ff_pred8x8_128_dc_neon, export=1
        vmov.i8         q0,  #128
        b               .L_pred8x8_dc_end
        .endfunc

function ff_pred8x8_top_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {d0},     [r2,:64]
        vpaddl.u8       d0,  d0
        vpadd.u16       d0,  d0,  d0
        vrshrn.u16      d0,  q0,  #2
        vdup.8          d1,  d0[1]
        vdup.8          d0,  d0[0]
        vtrn.32         d0,  d1
        b               .L_pred8x8_dc_end
        .endfunc

function ff_pred8x8_left_dc_neon, export=1
        sub             r2,  r0,  #1
        ldcol.8         d0,  r2,  r1
        vpaddl.u8       d0,  d0
        vpadd.u16       d0,  d0,  d0
        vrshrn.u16      d0,  q0,  #2
        vdup.8          d1,  d0[1]
        vdup.8          d0,  d0[0]
        b               .L_pred8x8_dc_end
        .endfunc

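@ Full 8x8 DC: the top-left and bottom-right quadrants average both their
@ top and left neighbours (sum >> 3), while the top-right and bottom-left
@ quadrants use only their top or left neighbours respectively (sum >> 2).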
function ff_pred8x8_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {d0},     [r2,:64]
        sub             r2,  r0,  #1
        ldcol.8         d1,  r2,  r1
        vtrn.32         d0,  d1
        vpaddl.u8       q0,  q0
        vpadd.u16       d0,  d0,  d1
        vpadd.u16       d1,  d0,  d0
        vrshrn.u16      d2,  q0,  #3
        vrshrn.u16      d3,  q0,  #2
        vdup.8          d0,  d2[4]
        vdup.8          d1,  d3[3]
        vdup.8          d4,  d3[2]
        vdup.8          d5,  d2[5]
        vtrn.32         q0,  q2
.L_pred8x8_dc_end:
        mov             r3,  #4
        add             r2,  r0,  r1,  lsl #2
6:      vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d1},     [r2,:64], r1
        subs            r3,  r3,  #1
        bne             6b
        bx              lr
        .endfunc

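@ DC variants for partially available neighbours.  The suffix letters name
@ the neighbours that are used, in the order upper-left column / lower-left
@ column / top row, with 0 for an unused one (so l0t uses the top row plus
@ the upper four left pixels).  In l00 and 0l0 the half of the block with no
@ neighbour at all is filled with 128.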
function ff_pred8x8_l0t_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {d0},     [r2,:64]
        sub             r2,  r0,  #1
        ldcol.8         d1,  r2,  r1,  4
        vtrn.32         d0,  d1
        vpaddl.u8       q0,  q0
        vpadd.u16       d0,  d0,  d1
        vpadd.u16       d1,  d0,  d0
        vrshrn.u16      d2,  q0,  #3
        vrshrn.u16      d3,  q0,  #2
        vdup.8          d0,  d2[4]
        vdup.8          d1,  d3[0]
        vdup.8          q2,  d3[2]
        vtrn.32         q0,  q2
        b               .L_pred8x8_dc_end
        .endfunc

function ff_pred8x8_l00_dc_neon, export=1
        sub             r2,  r0,  #1
        ldcol.8         d0,  r2,  r1,  4
        vpaddl.u8       d0,  d0
        vpadd.u16       d0,  d0,  d0
        vrshrn.u16      d0,  q0,  #2
        vmov.i8         d1,  #128
        vdup.8          d0,  d0[0]
        b               .L_pred8x8_dc_end
        .endfunc

function ff_pred8x8_0lt_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {d0},     [r2,:64]
        add             r2,  r0,  r1,  lsl #2
        sub             r2,  r2,  #1
        ldcol.8         d1,  r2,  r1,  4,  hi=1
        vtrn.32         d0,  d1
        vpaddl.u8       q0,  q0
        vpadd.u16       d0,  d0,  d1
        vpadd.u16       d1,  d0,  d0
        vrshrn.u16      d3,  q0,  #2
        vrshrn.u16      d2,  q0,  #3
        vdup.8          d0,  d3[0]
        vdup.8          d1,  d3[3]
        vdup.8          d4,  d3[2]
        vdup.8          d5,  d2[5]
        vtrn.32         q0,  q2
        b               .L_pred8x8_dc_end
        .endfunc

function ff_pred8x8_0l0_dc_neon, export=1
        add             r2,  r0,  r1,  lsl #2
        sub             r2,  r2,  #1
        ldcol.8         d1,  r2,  r1,  4
        vpaddl.u8       d2,  d1
        vpadd.u16       d2,  d2,  d2
        vrshrn.u16      d1,  q1,  #2
        vmov.i8         d0,  #128
        vdup.8          d1,  d1[0]
        b               .L_pred8x8_dc_end
        .endfunc