armv4l/h264dsp_neon.S @ 8336:c8401acb05d1 (libavcodec)

ARM: NEON optimised {put,avg}_h264_chroma_mc[48]
author mru
date Mon, 15 Dec 2008 22:12:41 +0000
/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        .fpu neon

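/*
 * Reference model of the operation implemented below, added here only as an
 * illustrative C sketch (the function name and the W parameter are not part
 * of FFmpeg's API).  Both macros compute the H.264 chroma sample
 * interpolation with 6-bit rounding, W being the block width (8 or 4):
 *
 *     static void chroma_mc_ref(uint8_t *dst, uint8_t *src,
 *                               int stride, int h, int x, int y, int W)
 *     {
 *         const int A = (8 - x) * (8 - y);
 *         const int B =      x  * (8 - y);
 *         const int C = (8 - x) *      y;
 *         const int D =      x  *      y;
 *         int i, j;
 *
 *         for (j = 0; j < h; j++) {
 *             for (i = 0; i < W; i++)
 *                 dst[i] = (A * src[i]          + B * src[i + 1] +
 *                           C * src[i + stride] + D * src[i + stride + 1] +
 *                           32) >> 6;
 *             dst += stride;
 *             src += stride;
 *         }
 *     }
 *
 * The avg variants additionally merge the result with the bytes already in
 * dst using a rounding average (vrhadd.u8).
 */
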
/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
.macro h264_chroma_mc8 avg=0
        push            {r4-r7, lr}
        ldrd            r4, [sp, #20]
.if \avg
        mov             lr, r0
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7, r4, r5
        rsb             r6, r7, r5, lsl #3
        rsb             ip, r7, r4, lsl #3
        sub             r4, r7, r4, lsl #3
        sub             r4, r4, r5, lsl #3
        add             r4, r4, #64
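        /* r4 = A = (8-x)(8-y), ip = B = x(8-y), r6 = C = (8-x)y, r7 = D = xy
           (see the reference model above); muls left Z set if D == 0, i.e.
           x == 0 or y == 0, which selects the one-dimensional paths at 2:. */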

        beq             2f

        add             r5, r1, r2

        vdup.8          d0, r4
        lsl             r4, r2, #1
        vdup.8          d1, ip
        vld1.64         {d4, d5}, [r1], r4
        vdup.8          d2, r6
        vld1.64         {d6, d7}, [r5], r4
        vdup.8          d3, r7

        vext.8          d5, d4, d5, #1
        vext.8          d7, d6, d7, #1

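        /* 2-D filter: two output rows per iteration.  d4/d6 hold consecutive
           source rows and d5/d7 the same rows shifted left by one pixel, so
           each vmull/vmlal applies one tap across a full 8-pixel row; with
           \avg the current dst rows are loaded and merged in via vrhadd.u8. */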
1:      pld             [r5]
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d5, d1
        vld1.64         {d4, d5}, [r1], r4
        vmlal.u8        q8, d6, d2
        vext.8          d5, d4, d5, #1
        vmlal.u8        q8, d7, d3
        vmull.u8        q9, d6, d0
        subs            r3, r3, #2
        vmlal.u8        q9, d7, d1
        vmlal.u8        q9, d4, d2
        vmlal.u8        q9, d5, d3
        vrshrn.u16      d16, q8, #6
        vld1.64         {d6, d7}, [r5], r4
        pld             [r1]
        vrshrn.u16      d17, q9, #6
.if \avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8, q8, q10
.endif
        vext.8          d7, d6, d7, #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

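        /* D == 0: only one filter direction is active, so fold B and C into
           a single second tap (ip += r6).  If C is also 0 (y == 0), branch
           to the horizontal-only loop at 4:, otherwise fall through to the
           vertical-only loop at 3:. */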
2:      tst             r6, r6
        add             ip, ip, r6
        vdup.8          d0, r4
        vdup.8          d1, ip

        beq             4f

        add             r5, r1, r2
        lsl             r4, r2, #1
        vld1.64         {d4}, [r1], r4
        vld1.64         {d6}, [r5], r4

3:      pld             [r5]
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d6, d1
        vld1.64         {d4}, [r1], r4
        vmull.u8        q9, d6, d0
        vmlal.u8        q9, d4, d1
        vld1.64         {d6}, [r5], r4
        vrshrn.u16      d16, q8, #6
        vrshrn.u16      d17, q9, #6
.if \avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8, q8, q10
.endif
        subs            r3, r3, #2
        pld             [r1]
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

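        /* y == 0 (or x == y == 0): horizontal-only filter with taps
           d0 = 64-8*x and d1 = 8*x; when x is also 0 this reduces to a
           straight copy. */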
4:      vld1.64         {d4, d5}, [r1], r2
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d5, d4, d5, #1
        vext.8          d7, d6, d7, #1

5:      pld             [r1]
        subs            r3, r3, #2
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d5, d1
        vld1.64         {d4, d5}, [r1], r2
        vmull.u8        q9, d6, d0
        vmlal.u8        q9, d7, d1
        pld             [r1]
        vext.8          d5, d4, d5, #1
        vrshrn.u16      d16, q8, #6
        vrshrn.u16      d17, q9, #6
.if \avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8, q8, q10
.endif
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d7, d6, d7, #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
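/* The 4-pixel-wide variant packs two filter taps into each d register so the
   multiplies still fill a whole 8x16-bit vector: source rows are arranged as
   {row[0..3], row[1..4]} via vext/vtrn.32, the weights as {A,A,A,A,B,B,B,B}
   and {C,C,C,C,D,D,D,D}, and the two halves of each product vector are then
   folded together with vadd.i16 before narrowing. */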
.macro h264_chroma_mc4 avg=0
        push            {r4-r7, lr}
        ldrd            r4, [sp, #20]
.if \avg
        mov             lr, r0
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7, r4, r5
        rsb             r6, r7, r5, lsl #3
        rsb             ip, r7, r4, lsl #3
        sub             r4, r7, r4, lsl #3
        sub             r4, r4, r5, lsl #3
        add             r4, r4, #64
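        /* Same weight setup as in h264_chroma_mc8 above:
           r4 = A, ip = B, r6 = C, r7 = D, Z set if D == 0. */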

        beq             2f

        add             r5, r1, r2

        vdup.8          d0, r4
        lsl             r4, r2, #1
        vdup.8          d1, ip
        vld1.64         {d4}, [r1], r4
        vdup.8          d2, r6
        vld1.64         {d6}, [r5], r4
        vdup.8          d3, r7

        vext.8          d5, d4, d5, #1
        vext.8          d7, d6, d7, #1
        vtrn.32         d4, d5
        vtrn.32         d6, d7

        vtrn.32         d0, d1
        vtrn.32         d2, d3
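        /* Packed register layout for the 4-wide 2-D filter:
           d0 = {A,A,A,A,B,B,B,B}, d2 = {C,C,C,C,D,D,D,D},
           d4 = {row n [0..3],   row n [1..4]},
           d6 = {row n+1 [0..3], row n+1 [1..4]}. */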

1:      pld             [r5]
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d6, d2
        vld1.64         {d4}, [r1], r4
        vext.8          d5, d4, d5, #1
        vtrn.32         d4, d5
        vmull.u8        q9, d6, d0
        vmlal.u8        q9, d4, d2
        vld1.64         {d6}, [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8, #6
        subs            r3, r3, #2
        pld             [r1]
.if \avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vext.8          d7, d6, d7, #1
        vtrn.32         d6, d7
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

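        /* D == 0: one-dimensional cases, analogous to the mc8 paths above;
           the two remaining taps are packed into d0 (and d1) with vtrn.32. */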
2:      tst             r6, r6
        add             ip, ip, r6
        vdup.8          d0, r4
        vdup.8          d1, ip
        vtrn.32         d0, d1

        beq             4f

        vext.32         d1, d0, d1, #1
        add             r5, r1, r2
        lsl             r4, r2, #1
        vld1.32         {d4[0]}, [r1], r4
        vld1.32         {d4[1]}, [r5], r4

3:      pld             [r5]
        vmull.u8        q8, d4, d0
        vld1.32         {d4[0]}, [r1], r4
        vmull.u8        q9, d4, d1
        vld1.32         {d4[1]}, [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8, #6
.if \avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        subs            r3, r3, #2
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

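        /* y == 0 (or x == y == 0): horizontal-only filter on packed rows. */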
4:      vld1.64         {d4}, [r1], r2
        vld1.64         {d6}, [r1], r2
        vext.8          d5, d4, d5, #1
        vext.8          d7, d6, d7, #1
        vtrn.32         d4, d5
        vtrn.32         d6, d7

5:      vmull.u8        q8, d4, d0
        vmull.u8        q9, d6, d0
        subs            r3, r3, #2
        vld1.64         {d4}, [r1], r2
        vext.8          d5, d4, d5, #1
        vtrn.32         d4, d5
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
        vrshrn.u16      d16, q8, #6
.if \avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vld1.64         {d6}, [r1], r2
        vext.8          d7, d6, d7, #1
        vtrn.32         d6, d7
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
.endm

        .text
        .align

function ff_put_h264_chroma_mc8_neon, export=1
        h264_chroma_mc8
        .endfunc

function ff_avg_h264_chroma_mc8_neon, export=1
        h264_chroma_mc8 avg=1
        .endfunc

function ff_put_h264_chroma_mc4_neon, export=1
        h264_chroma_mc4
        .endfunc

function ff_avg_h264_chroma_mc4_neon, export=1
        h264_chroma_mc4 avg=1
        .endfunc