Mercurial > libavcodec.hg

comparison of armv4l/h264dsp_neon.S @ 8336:c8401acb05d1 (libavcodec)

ARM: NEON optimised {put,avg}_h264_chroma_mc[48]

author   | mru                             |
date     | Mon, 15 Dec 2008 22:12:41 +0000 |
parents  |                                 |
children | d43b7f4c5c1c                    |
comparing 8335:f19fe0cade86 with 8336:c8401acb05d1
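The new macros implement H.264 chroma motion compensation: a bilinear filter over a 2x2 source neighbourhood with eighth-pel fractional offsets x and y. As a rough plain-C sketch of what each output pixel computes (the name chroma_mc_ref and the combined w/avg parameters are illustrative only, not FFmpeg's C code; the real entry points are the four ff_{put,avg}_h264_chroma_mc{8,4}_neon functions defined at the end of the file):

    #include <stdint.h>

    /* Reference sketch only (hypothetical helper, not FFmpeg's implementation).
     * x and y are the fractional chroma offsets in 1/8-pel units (0..7);
     * avg selects the put (overwrite) or avg (rounding average) variant. */
    static void chroma_mc_ref(uint8_t *dst, const uint8_t *src, int stride,
                              int w, int h, int x, int y, int avg)
    {
        const int a = (8 - x) * (8 - y);   /* weight of src[j]              */
        const int b =      x  * (8 - y);   /* weight of src[j + 1]          */
        const int c = (8 - x) *      y;    /* weight of src[j + stride]     */
        const int d =      x  *      y;    /* weight of src[j + stride + 1] */

        for (int i = 0; i < h; i++) {
            for (int j = 0; j < w; j++) {
                int p = (a * src[j]          + b * src[j + 1] +
                         c * src[j + stride] + d * src[j + stride + 1] + 32) >> 6;
                dst[j] = avg ? (dst[j] + p + 1) >> 1 : p;
            }
            dst += stride;
            src += stride;
        }
    }

The NEON macros below specialise this for w = 8 and w = 4, keep the four weights broadcast in d0-d3, process two output rows per loop iteration, and fall back to cheaper one-dimensional paths when x or y is zero.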
/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        .fpu neon

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
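/*
 * Bilinear chroma filter: with A, B the two pixels of the current source
 * row and C, D the pixels directly below them, each output byte is
 *   (A*(8-x)*(8-y) + B*x*(8-y) + C*(8-x)*y + D*x*y + 32) >> 6.
 * The integer setup below produces exactly those four weights:
 *   r4/d0 = (8-x)*(8-y), ip/d1 = x*(8-y), r6/d2 = (8-x)*y, r7/d3 = x*y.
 * When x*y == 0 the code branches to cheaper one-dimensional loops:
 * label 3 filters vertically only (x == 0), label 5 horizontally only
 * (y == 0, or a plain rounded copy when both offsets are zero).
 */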
.macro  h264_chroma_mc8 avg=0
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
.if \avg
        mov             lr,  r0
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7,  r4,  r5
        rsb             r6,  r7,  r5,  lsl #3
        rsb             ip,  r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  ip
        vld1.64         {d4, d5}, [r1], r4
        vdup.8          d2,  r6
        vld1.64         {d6, d7}, [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.64         {d4, d5}, [r1], r4
        vmlal.u8        q8,  d6,  d2
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        vrshrn.u16      d16, q8,  #6
        vld1.64         {d6, d7}, [r5], r4
        pld             [r1]
        vrshrn.u16      d17, q9,  #6
.if \avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        vext.8          d7,  d6,  d7,  #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             ip,  ip,  r6
        vdup.8          d0,  r4
        vdup.8          d1,  ip

        beq             4f

        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.64         {d4}, [r1], r4
        vld1.64         {d6}, [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.64         {d4}, [r1], r4
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        vld1.64         {d6}, [r5], r4
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.if \avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.64         {d4, d5}, [r1], r2
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

5:      pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.64         {d4, d5}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.if \avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
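/*
 * The 4-pixel-wide version packs two pieces of work into each 64-bit
 * register: vtrn.32 d4, d5 puts a source row and its one-pixel-shifted
 * copy into the two halves of d4, and vtrn.32 d0, d1 / d2, d3 arranges
 * the weights to match, so a single vmull/vmlal pair accumulates all
 * four taps for one output row and the following vadd.i16 folds the two
 * halves into the four output pixels.
 */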
.macro  h264_chroma_mc4 avg=0
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
.if \avg
        mov             lr,  r0
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7,  r4,  r5
        rsb             r6,  r7,  r5,  lsl #3
        rsb             ip,  r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  ip
        vld1.64         {d4}, [r1], r4
        vdup.8          d2,  r6
        vld1.64         {d6}, [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

        vtrn.32         d0,  d1
        vtrn.32         d2,  d3

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.64         {d4}, [r1], r4
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vld1.64         {d6}, [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
        subs            r3,  r3,  #2
        pld             [r1]
.if \avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             ip,  ip,  r6
        vdup.8          d0,  r4
        vdup.8          d1,  ip
        vtrn.32         d0,  d1

        beq             4f

        vext.32         d1,  d0,  d1,  #1
        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.32         {d4[0]}, [r1], r4
        vld1.32         {d4[1]}, [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]}, [r1], r4
        vmull.u8        q9,  d4,  d1
        vld1.32         {d4[1]}, [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
.if \avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.64         {d4}, [r1], r2
        vld1.64         {d6}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

5:      vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vld1.64         {d4}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
        vrshrn.u16      d16, q8,  #6
.if \avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vld1.64         {d6}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
.endm

        .text
        .align

function ff_put_h264_chroma_mc8_neon, export=1
        h264_chroma_mc8
        .endfunc

function ff_avg_h264_chroma_mc8_neon, export=1
        h264_chroma_mc8 avg=1
        .endfunc

function ff_put_h264_chroma_mc4_neon, export=1
        h264_chroma_mc4
        .endfunc

function ff_avg_h264_chroma_mc4_neon, export=1
        h264_chroma_mc4 avg=1
        .endfunc