Mercurial > libavcodec.hg
annotate arm/dsputil_neon_s.S @ 10266:8695a5f212fc libavcodec
Factorize duplicated code in at1_imdct_block()
author | vitor |
---|---|
date | Thu, 24 Sep 2009 21:24:58 +0000 |
parents | 64dd9515b93b |
children | bcf5c5551b3c |
rev | line source |
---|---|
8334 | 1 /* |
2 * ARM NEON optimised DSP functions | |
3 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> | |
4 * | |
5 * This file is part of FFmpeg. | |
6 * | |
7 * FFmpeg is free software; you can redistribute it and/or | |
8 * modify it under the terms of the GNU Lesser General Public | |
9 * License as published by the Free Software Foundation; either | |
10 * version 2.1 of the License, or (at your option) any later version. | |
11 * | |
12 * FFmpeg is distributed in the hope that it will be useful, | |
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 * Lesser General Public License for more details. | |
16 * | |
17 * You should have received a copy of the GNU Lesser General Public | |
18 * License along with FFmpeg; if not, write to the Free Software | |
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 */ | |
21 | |
10046 | 22 #include "config.h" |
8334 | 23 #include "asm.S" |
24 | |
25 preserve8 | |
26 .fpu neon | |
27 .text | |
28 | |
@ pixels16: copy a 16-byte-wide block, 4 rows per iteration.
@ With avg=1, rounding-average the new rows into the existing dst pixels
@ (vrhadd.u8), as used by avg_pixels16.
@ In:  r0 = dst (16-byte aligned), r1 = src, r2 = line stride,
@      r3 = row count (assumes a multiple of 4 — loop subtracts 4 per pass)
.macro pixels16 avg=0
.if \avg
        mov             ip,  r0                 @ ip = second walk over dst for the averaging loads
.endif
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
        vld1.64         {d4, d5},  [r1], r2
        pld             [r1, r2, lsl #2]        @ prefetch rows ahead of the loads below
        vld1.64         {d6, d7},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        @ Load current dst rows (:128 alignment guaranteed) and average in.
        vld1.64         {d16,d17}, [ip,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.64         {d18,d19}, [ip,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.64         {d20,d21}, [ip,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.64         {d22,d23}, [ip,:128], r2
        vrhadd.u8       q3,  q3,  q11
.endif
        subs            r3,  r3,  #4            @ 4 rows consumed per iteration
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d2, d3},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
.endm
59 | |
@ pixels16_x2: 16-wide half-pel interpolation in x — each output byte is the
@ (rounded, or with vhadd.u8 truncated) average of src[i] and src[i+1].
@ In:  r0 = dst (:128 aligned), r1 = src, r2 = stride,
@      r3 = row count (even — 2 rows per iteration)
.macro pixels16_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0-d2},   [r1], r2     @ 24 bytes: 16 pixels + the x+1 tail
        vld1.64         {d4-d6},   [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1       @ q1 = row shifted left by one byte
        \vhadd          q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        \vhadd          q2,  q2,  q3
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        bne             1b
        bx              lr
.endm
75 | |
@ pixels16_y2: 16-wide half-pel interpolation in y — each output row is the
@ average of the row above and below.  The previous row is kept live in
@ q0/q1 across iterations (software-pipelined: one row loaded ahead).
@ In:  r0 = dst (:128 aligned), r1 = src, r2 = stride, r3 = row count (even)
.macro pixels16_y2 vhadd=vrhadd.u8
        vld1.64         {d0, d1},  [r1], r2     @ prime the pipeline: rows 0 and 1
        vld1.64         {d2, d3},  [r1], r2
1:      subs            r3,  r3,  #2
        \vhadd          q2,  q0,  q1            @ out row n   = avg(row n, row n+1)
        vld1.64         {d0, d1},  [r1], r2
        \vhadd          q3,  q0,  q1            @ out row n+1 = avg(row n+1, row n+2)
        vld1.64         {d2, d3},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
.endm
91 | |
@ pixels16_xy2: 16-wide half-pel interpolation in both x and y — each output
@ byte is (a+b+c+d)/4 over the 2x2 neighbourhood, computed as 16-bit
@ horizontal sums (vaddl.u8) of adjacent rows, added and narrowed with
@ \vshrn #2.  no_rnd=1 adds a +1 bias (q13) before the truncating shift so
@ that vshrn matches the "no rounding" variant's semantics.
@ In:  r0 = dst (:128 aligned), r1 = src, r2 = stride, r3 = row count (even)
@ Pipeline: q8/q10 hold the horizontal sums of the previous row,
@ q9/q11 those of the current one; two rows retired per loop iteration.
.macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
.if \no_rnd
        vmov.i16        q13, #1                 @ rounding bias for the truncating variant
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1       @ row 0 shifted by one (x+1)
        vext.8          q3,  q2,  q3,  #1       @ row 1 shifted by one
        vaddl.u8        q8,  d0,  d2            @ horizontal pair sums, row 0 (lo)
        vaddl.u8        q10, d1,  d3            @ row 0 (hi)
        vaddl.u8        q9,  d4,  d6            @ row 1 (lo)
        vaddl.u8        q11, d5,  d7            @ row 1 (hi)
1:      subs            r3,  r3,  #2
        vld1.64         {d0-d2},   [r1], r2
        vadd.u16        q12, q8,  q9            @ row n + row n+1 (lo halves)
        pld             [r1]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1 , q10, q11           @ (hi halves)
        \vshrn          d28, q12, #2            @ /4 and narrow back to bytes
.if \no_rnd
        vadd.u16        q1,  q1,  q13
.endif
        \vshrn          d29, q1,  #2
        vaddl.u8        q8,  d0,  d30           @ new row's horizontal sums (lo)
        vld1.64         {d2-d4},   [r1], r2
        vaddl.u8        q10, d1,  d31           @ (hi)
        vst1.64         {d28,d29}, [r0,:128], r2
        vadd.u16        q12, q8,  q9
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        pld             [r1, r2]
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        \vshrn          d30, q12, #2
.if \no_rnd
        vadd.u16        q0,  q0,  q13
.endif
        \vshrn          d31, q0,  #2
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.64         {d30,d31}, [r0,:128], r2
        bgt             1b
        bx              lr
.endm
142 | |
@ pixels8: plain copy of an 8-byte-wide block, 4 rows per iteration.
@ In:  r0 = dst (8-byte aligned), r1 = src, r2 = stride,
@      r3 = row count (multiple of 4)
.macro pixels8
1:      vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
        subs            r3,  r3,  #4
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        bne             1b
        bx              lr
.endm
160 | |
@ pixels8_x2: 8-wide half-pel interpolation in x.  Two rows are packed into
@ one q register (vswp d1,d2 lines up row/row+1 pairs) so a single \vhadd
@ averages src[i] with src[i+1] for both rows at once.
@ In:  r0 = dst (:64 aligned), r1 = src, r2 = stride, r3 = row count (even)
.macro pixels8_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0, d1},  [r1], r2
        vext.8          d1,  d0,  d1,  #1       @ d1 = row 0 shifted by one byte
        vld1.64         {d2, d3},  [r1], r2
        vext.8          d3,  d2,  d3,  #1       @ d3 = row 1 shifted by one byte
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2                 @ q0 = {row0, row1}, q1 = {row0+1, row1+1}
        \vhadd          q0,  q0,  q1
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        bne             1b
        bx              lr
.endm
176 | |
@ pixels8_y2: 8-wide half-pel interpolation in y; previous row kept live in
@ d0/d1 across iterations (one row loaded ahead of use).
@ In:  r0 = dst (:64 aligned), r1 = src, r2 = stride, r3 = row count (even)
.macro pixels8_y2 vhadd=vrhadd.u8
        vld1.64         {d0}, [r1], r2          @ prime: rows 0 and 1
        vld1.64         {d1}, [r1], r2
1:      subs            r3,  r3,  #2
        \vhadd          d4,  d0,  d1            @ out row n   = avg(row n, row n+1)
        vld1.64         {d0}, [r1], r2
        \vhadd          d5,  d0,  d1            @ out row n+1 = avg(row n+1, row n+2)
        vld1.64         {d1}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4}, [r0,:64], r2
        vst1.64         {d5}, [r0,:64], r2
        bne             1b
        bx              lr
.endm
192 | |
@ pixels8_xy2: 8-wide half-pel interpolation in both x and y; 2x2
@ neighbourhood mean via widened horizontal sums (vaddl.u8) narrowed with
@ \vshrn #2.  no_rnd=1 adds a +1 bias (q11) before the truncating shift.
@ In:  r0 = dst (:64 aligned), r1 = src, r2 = stride, r3 = row count (even)
@ Pipeline: q8 = horizontal sums of the newest row, q9 = the row before it.
.macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
.if \no_rnd
        vmov.i16        q11, #1                 @ bias for the no-rounding variant
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1       @ row 0 shifted by one (x+1)
        vext.8          d6,  d2,  d3,  #1       @ row 1 shifted by one
        vaddl.u8        q8,  d0,  d4            @ horizontal pair sums, row 0
        vaddl.u8        q9,  d2,  d6            @ row 1
1:      subs            r3,  r3,  #2
        vld1.64         {d0, d1},  [r1], r2
        pld             [r1]
        vadd.u16        q10, q8,  q9            @ vertical sum of two rows' pair sums
        vext.8          d4,  d0,  d1,  #1
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vaddl.u8        q8,  d0,  d4
        \vshrn          d5,  q10, #2            @ /4, narrow to bytes
        vld1.64         {d2, d3},  [r1], r2
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vst1.64         {d5}, [r0,:64], r2
        \vshrn          d7,  q10, #2
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.64         {d7}, [r0,:64], r2
        bgt             1b
        bx              lr
.endm
229 | |
@ pixfunc: emit one exported function ff_<pfx><name><suf>_neon whose body is
@ the expansion of the \name macro above, parameterised by \rnd_op/\args
@ (e.g. the truncating vhadd for the _no_rnd variants, or avg=1).
.macro pixfunc pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd_op \args
        .endfunc
.endm
235 | |
@ pixfunc2: emit both variants of a pixel op — the default (rounding) one
@ and the _no_rnd one built from the extra \args.
.macro pixfunc2 pfx name args:vararg
        pixfunc         \pfx \name
        pixfunc         \pfx \name \args
.endm
240 | |
@ H.264 qpel mc00 (full-pel copy/avg) entry points.  Each stub only sets the
@ row count in r3 and then FALLS THROUGH into the pixfunc-generated function
@ that immediately follows it — the missing branch is intentional, so the
@ ordering of these stubs relative to the pixfunc lines below must not change.
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3, #16                 @ 16 rows, then fall through to put_pixels16
        .endfunc

        pixfunc         put_ pixels16
        pixfunc2        put_ pixels16_x2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels16_y2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels16_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3, #16                 @ fall through to avg_pixels16
        .endfunc

        pixfunc         avg_ pixels16,, 1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3, #8                  @ fall through to put_pixels8
        .endfunc

        pixfunc         put_ pixels8
        pixfunc2        put_ pixels8_x2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels8_y2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels8_xy2, _no_rnd, vshrn.u16, 1
8492 | 264 |
@ ff_put_pixels_clamped_neon: clamp an 8x8 block of int16 coefficients to
@ unsigned bytes (vqmovun.s16) and store row by row.
@ In:  r0 = int16 block (:128 aligned, read 128 bytes), r1 = dst pixels
@      (:64 aligned), r2 = dst line stride
function ff_put_pixels_clamped_neon, export=1
        vld1.64         {d16-d19}, [r0,:128]!
        vqmovun.s16     d0, q8
        vld1.64         {d20-d23}, [r0,:128]!
        vqmovun.s16     d1, q9
        vld1.64         {d24-d27}, [r0,:128]!
        vqmovun.s16     d2, q10
        vld1.64         {d28-d31}, [r0,:128]!
        vqmovun.s16     d3, q11
        vst1.64         {d0}, [r1,:64], r2      @ stores interleaved with the remaining narrows
        vqmovun.s16     d4, q12
        vst1.64         {d1}, [r1,:64], r2
        vqmovun.s16     d5, q13
        vst1.64         {d2}, [r1,:64], r2
        vqmovun.s16     d6, q14
        vst1.64         {d3}, [r1,:64], r2
        vqmovun.s16     d7, q15
        vst1.64         {d4}, [r1,:64], r2
        vst1.64         {d5}, [r1,:64], r2
        vst1.64         {d6}, [r1,:64], r2
        vst1.64         {d7}, [r1,:64], r2
        bx              lr
        .endfunc
288 | |
@ ff_put_signed_pixels_clamped_neon: clamp int16 coefficients to signed
@ bytes (vqmovn.s16), then add 128 (d31) to re-bias into unsigned pixel
@ range, storing 8 rows of 8.
@ In:  r0 = int16 block (:128 aligned), r1 = dst (:64 aligned), r2 = stride
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128               @ signed->unsigned bias
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0, q8
        vld1.64         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1, q9
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2, q8
        vld1.64         {d18-d19}, [r0,:128]!
        vadd.u8         d0, d0, d31
        vld1.64         {d20-d21}, [r0,:128]!
        vadd.u8         d1, d1, d31
        vld1.64         {d22-d23}, [r0,:128]!
        vadd.u8         d2, d2, d31
        vst1.64         {d0}, [r1,:64], r2
        vqmovn.s16      d3, q9
        vst1.64         {d1}, [r1,:64], r2
        vqmovn.s16      d4, q10
        vst1.64         {d2}, [r1,:64], r2
        vqmovn.s16      d5, q11
        vld1.64         {d24-d25}, [r0,:128]!
        vadd.u8         d3, d3, d31
        vld1.64         {d26-d27}, [r0,:128]!
        vadd.u8         d4, d4, d31
        vadd.u8         d5, d5, d31
        vst1.64         {d3}, [r1,:64], r2
        vqmovn.s16      d6, q12
        vst1.64         {d4}, [r1,:64], r2
        vqmovn.s16      d7, q13
        vst1.64         {d5}, [r1,:64], r2
        vadd.u8         d6, d6, d31
        vadd.u8         d7, d7, d31
        vst1.64         {d6}, [r1,:64], r2
        vst1.64         {d7}, [r1,:64], r2
        bx              lr
        .endfunc
325 | |
@ ff_add_pixels_clamped_neon: add an 8x8 int16 coefficient block to the
@ existing 8x8 pixel block (widening add vaddw.u8), clamp to unsigned bytes
@ and store back.  Loads and stores of the two 4-row halves are interleaved
@ for pipelining; r3 walks dst for the writes while r1 walks it for reads.
@ In:  r0 = int16 block (:128 aligned), r1 = pixels (:64 aligned), r2 = stride
function ff_add_pixels_clamped_neon, export=1
        mov             r3,  r1                 @ r3 = write pointer over the same pixel rows
        vld1.64         {d16},   [r1,:64], r2
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0,  q0,  d16
        vld1.64         {d17},   [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vqmovun.s16     d0,  q0
        vld1.64         {d18},   [r1,:64], r2
        vaddw.u8        q1,  q1,  d17
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2,  q2,  d18
        vst1.64         {d0},    [r3,:64], r2
        vqmovun.s16     d2,  q1
        vld1.64         {d19},   [r1,:64], r2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3,  q3,  d19
        vqmovun.s16     d4,  q2
        vst1.64         {d2},    [r3,:64], r2
        vld1.64         {d16},   [r1,:64], r2
        vqmovun.s16     d6,  q3
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0,  q0,  d16
        vst1.64         {d4},    [r3,:64], r2
        vld1.64         {d17},   [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vaddw.u8        q1,  q1,  d17
        vst1.64         {d6},    [r3,:64], r2
        vqmovun.s16     d0,  q0
        vld1.64         {d18},   [r1,:64], r2
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2,  q2,  d18
        vst1.64         {d0},    [r3,:64], r2
        vqmovun.s16     d2,  q1
        vld1.64         {d19},   [r1,:64], r2
        vqmovun.s16     d4,  q2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3,  q3,  d19
        vst1.64         {d2},    [r3,:64], r2
        vqmovun.s16     d6,  q3
        vst1.64         {d4},    [r3,:64], r2
        vst1.64         {d6},    [r3,:64], r2
        bx              lr
        .endfunc
370 | |
@ ff_float_to_int16_neon: convert floats to int16 via fixed-point convert
@ vcvt.s32.f32 #16 followed by vshrn #16 (i.e. scale and saturating-free
@ narrow).  Main loop handles 16 samples per pass; the 2:/3: tails mop up
@ the remaining 8-sample groups.
@ In:  r0 = int16 dst (:128 aligned), r1 = float src (:128 aligned),
@      r2 = sample count (assumes a multiple of 8 — TODO confirm vs callers)
function ff_float_to_int16_neon, export=1
        subs            r2,  r2,  #8
        vld1.64         {d0-d1},  [r1,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        beq             3f                      @ exactly 8 samples: just narrow and store
        bics            ip,  r2,  #15
        beq             2f                      @ fewer than 16 left: single-tail path
1:      subs            ip,  ip,  #16
        vshrn.s32       d4,  q8,  #16
        vld1.64         {d0-d1},  [r1,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vshrn.s32       d5,  q9,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vld1.64         {d16-d17},[r1,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r1,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6-d7},  [r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vld1.64         {d0-d1},  [r1,:128]!    @ tail: one extra 8-sample group pending in q8/q9
        vshrn.s32       d4,  q8,  #16
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vshrn.s32       d5,  q9,  #16
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vst1.64         {d6-d7},  [r0,:128]!
        bx              lr
3:      vshrn.s32       d4,  q8,  #16           @ flush the last pipelined group
        vshrn.s32       d5,  q9,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        bx              lr
        .endfunc
414 | |
@ ff_float_to_int16_interleave_neon: convert per-channel float buffers to
@ interleaved int16 output.
@ In:  r0 = int16 dst, r1 = array of float* channel pointers,
@      r2 = samples per channel, r3 = channel count
@ Dispatch: 1 channel tail-calls ff_float_to_int16_neon; exactly 2 channels
@ use the vsri-based fast path below; otherwise fall to label 4: which
@ handles 4-at-a-time, then 2-at-a-time, then a final single channel.
@ vsri.32 dst,src,#16 merges two converted channels into one register so
@ pairs can be stored interleaved; ip = r3*2 = output stride in bytes for
@ the lane-wise stores.
function ff_float_to_int16_interleave_neon, export=1
        cmp             r3,  #2
        ldrlt           r1,  [r1]               @ 1 channel: unwrap the pointer array...
        blt             ff_float_to_int16_neon  @ ...and tail-call the plain converter
        bne             4f

        @ --- exactly 2 channels, contiguous interleave ---
        ldr             r3,  [r1]               @ r3 = channel 0, r1 = channel 1
        ldr             r1,  [r1, #4]

        subs            r2,  r2,  #8
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q10, q8,  #16           @ merge ch0 into ch1's high halves -> interleaved pairs
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vld1.64         {d26-d27},[r1,:128]!
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d21},[r0,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q12, q0,  #16
        vld1.64         {d16-d17},[r3,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d25},[r0,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r3,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d26-d27},[r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vsri.32         q10, q8,  #16           @ tail: one extra 8-sample group in flight
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vsri.32         q11, q9,  #16
        vld1.64         {d26-d27},[r1,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d20-d21},[r0,:128]!
        vsri.32         q12, q0,  #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d27},[r0,:128]!
        bx              lr
3:      vsri.32         q10, q8,  #16           @ flush final pipelined group
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d23},[r0,:128]!
        bx              lr

4:      push            {r4-r8,lr}
        cmp             r3,  #4
        lsl             ip,  r3,  #1            @ ip = channels*2 = interleaved output stride
        blt             4f

        @ 4 channels
5:      ldmia           r1!, {r4-r7}            @ next four channel pointers
        mov             lr,  r2                 @ lr = samples remaining in this group
        mov             r8,  r0                 @ r8 = output cursor for this group
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr,  lr,  #8
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q9,  q8,  #16           @ pair up ch0/ch1
        vld1.64         {d2-d3},  [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         q11, q10, #16           @ pair up ch2/ch3
        vld1.64         {d4-d5},  [r6,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vzip.32         d18, d22                @ interleave the pairs into 4-channel frames
        vld1.64         {d6-d7},  [r7,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vzip.32         d19, d23
        vst1.64         {d18}, [r8], ip
        vsri.32         q1,  q0,  #16
        vst1.64         {d22}, [r8], ip
        vsri.32         q3,  q2,  #16
        vst1.64         {d19}, [r8], ip
        vzip.32         d2,  d6
        vst1.64         {d23}, [r8], ip
        vzip.32         d3,  d7
        beq             7f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.64         {d2},  [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6},  [r8], ip
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.64         {d3},  [r8], ip
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d7},  [r8], ip
        b               6b
7:      vst1.64         {d2},  [r8], ip         @ flush last frames of this 4-channel group
        vst1.64         {d6},  [r8], ip
        vst1.64         {d3},  [r8], ip
        vst1.64         {d7},  [r8], ip
        subs            r3,  r3,  #4
        popeq           {r4-r8,pc}
        cmp             r3,  #4
        add             r0,  r0,  #8            @ advance dst past the 4 channels just written
        bge             5b

        @ 2 channels
4:      cmp             r3,  #2
        blt             4f
        ldmia           r1!, {r4-r5}
        mov             lr,  r2
        mov             r8,  r0
        tst             lr,  #8                 @ odd 8-sample group? handled at 8:/7: below
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             6f
        subs            lr,  lr,  #8
        beq             7f
        vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d18[0]}, [r8], ip      @ lane stores: one ch0/ch1 pair per frame
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr,  lr,  #16
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         d18, d16, #16
        vld1.64         {d2-d3},  [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         d19, d17, #16
        vld1.64         {d4-d5},  [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},  [r5,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vsri.32         d2,  d0,  #16
        vst1.32         {d19[1]}, [r8], ip
        vsri.32         d3,  d1,  #16
        vst1.32         {d22[0]}, [r8], ip
        vsri.32         d6,  d4,  #16
        vst1.32         {d22[1]}, [r8], ip
        vsri.32         d7,  d5,  #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        beq             6f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d2[0]},  [r8], ip
        vst1.32         {d2[1]},  [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d3[0]},  [r8], ip
        vst1.32         {d3[1]},  [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d6[0]},  [r8], ip
        vst1.32         {d6[1]},  [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.32         {d7[0]},  [r8], ip
        vst1.32         {d7[1]},  [r8], ip
        bgt             6b
6:      vst1.32         {d2[0]},  [r8], ip      @ flush last converted pairs
        vst1.32         {d2[1]},  [r8], ip
        vst1.32         {d3[0]},  [r8], ip
        vst1.32         {d3[1]},  [r8], ip
        vst1.32         {d6[0]},  [r8], ip
        vst1.32         {d6[1]},  [r8], ip
        vst1.32         {d7[0]},  [r8], ip
        vst1.32         {d7[1]},  [r8], ip
        b               8f
7:      vsri.32         d18, d16, #16           @ exactly 8 samples: merge and store once
        vsri.32         d19, d17, #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
8:      subs            r3,  r3,  #2
        add             r0,  r0,  #4            @ advance dst past the 2 channels just written
        popeq           {r4-r8,pc}

        @ 1 channel
4:      ldr             r4,  [r1],#4
        tst             r2,  #8
        mov             lr,  r2
        mov             r5,  r0
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        bne             8f
6:      subs            lr,  lr,  #16
        vld1.64         {d4-d5},  [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},  [r4,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.16         {d0[1]},  [r5,:16], ip  @ odd 16-bit lanes = high halves of the Q16 values
        vst1.16         {d0[3]},  [r5,:16], ip
        vst1.16         {d1[1]},  [r5,:16], ip
        vst1.16         {d1[3]},  [r5,:16], ip
        vst1.16         {d2[1]},  [r5,:16], ip
        vst1.16         {d2[3]},  [r5,:16], ip
        vst1.16         {d3[1]},  [r5,:16], ip
        vst1.16         {d3[3]},  [r5,:16], ip
        beq             7f
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
7:      vst1.16         {d4[1]},  [r5,:16], ip
        vst1.16         {d4[3]},  [r5,:16], ip
        vst1.16         {d5[1]},  [r5,:16], ip
        vst1.16         {d5[3]},  [r5,:16], ip
        vst1.16         {d6[1]},  [r5,:16], ip
        vst1.16         {d6[3]},  [r5,:16], ip
        vst1.16         {d7[1]},  [r5,:16], ip
        vst1.16         {d7[3]},  [r5,:16], ip
        bgt             6b
        pop             {r4-r8,pc}
8:      subs            lr,  lr,  #8            @ odd 8-sample group first
        vst1.16         {d0[1]},  [r5,:16], ip
        vst1.16         {d0[3]},  [r5,:16], ip
        vst1.16         {d1[1]},  [r5,:16], ip
        vst1.16         {d1[3]},  [r5,:16], ip
        vst1.16         {d2[1]},  [r5,:16], ip
        vst1.16         {d2[3]},  [r5,:16], ip
        vst1.16         {d3[1]},  [r5,:16], ip
        vst1.16         {d3[3]},  [r5,:16], ip
        popeq           {r4-r8,pc}
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        b               6b
        .endfunc
8697 | 707 |
@ ff_vector_fmul_neon: element-wise float multiply, dst[i] = dst[i] * src[i],
@ written back in place (r3 trails r0 as the store pointer).
@ In:  r0 = dst/src0 (:128 aligned), r1 = src1 (:128 aligned),
@      r2 = element count (multiple of 8; 16-per-pass main loop with 8-tail)
function ff_vector_fmul_neon, export=1
        mov             r3,  r0                 @ r3 = write cursor (r0 keeps reading ahead)
        subs            r2,  r2,  #8
        vld1.64         {d0-d3},  [r0,:128]!
        vld1.64         {d4-d7},  [r1,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vmul.f32        q10, q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vmul.f32        q11, q1,  q3
        vst1.64         {d16-d19},[r3,:128]!
        vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.64         {d20-d23},[r3,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vld1.64         {d0-d1},  [r0,:128]!    @ 8-element tail, one group still in q8/q9
        vld1.64         {d4-d5},  [r1,:128]!
        vst1.64         {d16-d17},[r3,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vst1.64         {d18-d19},[r3,:128]!
        vmul.f32        q9,  q1,  q3
3:      vst1.64         {d16-d19},[r3,:128]!    @ flush final products
        bx              lr
        .endfunc
8698 | 747 |
@ ff_vector_fmul_window_neon: windowed overlap-add as used by MDCT-based
@ codecs — walks src0/win forward and src1/win backward simultaneously,
@ producing dst forward (r0) and backward (ip) halves; q8 holds the bias
@ splatted from the 5th (float) argument.
@ VFP/NOVFP lines select between hard-float (bias in d0, len on stack+12)
@ and soft-float (bias on stack, len at stack+16) argument passing.
@ In:  r0 = dst, r1 = src0, r2 = src1, r3 = win, +stack: bias, len
function ff_vector_fmul_window_neon, export=1
VFP     vdup.32         q8,  d0[0]              @ bias from s0 (hard-float ABI)
NOVFP   vld1.32         {d16[],d17[]}, [sp,:32] @ bias from the stack (soft-float ABI)
        push            {r4,r5,lr}
VFP     ldr             lr,  [sp, #12]          @ len (offsets differ by the bias slot)
NOVFP   ldr             lr,  [sp, #16]
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5, lsl #2    @ r2 -> last 4 floats of src1 (walked backward)
        add             r4,  r3,  r5, lsl #3    @ r4 -> end of win's second half
        add             ip,  r0,  r5, lsl #3    @ ip -> end of dst (backward write cursor)
        mov             r5,  #-16               @ backward stride
        vld1.64         {d0,d1},  [r1,:128]!
        vld1.64         {d2,d3},  [r2,:128], r5
        vld1.64         {d4,d5},  [r3,:128]!
        vld1.64         {d6,d7},  [r4,:128], r5
1:      subs            lr,  lr,  #4
        vmov            q11, q8                 @ start both accumulators from the bias
        vmla.f32        d22, d0,  d4
        vmov            q10, q8
        vmla.f32        d23, d1,  d5
        vrev64.32       q3,  q3                 @ reverse the backward-loaded vectors lane-wise
        vmla.f32        d20, d0,  d7
        vrev64.32       q1,  q1
        vmla.f32        d21, d1,  d6
        beq             2f
        vmla.f32        d22, d3,  d7
        vld1.64         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.64         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.64         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.64         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9                 @ rotate the prefetched operands into place
        vrev64.32       q11, q11
        vmov            q2,  q12
        vswp            d22, d23                @ complete the full reversal of the backward half
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7            @ epilogue: finish the last group, no reloads
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
        .endfunc
10046 | 799 |
#if CONFIG_VORBIS_DECODER
@ ff_vorbis_inverse_coupling_neon: Vorbis magnitude/angle inverse coupling.
@ For each pair (mag=r0[i], ang=r1[i]): copies mag's sign onto ang via
@ sign-bit extract (vand with q10 = 0x80000000) + veor, then, selected by
@ the ang<=0 mask (vcle.s32 on the raw bits), writes mag+ang' to one output
@ and mag-ang' to the other.  Results are stored back through r3/r12.
@ In:  r0 = mag (:128 aligned), r1 = ang (:128 aligned), r2 = blocksize
@      (multiple of 4; the software-pipelined loop handles 8 per pass)
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32        q10, #1<<31             @ sign-bit mask
        subs            r2,  r2,  #4
        mov             r3,  r0                 @ write cursors trail the read cursors
        mov             r12, r1
        beq             3f                      @ exactly 4 elements: single unpipelined pass

        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0            @ mask: ang <= 0 (integer view of the float)
        vand            q9,  q11, q10           @ sign bit of mag
        veor            q12, q12, q9            @ ang' = ang with mag's sign folded in
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2            @ one channel: mag + (ang' if ang<=0)
        vsub.f32        q11, q11, q3            @ other:       mag - (ang' if ang>0)
1:      vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d0-d1},  [r0,:128]!
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vst1.32         {d24-d25},[r3, :128]!   @ store previous group while computing this one
        vst1.32         {d22-d23},[r12,:128]!
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        subs            r2,  r2,  #8
        ble             2f
        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
        b               1b

2:      vst1.32         {d2-d3},  [r3, :128]!   @ flush last pipelined group
        vst1.32         {d0-d1},  [r12,:128]!
        bxlt            lr

3:      vld1.32         {d2-d3},  [r1,:128]     @ 4-element case, no pipelining
        vld1.32         {d0-d1},  [r0,:128]
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        vst1.32         {d2-d3},  [r0,:128]!
        vst1.32         {d0-d1},  [r1,:128]!
        bx              lr
        .endfunc
#endif
10221 | 861 |
@ ff_vector_fmul_scalar_neon: dst[i] = src[i] * scalar.
@ VFP/NOVFP select the hard-float (scalar in s0, len in r2) vs soft-float
@ (scalar in r2, len in r3) argument layouts.
@ In:  r0 = dst (:128 aligned), r1 = src (:128 aligned), len = count
@      (multiple of 4; main loop does 16 per pass, 3: handles the rest)
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]              @ splat the scalar
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q1},[r1,:128]!
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vst1.32         {q2},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b
2:      vst1.32         {q2},[r0,:128]!
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15
        bxeq            lr
3:      vld1.32         {q0},[r1,:128]!         @ remainder: 4 at a time
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
        .endfunc
898 | |
@ ff_vector_fmul_sv_scalar_2_neon: dst[i] = src0[i] * sv[i] * scalar, where
@ sv is a "scattered vector" — r2 is an array of pointers to 2-float chunks.
@ VFP/NOVFP select the hard-/soft-float scalar+len argument layouts.
@ In:  r0 = dst (:128 aligned), r1 = src0 (:64 aligned), r2 = float**,
@      len = count (multiple of 4)
function ff_vector_fmul_sv_scalar_2_neon, export=1
VFP     vdup.32         d16, d0[0]
NOVFP   vdup.32         d16, r3
NOVFP   ldr             r3,  [sp]               @ soft-float: len comes on the stack
        vld1.32         {d0},[r1,:64]!
        vld1.32         {d1},[r1,:64]!
1:      subs            r3,  r3,  #4
        vmul.f32        d4,  d0,  d16           @ scale src0 first...
        vmul.f32        d5,  d1,  d16
        ldr             r12, [r2], #4           @ ...then multiply by the next 2-float chunk
        vld1.32         {d2},[r12,:64]
        ldr             r12, [r2], #4
        vld1.32         {d3},[r12,:64]
        vmul.f32        d4,  d4,  d2
        vmul.f32        d5,  d5,  d3
        beq             2f
        vld1.32         {d0},[r1,:64]!
        vld1.32         {d1},[r1,:64]!
        vst1.32         {d4},[r0,:64]!
        vst1.32         {d5},[r0,:64]!
        b               1b
2:      vst1.32         {d4},[r0,:64]!
        vst1.32         {d5},[r0,:64]!
        bx              lr
        .endfunc
924 | |
@ ff_vector_fmul_sv_scalar_4_neon: as the _2 variant but the scattered
@ vector r2 points at 4-float chunks; main loop does 8 elements per pass
@ with a 4-element tail at 3:.
@ In:  r0 = dst (:128 aligned), r1 = src0 (:128 aligned), r2 = float**,
@      len = count (multiple of 4)
function ff_vector_fmul_sv_scalar_4_neon, export=1
VFP     vdup.32         q10, d0[0]
NOVFP   vdup.32         q10, r3
NOVFP   ldr             r3,  [sp]
        push            {lr}
        bics            lr,  r3,  #7
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q2},[r1,:128]!
1:      ldr             r12, [r2], #4
        vld1.32         {q1},[r12,:128]
        ldr             r12, [r2], #4
        vld1.32         {q3},[r12,:128]
        vmul.f32        q8,  q0,  q10
        vmul.f32        q8,  q8,  q1
        vmul.f32        q9,  q2,  q10
        vmul.f32        q9,  q9,  q3
        subs            lr,  lr,  #8
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q2},[r1,:128]!
        vst1.32         {q8},[r0,:128]!
        vst1.32         {q9},[r0,:128]!
        b               1b
2:      vst1.32         {q8},[r0,:128]!
        vst1.32         {q9},[r0,:128]!
        ands            r3,  r3,  #7
        popeq           {pc}
3:      vld1.32         {q0},[r1,:128]!         @ 4-element tail
        ldr             r12, [r2], #4
        vld1.32         {q1},[r12,:128]
        vmul.f32        q0,  q0,  q10
        vmul.f32        q0,  q0,  q1
        vst1.32         {q0},[r0,:128]!
        subs            r3,  r3,  #4
        bgt             3b
        pop             {pc}
        .endfunc
963 | |
@ ff_sv_fmul_scalar_2_neon: dst[i] = sv[i] * scalar over a scattered vector
@ of 2-float chunks (r1 = array of float*).
@ In:  r0 = dst (:128 aligned), r1 = float**, len = count (multiple of 4)
function ff_sv_fmul_scalar_2_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        ldr             r12, [r1], #4
        vld1.32         {d0},[r12,:64]
        ldr             r12, [r1], #4
        vld1.32         {d1},[r12,:64]
1:      vmul.f32        q1,  q0,  q8
        subs            len, len, #4
        beq             2f
        ldr             r12, [r1], #4           @ fetch next two chunks while q1 is pending
        vld1.32         {d0},[r12,:64]
        ldr             r12, [r1], #4
        vld1.32         {d1},[r12,:64]
        vst1.32         {q1},[r0,:128]!
        b               1b
2:      vst1.32         {q1},[r0,:128]!
        bx              lr
        .unreq          len
        .endfunc
986 | |
@ ff_sv_fmul_scalar_4_neon: dst[i] = sv[i] * scalar over a scattered vector
@ of 4-float chunks (r1 = array of float*).
@ In:  r0 = dst (:128 aligned), r1 = float**, len = count (multiple of 4)
function ff_sv_fmul_scalar_4_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
1:      ldr             r12, [r1], #4
        vld1.32         {q0},[r12,:128]
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             1b
        bx              lr
        .unreq          len
        .endfunc
1001 | |
@ ff_butterflies_float_neon: in-place butterfly —
@ (v0[i], v1[i]) = (v0[i]+v1[i], v0[i]-v1[i]), 4 floats per iteration.
@ In:  r0 = v0 (:128 aligned), r1 = v1 (:128 aligned), r2 = count (multiple of 4)
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0},[r0,:128]
        vld1.32         {q1},[r1,:128]
        vsub.f32        q2,  q0,  q1
        vadd.f32        q1,  q0,  q1
        vst1.32         {q2},[r1,:128]!         @ difference goes to v1
        vst1.32         {q1},[r0,:128]!         @ sum goes to v0
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
        .endfunc
10228 | 1013 |
@ ff_scalarproduct_float_neon: dot product of two float vectors.
@ Accumulates 4 partial sums in q2, then reduces with vadd+vpadd.
@ In:  r0 = v1 (:128 aligned), r1 = v2 (:128 aligned), r2 = count
@      (multiple of 4)
@ Out: result in s0 (hard-float); the NOVFP line also copies it to r0 for
@      the soft-float ABI.
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2,  q0,  q1
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5            @ horizontal reduction of the 4 lanes
        vpadd.f32       d0,  d0,  d0
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
        .endfunc
10253 | 1026 |
@ ff_int32_to_float_fmul_scalar_neon: dst[i] = (float)src[i] * scalar.
@ VFP/NOVFP select the hard-/soft-float scalar+len argument layouts.
@ In:  r0 = float dst (:128 aligned), r1 = int32 src (:128 aligned),
@      len = count (multiple of 8 — the loop retires 8 per pass)
function ff_int32_to_float_fmul_scalar_neon, export=1
VFP     vdup.32         q0,  d0[0]              @ splat the scalar
VFP     len .req r2
NOVFP   vdup.32         q0,  r2
NOVFP   len .req r3

        vld1.32         {q1},[r1,:128]!         @ prime: convert 8 ints ahead of the multiply
        vcvt.f32.s32    q3,  q1
        vld1.32         {q2},[r1,:128]!
        vcvt.f32.s32    q8,  q2
1:      subs            len, len, #8
        pld             [r1, #16]
        vmul.f32        q9,  q3,  q0
        vmul.f32        q10, q8,  q0
        beq             2f
        vld1.32         {q1},[r1,:128]!
        vcvt.f32.s32    q3,  q1
        vld1.32         {q2},[r1,:128]!
        vcvt.f32.s32    q8,  q2
        vst1.32         {q9}, [r0,:128]!
        vst1.32         {q10},[r0,:128]!
        b               1b
2:      vst1.32         {q9}, [r0,:128]!        @ flush final products
        vst1.32         {q10},[r0,:128]!
        bx              lr
        .unreq          len
        .endfunc