arm/dsputil_neon.S @ 12497:c5ffa8b81f9c (libavcodec)
/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

        preserve8
        .text

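@ ff_clear_block_neon: zero one 8x8 block of 16-bit coefficients.
@ r0 = block pointer; eight 16-byte stores clear the 128-byte block.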
function ff_clear_block_neon, export=1
        vmov.i16        q0, #0
        .rept           8
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

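@ ff_clear_blocks_neon: same as above, but zeroes six consecutive 8x8 blocks
@ (one macroblock's worth of coefficients) with 8*6 stores.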
function ff_clear_blocks_neon, export=1
        vmov.i16        q0, #0
        .rept           8*6
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

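@ The pixelsN macros below implement the dsputil put/avg pixel copies:
@ r0 = dst, r1 = src, r2 = line size (stride), r3 = height (the rows handled
@ per iteration are subtracted from r3).  With avg=1 the source pixels are
@ rounding-averaged with the pixels already at dst.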
.macro pixels16 avg=0
.if \avg
        mov             ip, r0
.endif
1:      vld1.64         {d0, d1}, [r1], r2
        vld1.64         {d2, d3}, [r1], r2
        vld1.64         {d4, d5}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d6, d7}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d16,d17}, [ip,:128], r2
        vrhadd.u8       q0, q0, q8
        vld1.64         {d18,d19}, [ip,:128], r2
        vrhadd.u8       q1, q1, q9
        vld1.64         {d20,d21}, [ip,:128], r2
        vrhadd.u8       q2, q2, q10
        vld1.64         {d22,d23}, [ip,:128], r2
        vrhadd.u8       q3, q3, q11
.endif
        subs            r3, r3, #4
        vst1.64         {d0, d1}, [r0,:128], r2
        vst1.64         {d2, d3}, [r0,:128], r2
        vst1.64         {d4, d5}, [r0,:128], r2
        vst1.64         {d6, d7}, [r0,:128], r2
        bne             1b
        bx              lr
.endm

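@ pixels16_x2: horizontal half-pel variant - each output byte is the average
@ of a source byte and its right-hand neighbour.  \vhadd selects the rounding
@ (vrhadd.u8) or truncating ("no_rnd", vhadd.u8) average.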
.macro pixels16_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0-d2}, [r1], r2
        vld1.64         {d4-d6}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3, r3, #2
        vext.8          q1, q0, q1, #1
        \vhadd          q0, q0, q1
        vext.8          q3, q2, q3, #1
        \vhadd          q2, q2, q3
        vst1.64         {d0, d1}, [r0,:128], r2
        vst1.64         {d4, d5}, [r0,:128], r2
        bne             1b
        bx              lr
.endm

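@ pixels16_y2: vertical half-pel variant - each output row is the average of
@ the source row and the row below it.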
.macro pixels16_y2 vhadd=vrhadd.u8
        vld1.64         {d0, d1}, [r1], r2
        vld1.64         {d2, d3}, [r1], r2
1:      subs            r3, r3, #2
        \vhadd          q2, q0, q1
        vld1.64         {d0, d1}, [r1], r2
        \vhadd          q3, q0, q1
        vld1.64         {d2, d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4, d5}, [r0,:128], r2
        vst1.64         {d6, d7}, [r0,:128], r2
        bne             1b
        bx              lr
.endm

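@ pixels16_xy2: 2D half-pel variant - averages a 2x2 neighbourhood using
@ widening adds and a narrowing shift by 2.  The rounding form uses vrshrn
@ ((sum + 2) >> 2); the no_rnd form adds 1 and truncates ((sum + 1) >> 2).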
.macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0-d2}, [r1], r2
        vld1.64         {d4-d6}, [r1], r2
.if \no_rnd
        vmov.i16        q13, #1
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1, q0, q1, #1
        vext.8          q3, q2, q3, #1
        vaddl.u8        q8, d0, d2
        vaddl.u8        q10, d1, d3
        vaddl.u8        q9, d4, d6
        vaddl.u8        q11, d5, d7
1:      subs            r3, r3, #2
        vld1.64         {d0-d2}, [r1], r2
        vadd.u16        q12, q8, q9
        pld             [r1]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q15, q0, q1, #1
        vadd.u16        q1, q10, q11
        \vshrn          d28, q12, #2
.if \no_rnd
        vadd.u16        q1, q1, q13
.endif
        \vshrn          d29, q1, #2
        vaddl.u8        q8, d0, d30
        vld1.64         {d2-d4}, [r1], r2
        vaddl.u8        q10, d1, d31
        vst1.64         {d28,d29}, [r0,:128], r2
        vadd.u16        q12, q8, q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q2, q1, q2, #1
        vadd.u16        q0, q10, q11
        \vshrn          d30, q12, #2
.if \no_rnd
        vadd.u16        q0, q0, q13
.endif
        \vshrn          d31, q0, #2
        vaddl.u8        q9, d2, d4
        vaddl.u8        q11, d3, d5
        vst1.64         {d30,d31}, [r0,:128], r2
        bgt             1b
        bx              lr
.endm

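@ pixels8 and the pixels8_x2/_y2/_xy2 macros below are the 8-pixel-wide
@ counterparts of the 16-pixel macros above, with the same register usage.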
.macro pixels8 avg=0
1:      vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d4}, [r0,:64], r2
        vrhadd.u8       d0, d0, d4
        vld1.64         {d5}, [r0,:64], r2
        vrhadd.u8       d1, d1, d5
        vld1.64         {d6}, [r0,:64], r2
        vrhadd.u8       d2, d2, d6
        vld1.64         {d7}, [r0,:64], r2
        vrhadd.u8       d3, d3, d7
        sub             r0, r0, r2, lsl #2
.endif
        subs            r3, r3, #4
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro pixels8_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0, d1}, [r1], r2
        vext.8          d1, d0, d1, #1
        vld1.64         {d2, d3}, [r1], r2
        vext.8          d3, d2, d3, #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3, r3, #2
        vswp            d1, d2
        \vhadd          q0, q0, q1
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro pixels8_y2 vhadd=vrhadd.u8
        vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
1:      subs            r3, r3, #2
        \vhadd          d4, d0, d1
        vld1.64         {d0}, [r1], r2
        \vhadd          d5, d0, d1
        vld1.64         {d1}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4}, [r0,:64], r2
        vst1.64         {d5}, [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0, d1}, [r1], r2
        vld1.64         {d2, d3}, [r1], r2
.if \no_rnd
        vmov.i16        q11, #1
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4, d0, d1, #1
        vext.8          d6, d2, d3, #1
        vaddl.u8        q8, d0, d4
        vaddl.u8        q9, d2, d6
1:      subs            r3, r3, #2
        vld1.64         {d0, d1}, [r1], r2
        pld             [r1]
        vadd.u16        q10, q8, q9
        vext.8          d4, d0, d1, #1
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vaddl.u8        q8, d0, d4
        \vshrn          d5, q10, #2
        vld1.64         {d2, d3}, [r1], r2
        vadd.u16        q10, q8, q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vst1.64         {d5}, [r0,:64], r2
        \vshrn          d7, q10, #2
        vext.8          d6, d2, d3, #1
        vaddl.u8        q9, d2, d6
        vst1.64         {d7}, [r0,:64], r2
        bgt             1b
        bx              lr
.endm

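@ pixfunc expands one of the macros above into an exported function, e.g.
@ "pixfunc put_ pixels16" emits ff_put_pixels16_neon.  pixfunc2 additionally
@ emits the _no_rnd variant using the truncating averaging operations.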
.macro pixfunc pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd_op \args
endfunc
.endm

.macro pixfunc2 pfx name args:vararg
        pixfunc         \pfx \name
        pixfunc         \pfx \name \args
.endm

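@ The h264 qpel mc00 (full-pel copy) entry points only load the block height
@ into r3 and fall through into the pixel function emitted directly after them.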
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3, #16
endfunc

        pixfunc         put_ pixels16
        pixfunc2        put_ pixels16_x2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels16_y2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels16_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3, #16
endfunc

        pixfunc         avg_ pixels16,, 1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3, #8
endfunc

        pixfunc         put_ pixels8
        pixfunc2        put_ pixels8_x2,   _no_rnd, vhadd.u8
        pixfunc2        put_ pixels8_y2,   _no_rnd, vhadd.u8
        pixfunc2        put_ pixels8_xy2,  _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3, #8
endfunc

        pixfunc         avg_ pixels8,, 1

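@ ff_put_pixels_clamped_neon: store an 8x8 block of 16-bit coefficients (r0)
@ as bytes at r1 with stride r2, saturating each value to the 0..255 range.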
function ff_put_pixels_clamped_neon, export=1
        vld1.64         {d16-d19}, [r0,:128]!
        vqmovun.s16     d0, q8
        vld1.64         {d20-d23}, [r0,:128]!
        vqmovun.s16     d1, q9
        vld1.64         {d24-d27}, [r0,:128]!
        vqmovun.s16     d2, q10
        vld1.64         {d28-d31}, [r0,:128]!
        vqmovun.s16     d3, q11
        vst1.64         {d0}, [r1,:64], r2
        vqmovun.s16     d4, q12
        vst1.64         {d1}, [r1,:64], r2
        vqmovun.s16     d5, q13
        vst1.64         {d2}, [r1,:64], r2
        vqmovun.s16     d6, q14
        vst1.64         {d3}, [r1,:64], r2
        vqmovun.s16     d7, q15
        vst1.64         {d4}, [r1,:64], r2
        vst1.64         {d5}, [r1,:64], r2
        vst1.64         {d6}, [r1,:64], r2
        vst1.64         {d7}, [r1,:64], r2
        bx              lr
endfunc

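@ ff_put_signed_pixels_clamped_neon: as above, but the coefficients are
@ saturated to signed 8 bits and biased by +128 before being stored.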
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0, q8
        vld1.64         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1, q9
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2, q8
        vld1.64         {d18-d19}, [r0,:128]!
        vadd.u8         d0, d0, d31
        vld1.64         {d20-d21}, [r0,:128]!
        vadd.u8         d1, d1, d31
        vld1.64         {d22-d23}, [r0,:128]!
        vadd.u8         d2, d2, d31
        vst1.64         {d0}, [r1,:64], r2
        vqmovn.s16      d3, q9
        vst1.64         {d1}, [r1,:64], r2
        vqmovn.s16      d4, q10
        vst1.64         {d2}, [r1,:64], r2
        vqmovn.s16      d5, q11
        vld1.64         {d24-d25}, [r0,:128]!
        vadd.u8         d3, d3, d31
        vld1.64         {d26-d27}, [r0,:128]!
        vadd.u8         d4, d4, d31
        vadd.u8         d5, d5, d31
        vst1.64         {d3}, [r1,:64], r2
        vqmovn.s16      d6, q12
        vst1.64         {d4}, [r1,:64], r2
        vqmovn.s16      d7, q13
        vst1.64         {d5}, [r1,:64], r2
        vadd.u8         d6, d6, d31
        vadd.u8         d7, d7, d31
        vst1.64         {d6}, [r1,:64], r2
        vst1.64         {d7}, [r1,:64], r2
        bx              lr
endfunc

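@ ff_add_pixels_clamped_neon: add an 8x8 block of 16-bit coefficients (r0) to
@ the bytes already at r1 (stride r2), saturating the sums to 0..255.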
function ff_add_pixels_clamped_neon, export=1
        mov             r3, r1
        vld1.64         {d16}, [r1,:64], r2
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vld1.64         {d17}, [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vqmovun.s16     d0, q0
        vld1.64         {d18}, [r1,:64], r2
        vaddw.u8        q1, q1, d17
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.64         {d0}, [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.64         {d19}, [r1,:64], r2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vqmovun.s16     d4, q2
        vst1.64         {d2}, [r3,:64], r2
        vld1.64         {d16}, [r1,:64], r2
        vqmovun.s16     d6, q3
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vst1.64         {d4}, [r3,:64], r2
        vld1.64         {d17}, [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vaddw.u8        q1, q1, d17
        vst1.64         {d6}, [r3,:64], r2
        vqmovun.s16     d0, q0
        vld1.64         {d18}, [r1,:64], r2
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.64         {d0}, [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.64         {d19}, [r1,:64], r2
        vqmovun.s16     d4, q2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vst1.64         {d2}, [r3,:64], r2
        vqmovun.s16     d6, q3
        vst1.64         {d4}, [r3,:64], r2
        vst1.64         {d6}, [r3,:64], r2
        bx              lr
endfunc

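@ ff_float_to_int16_neon (r0 = dst, r1 = src, r2 = len): convert floats to
@ int16 using a fixed-point conversion with 16 fractional bits followed by a
@ narrowing right shift; the loops process 8 or 16 samples per iteration.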
function ff_float_to_int16_neon, export=1
        subs            r2, r2, #8
        vld1.64         {d0-d1}, [r1,:128]!
        vcvt.s32.f32    q8, q0, #16
        vld1.64         {d2-d3}, [r1,:128]!
        vcvt.s32.f32    q9, q1, #16
        beq             3f
        bics            ip, r2, #15
        beq             2f
1:      subs            ip, ip, #16
        vshrn.s32       d4, q8, #16
        vld1.64         {d0-d1}, [r1,:128]!
        vcvt.s32.f32    q0, q0, #16
        vshrn.s32       d5, q9, #16
        vld1.64         {d2-d3}, [r1,:128]!
        vcvt.s32.f32    q1, q1, #16
        vshrn.s32       d6, q0, #16
        vst1.64         {d4-d5}, [r0,:128]!
        vshrn.s32       d7, q1, #16
        vld1.64         {d16-d17},[r1,:128]!
        vcvt.s32.f32    q8, q8, #16
        vld1.64         {d18-d19},[r1,:128]!
        vcvt.s32.f32    q9, q9, #16
        vst1.64         {d6-d7}, [r0,:128]!
        bne             1b
        ands            r2, r2, #15
        beq             3f
2:      vld1.64         {d0-d1}, [r1,:128]!
        vshrn.s32       d4, q8, #16
        vcvt.s32.f32    q0, q0, #16
        vld1.64         {d2-d3}, [r1,:128]!
        vshrn.s32       d5, q9, #16
        vcvt.s32.f32    q1, q1, #16
        vshrn.s32       d6, q0, #16
        vst1.64         {d4-d5}, [r0,:128]!
        vshrn.s32       d7, q1, #16
        vst1.64         {d6-d7}, [r0,:128]!
        bx              lr
3:      vshrn.s32       d4, q8, #16
        vshrn.s32       d5, q9, #16
        vst1.64         {d4-d5}, [r0,:128]!
        bx              lr
endfunc

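@ ff_float_to_int16_interleave_neon (r0 = dst, r1 = array of channel pointers,
@ r2 = len, r3 = channels): interleaving variant.  A single channel tail-calls
@ the mono routine above, two channels take a fast vsri-based path, and the
@ generic code below handles groups of 4, then 2, then 1 channel.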
function ff_float_to_int16_interleave_neon, export=1
        cmp             r3, #2
        ldrlt           r1, [r1]
        blt             ff_float_to_int16_neon
        bne             4f

        ldr             r3, [r1]
        ldr             r1, [r1, #4]

        subs            r2, r2, #8
        vld1.64         {d0-d1}, [r3,:128]!
        vcvt.s32.f32    q8, q0, #16
        vld1.64         {d2-d3}, [r3,:128]!
        vcvt.s32.f32    q9, q1, #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             3f
        bics            ip, r2, #15
        beq             2f
1:      subs            ip, ip, #16
        vld1.64         {d0-d1}, [r3,:128]!
        vcvt.s32.f32    q0, q0, #16
        vsri.32         q10, q8, #16
        vld1.64         {d2-d3}, [r3,:128]!
        vcvt.s32.f32    q1, q1, #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vld1.64         {d26-d27},[r1,:128]!
        vsri.32         q11, q9, #16
        vst1.64         {d20-d21},[r0,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q12, q0, #16
        vld1.64         {d16-d17},[r3,:128]!
        vsri.32         q13, q1, #16
        vst1.64         {d24-d25},[r0,:128]!
        vcvt.s32.f32    q8, q8, #16
        vld1.64         {d18-d19},[r3,:128]!
        vcvt.s32.f32    q9, q9, #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d26-d27},[r0,:128]!
        bne             1b
        ands            r2, r2, #15
        beq             3f
2:      vsri.32         q10, q8, #16
        vld1.64         {d0-d1}, [r3,:128]!
        vcvt.s32.f32    q0, q0, #16
        vld1.64         {d2-d3}, [r3,:128]!
        vcvt.s32.f32    q1, q1, #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vsri.32         q11, q9, #16
        vld1.64         {d26-d27},[r1,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d20-d21},[r0,:128]!
        vsri.32         q12, q0, #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q13, q1, #16
        vst1.64         {d24-d27},[r0,:128]!
        bx              lr
3:      vsri.32         q10, q8, #16
        vsri.32         q11, q9, #16
        vst1.64         {d20-d23},[r0,:128]!
        bx              lr

4:      push            {r4-r8,lr}
        cmp             r3, #4
        lsl             ip, r3, #1
        blt             4f

        @ 4 channels
5:      ldmia           r1!, {r4-r7}
        mov             lr, r2
        mov             r8, r0
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8, q8, #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9, q9, #16
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr, lr, #8
        vld1.64         {d0-d1}, [r4,:128]!
        vcvt.s32.f32    q0, q0, #16
        vsri.32         q9, q8, #16
        vld1.64         {d2-d3}, [r5,:128]!
        vcvt.s32.f32    q1, q1, #16
        vsri.32         q11, q10, #16
        vld1.64         {d4-d5}, [r6,:128]!
        vcvt.s32.f32    q2, q2, #16
        vzip.32         d18, d22
        vld1.64         {d6-d7}, [r7,:128]!
        vcvt.s32.f32    q3, q3, #16
        vzip.32         d19, d23
        vst1.64         {d18}, [r8], ip
        vsri.32         q1, q0, #16
        vst1.64         {d22}, [r8], ip
        vsri.32         q3, q2, #16
        vst1.64         {d19}, [r8], ip
        vzip.32         d2, d6
        vst1.64         {d23}, [r8], ip
        vzip.32         d3, d7
        beq             7f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8, q8, #16
        vst1.64         {d2}, [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9, q9, #16
        vst1.64         {d6}, [r8], ip
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.64         {d3}, [r8], ip
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d7}, [r8], ip
        b               6b
7:      vst1.64         {d2}, [r8], ip
        vst1.64         {d6}, [r8], ip
        vst1.64         {d3}, [r8], ip
        vst1.64         {d7}, [r8], ip
        subs            r3, r3, #4
        popeq           {r4-r8,pc}
        cmp             r3, #4
        add             r0, r0, #8
        bge             5b

        @ 2 channels
4:      cmp             r3, #2
        blt             4f
        ldmia           r1!, {r4-r5}
        mov             lr, r2
        mov             r8, r0
        tst             lr, #8
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8, q8, #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9, q9, #16
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             6f
        subs            lr, lr, #8
        beq             7f
        vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8, q8, #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9, q9, #16
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr, lr, #16
        vld1.64         {d0-d1}, [r4,:128]!
        vcvt.s32.f32    q0, q0, #16
        vsri.32         d18, d16, #16
        vld1.64         {d2-d3}, [r5,:128]!
        vcvt.s32.f32    q1, q1, #16
        vsri.32         d19, d17, #16
        vld1.64         {d4-d5}, [r4,:128]!
        vcvt.s32.f32    q2, q2, #16
        vld1.64         {d6-d7}, [r5,:128]!
        vcvt.s32.f32    q3, q3, #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vsri.32         d2, d0, #16
        vst1.32         {d19[1]}, [r8], ip
        vsri.32         d3, d1, #16
        vst1.32         {d22[0]}, [r8], ip
        vsri.32         d6, d4, #16
        vst1.32         {d22[1]}, [r8], ip
        vsri.32         d7, d5, #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        beq             6f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8, q8, #16
        vst1.32         {d2[0]}, [r8], ip
        vst1.32         {d2[1]}, [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9, q9, #16
        vst1.32         {d3[0]}, [r8], ip
        vst1.32         {d3[1]}, [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d6[0]}, [r8], ip
        vst1.32         {d6[1]}, [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.32         {d7[0]}, [r8], ip
        vst1.32         {d7[1]}, [r8], ip
        bgt             6b
6:      vst1.32         {d2[0]}, [r8], ip
        vst1.32         {d2[1]}, [r8], ip
        vst1.32         {d3[0]}, [r8], ip
        vst1.32         {d3[1]}, [r8], ip
        vst1.32         {d6[0]}, [r8], ip
        vst1.32         {d6[1]}, [r8], ip
        vst1.32         {d7[0]}, [r8], ip
        vst1.32         {d7[1]}, [r8], ip
        b               8f
7:      vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
8:      subs            r3, r3, #2
        add             r0, r0, #4
        popeq           {r4-r8,pc}

        @ 1 channel
4:      ldr             r4, [r1], #4
        tst             r2, #8
        mov             lr, r2
        mov             r5, r0
        vld1.64         {d0-d1}, [r4,:128]!
        vcvt.s32.f32    q0, q0, #16
        vld1.64         {d2-d3}, [r4,:128]!
        vcvt.s32.f32    q1, q1, #16
        bne             8f
6:      subs            lr, lr, #16
        vld1.64         {d4-d5}, [r4,:128]!
        vcvt.s32.f32    q2, q2, #16
        vld1.64         {d6-d7}, [r4,:128]!
        vcvt.s32.f32    q3, q3, #16
        vst1.16         {d0[1]}, [r5,:16], ip
        vst1.16         {d0[3]}, [r5,:16], ip
        vst1.16         {d1[1]}, [r5,:16], ip
        vst1.16         {d1[3]}, [r5,:16], ip
        vst1.16         {d2[1]}, [r5,:16], ip
        vst1.16         {d2[3]}, [r5,:16], ip
        vst1.16         {d3[1]}, [r5,:16], ip
        vst1.16         {d3[3]}, [r5,:16], ip
        beq             7f
        vld1.64         {d0-d1}, [r4,:128]!
        vcvt.s32.f32    q0, q0, #16
        vld1.64         {d2-d3}, [r4,:128]!
        vcvt.s32.f32    q1, q1, #16
7:      vst1.16         {d4[1]}, [r5,:16], ip
        vst1.16         {d4[3]}, [r5,:16], ip
        vst1.16         {d5[1]}, [r5,:16], ip
        vst1.16         {d5[3]}, [r5,:16], ip
        vst1.16         {d6[1]}, [r5,:16], ip
        vst1.16         {d6[3]}, [r5,:16], ip
        vst1.16         {d7[1]}, [r5,:16], ip
        vst1.16         {d7[3]}, [r5,:16], ip
        bgt             6b
        pop             {r4-r8,pc}
8:      subs            lr, lr, #8
        vst1.16         {d0[1]}, [r5,:16], ip
        vst1.16         {d0[3]}, [r5,:16], ip
        vst1.16         {d1[1]}, [r5,:16], ip
        vst1.16         {d1[3]}, [r5,:16], ip
        vst1.16         {d2[1]}, [r5,:16], ip
        vst1.16         {d2[3]}, [r5,:16], ip
        vst1.16         {d3[1]}, [r5,:16], ip
        vst1.16         {d3[3]}, [r5,:16], ip
        popeq           {r4-r8,pc}
        vld1.64         {d0-d1}, [r4,:128]!
        vcvt.s32.f32    q0, q0, #16
        vld1.64         {d2-d3}, [r4,:128]!
        vcvt.s32.f32    q1, q1, #16
        b               6b
endfunc

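@ ff_vector_fmul_neon (r0 = dst, r1 = src, r2 = len): dst[i] *= src[i],
@ eight floats per iteration.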
function ff_vector_fmul_neon, export=1
        mov             r3, r0
        subs            r2, r2, #8
        vld1.64         {d0-d3}, [r0,:128]!
        vld1.64         {d4-d7}, [r1,:128]!
        vmul.f32        q8, q0, q2
        vmul.f32        q9, q1, q3
        beq             3f
        bics            ip, r2, #15
        beq             2f
1:      subs            ip, ip, #16
        vld1.64         {d0-d1}, [r0,:128]!
        vld1.64         {d4-d5}, [r1,:128]!
        vmul.f32        q10, q0, q2
        vld1.64         {d2-d3}, [r0,:128]!
        vld1.64         {d6-d7}, [r1,:128]!
        vmul.f32        q11, q1, q3
        vst1.64         {d16-d19},[r3,:128]!
        vld1.64         {d0-d1}, [r0,:128]!
        vld1.64         {d4-d5}, [r1,:128]!
        vmul.f32        q8, q0, q2
        vld1.64         {d2-d3}, [r0,:128]!
        vld1.64         {d6-d7}, [r1,:128]!
        vmul.f32        q9, q1, q3
        vst1.64         {d20-d23},[r3,:128]!
        bne             1b
        ands            r2, r2, #15
        beq             3f
2:      vld1.64         {d0-d1}, [r0,:128]!
        vld1.64         {d4-d5}, [r1,:128]!
        vst1.64         {d16-d17},[r3,:128]!
        vmul.f32        q8, q0, q2
        vld1.64         {d2-d3}, [r0,:128]!
        vld1.64         {d6-d7}, [r1,:128]!
        vst1.64         {d18-d19},[r3,:128]!
        vmul.f32        q9, q1, q3
3:      vst1.64         {d16-d19},[r3,:128]!
        bx              lr
endfunc

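@ ff_vector_fmul_window_neon (r0 = dst, r1 = src0, r2 = src1, r3 = win):
@ windowed overlap-add as in the dsputil vector_fmul_window() operation.
@ The bias and length arguments come from d0 and the stack under a
@ hard-float (VFP) ABI, or entirely from the stack otherwise, which is what
@ the VFP/NOVFP-prefixed lines select.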
function ff_vector_fmul_window_neon, export=1
VFP     vdup.32         q8, d0[0]
NOVFP   vld1.32         {d16[],d17[]}, [sp,:32]
        push            {r4,r5,lr}
VFP     ldr             lr, [sp, #12]
NOVFP   ldr             lr, [sp, #16]
        sub             r2, r2, #8
        sub             r5, lr, #2
        add             r2, r2, r5, lsl #2
        add             r4, r3, r5, lsl #3
        add             ip, r0, r5, lsl #3
        mov             r5, #-16
        vld1.64         {d0,d1}, [r1,:128]!
        vld1.64         {d2,d3}, [r2,:128], r5
        vld1.64         {d4,d5}, [r3,:128]!
        vld1.64         {d6,d7}, [r4,:128], r5
1:      subs            lr, lr, #4
        vmov            q11, q8
        vmla.f32        d22, d0, d4
        vmov            q10, q8
        vmla.f32        d23, d1, d5
        vrev64.32       q3, q3
        vmla.f32        d20, d0, d7
        vrev64.32       q1, q1
        vmla.f32        d21, d1, d6
        beq             2f
        vmla.f32        d22, d3, d7
        vld1.64         {d0,d1}, [r1,:128]!
        vmla.f32        d23, d2, d6
        vld1.64         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3, d4
        vld1.64         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2, d5
        vld1.64         {d6,d7}, [r4,:128], r5
        vmov            q1, q9
        vrev64.32       q11, q11
        vmov            q2, q12
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3, d7
        vmla.f32        d23, d2, d6
        vmls.f32        d20, d3, d4
        vmls.f32        d21, d2, d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
endfunc

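@ ff_vorbis_inverse_coupling_neon (r0 = mag, r1 = ang, r2 = blocksize):
@ in-place Vorbis magnitude/angle decoupling of two residue vectors.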
#if CONFIG_VORBIS_DECODER
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32        q10, #1<<31
        subs            r2, r2, #4
        mov             r3, r0
        mov             r12, r1
        beq             3f

        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8, q12, #0
        vand            q9, q11, q10
        veor            q12, q12, q9
        vand            q2, q12, q8
        vbic            q3, q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
1:      vld1.32         {d2-d3}, [r1,:128]!
        vld1.32         {d0-d1}, [r0,:128]!
        vcle.s32        q8, q1, #0
        vand            q9, q0, q10
        veor            q1, q1, q9
        vst1.32         {d24-d25},[r3,:128]!
        vst1.32         {d22-d23},[r12,:128]!
        vand            q2, q1, q8
        vbic            q3, q1, q8
        vadd.f32        q1, q0, q2
        vsub.f32        q0, q0, q3
        subs            r2, r2, #8
        ble             2f
        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8, q12, #0
        vand            q9, q11, q10
        veor            q12, q12, q9
        vst1.32         {d2-d3}, [r3,:128]!
        vst1.32         {d0-d1}, [r12,:128]!
        vand            q2, q12, q8
        vbic            q3, q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
        b               1b

2:      vst1.32         {d2-d3}, [r3,:128]!
        vst1.32         {d0-d1}, [r12,:128]!
        bxlt            lr

3:      vld1.32         {d2-d3}, [r1,:128]
        vld1.32         {d0-d1}, [r0,:128]
        vcle.s32        q8, q1, #0
        vand            q9, q0, q10
        veor            q1, q1, q9
        vand            q2, q1, q8
        vbic            q3, q1, q8
        vadd.f32        q1, q0, q2
        vsub.f32        q0, q0, q3
        vst1.32         {d2-d3}, [r0,:128]!
        vst1.32         {d0-d1}, [r1,:128]!
        bx              lr
endfunc
#endif

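@ ff_vector_fmul_scalar_neon: dst (r0) = src (r1) * scalar.  The scalar and
@ length come from s0 and r2 under a hard-float ABI, or from r2 and r3 with
@ soft-float, which is what the VFP/NOVFP lines and the "len" alias handle.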
function ff_vector_fmul_scalar_neon, export=1
VFP     len     .req    r2
NOVFP   len     .req    r3
VFP     vdup.32         q8, d0[0]
NOVFP   vdup.32         q8, r2
        bics            r12, len, #15
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q1},[r1,:128]!
1:      vmul.f32        q0, q0, q8
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1, q1, q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2, q2, q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3, q3, q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vst1.32         {q2},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b
2:      vst1.32         {q2},[r0,:128]!
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15
        bxeq            lr
3:      vld1.32         {q0},[r1,:128]!
        vmul.f32        q0, q0, q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc

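@ ff_vector_fmul_sv_scalar_2_neon: multiply src (r1) by a scalar and by
@ 2-element vectors fetched through the pointer table in r2, writing the
@ products to r0.  The _4 variant below does the same with 4-element vectors.
@ The scalar and length follow the same hard-/soft-float split as above.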
function ff_vector_fmul_sv_scalar_2_neon, export=1
VFP     vdup.32         d16, d0[0]
NOVFP   vdup.32         d16, r3
NOVFP   ldr             r3, [sp]
        vld1.32         {d0},[r1,:64]!
        vld1.32         {d1},[r1,:64]!
1:      subs            r3, r3, #4
        vmul.f32        d4, d0, d16
        vmul.f32        d5, d1, d16
        ldr             r12, [r2], #4
        vld1.32         {d2},[r12,:64]
        ldr             r12, [r2], #4
        vld1.32         {d3},[r12,:64]
        vmul.f32        d4, d4, d2
        vmul.f32        d5, d5, d3
        beq             2f
        vld1.32         {d0},[r1,:64]!
        vld1.32         {d1},[r1,:64]!
        vst1.32         {d4},[r0,:64]!
        vst1.32         {d5},[r0,:64]!
        b               1b
2:      vst1.32         {d4},[r0,:64]!
        vst1.32         {d5},[r0,:64]!
        bx              lr
endfunc

function ff_vector_fmul_sv_scalar_4_neon, export=1
VFP     vdup.32         q10, d0[0]
NOVFP   vdup.32         q10, r3
NOVFP   ldr             r3, [sp]
        push            {lr}
        bics            lr, r3, #7
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q2},[r1,:128]!
1:      ldr             r12, [r2], #4
        vld1.32         {q1},[r12,:128]
        ldr             r12, [r2], #4
        vld1.32         {q3},[r12,:128]
        vmul.f32        q8, q0, q10
        vmul.f32        q8, q8, q1
        vmul.f32        q9, q2, q10
        vmul.f32        q9, q9, q3
        subs            lr, lr, #8
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q2},[r1,:128]!
        vst1.32         {q8},[r0,:128]!
        vst1.32         {q9},[r0,:128]!
        b               1b
2:      vst1.32         {q8},[r0,:128]!
        vst1.32         {q9},[r0,:128]!
        ands            r3, r3, #7
        popeq           {pc}
3:      vld1.32         {q0},[r1,:128]!
        ldr             r12, [r2], #4
        vld1.32         {q1},[r12,:128]
        vmul.f32        q0, q0, q10
        vmul.f32        q0, q0, q1
        vst1.32         {q0},[r0,:128]!
        subs            r3, r3, #4
        bgt             3b
        pop             {pc}
endfunc

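@ ff_sv_fmul_scalar_2/4_neon: multiply 2- or 4-element vectors addressed
@ through the pointer table in r1 by a scalar, storing contiguously at r0.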
function ff_sv_fmul_scalar_2_neon, export=1
VFP     len     .req    r2
NOVFP   len     .req    r3
VFP     vdup.32         q8, d0[0]
NOVFP   vdup.32         q8, r2
        ldr             r12, [r1], #4
        vld1.32         {d0},[r12,:64]
        ldr             r12, [r1], #4
        vld1.32         {d1},[r12,:64]
1:      vmul.f32        q1, q0, q8
        subs            len, len, #4
        beq             2f
        ldr             r12, [r1], #4
        vld1.32         {d0},[r12,:64]
        ldr             r12, [r1], #4
        vld1.32         {d1},[r12,:64]
        vst1.32         {q1},[r0,:128]!
        b               1b
2:      vst1.32         {q1},[r0,:128]!
        bx              lr
        .unreq          len
endfunc

function ff_sv_fmul_scalar_4_neon, export=1
VFP     len     .req    r2
NOVFP   len     .req    r3
VFP     vdup.32         q8, d0[0]
NOVFP   vdup.32         q8, r2
1:      ldr             r12, [r1], #4
        vld1.32         {q0},[r12,:128]
        vmul.f32        q0, q0, q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             1b
        bx              lr
        .unreq          len
endfunc

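@ ff_butterflies_float_neon (r0 = v1, r1 = v2, r2 = len): in-place
@ butterflies, (v1[i], v2[i]) becomes (v1[i] + v2[i], v1[i] - v2[i]).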
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0},[r0,:128]
        vld1.32         {q1},[r1,:128]
        vsub.f32        q2, q0, q1
        vadd.f32        q1, q0, q1
        vst1.32         {q2},[r1,:128]!
        vst1.32         {q1},[r0,:128]!
        subs            r2, r2, #4
        bgt             1b
        bx              lr
endfunc

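@ ff_scalarproduct_float_neon (r0 = v1, r1 = v2, r2 = len): dot product of
@ two float vectors; the result is returned in s0 (hard-float) or moved to
@ r0 by the NOVFP line (soft-float).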
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2, #0.0
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2, q0, q1
        subs            r2, r2, #4
        bgt             1b
        vadd.f32        d0, d4, d5
        vpadd.f32       d0, d0, d0
NOVFP   vmov.32         r0, d0[0]
        bx              lr
endfunc

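@ ff_int32_to_float_fmul_scalar_neon: convert 32-bit integers from r1 to
@ floats scaled by a scalar, storing to r0; the scalar and length come from
@ s0/r2 (hard-float) or r2/r3 (soft-float).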
function ff_int32_to_float_fmul_scalar_neon, export=1
VFP     vdup.32         q0, d0[0]
VFP     len     .req    r2
NOVFP   vdup.32         q0, r2
NOVFP   len     .req    r3

        vld1.32         {q1},[r1,:128]!
        vcvt.f32.s32    q3, q1
        vld1.32         {q2},[r1,:128]!
        vcvt.f32.s32    q8, q2
1:      subs            len, len, #8
        pld             [r1, #16]
        vmul.f32        q9, q3, q0
        vmul.f32        q10, q8, q0
        beq             2f
        vld1.32         {q1},[r1,:128]!
        vcvt.f32.s32    q3, q1
        vld1.32         {q2},[r1,:128]!
        vcvt.f32.s32    q8, q2
        vst1.32         {q9}, [r0,:128]!
        vst1.32         {q10},[r0,:128]!
        b               1b
2:      vst1.32         {q9}, [r0,:128]!
        vst1.32         {q10},[r0,:128]!
        bx              lr
        .unreq          len
endfunc

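@ ff_vector_fmul_reverse_neon (r0 = dst, r1 = src0, r2 = src1, r3 = len):
@ dst[i] = src0[i] * src1[len-1-i].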
function ff_vector_fmul_reverse_neon, export=1
        add             r2, r2, r3, lsl #2
        sub             r2, r2, #32
        mov             r12, #-32
        vld1.32         {q0-q1}, [r1,:128]!
        vld1.32         {q2-q3}, [r2,:128], r12
1:      pld             [r1, #32]
        vrev64.32       q3, q3
        vmul.f32        d16, d0, d7
        vmul.f32        d17, d1, d6
        pld             [r2, #-32]
        vrev64.32       q2, q2
        vmul.f32        d18, d2, d5
        vmul.f32        d19, d3, d4
        subs            r3, r3, #8
        beq             2f
        vld1.32         {q0-q1}, [r1,:128]!
        vld1.32         {q2-q3}, [r2,:128], r12
        vst1.32         {q8-q9}, [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9}, [r0,:128]!
        bx              lr
endfunc

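@ ff_vector_fmul_add_neon (r0 = dst, r1 = src0, r2 = src1, r3 = src2,
@ len on the stack): dst[i] = src0[i] * src1[i] + src2[i].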
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]
        vld1.32         {q0-q1}, [r1,:128]!
        vld1.32         {q8-q9}, [r2,:128]!
        vld1.32         {q2-q3}, [r3,:128]!
        vmul.f32        q10, q0, q8
        vmul.f32        q11, q1, q9
1:      vadd.f32        q12, q2, q10
        vadd.f32        q13, q3, q11
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f
        vld1.32         {q0}, [r1,:128]!
        vld1.32         {q8}, [r2,:128]!
        vmul.f32        q10, q0, q8
        vld1.32         {q1}, [r1,:128]!
        vld1.32         {q9}, [r2,:128]!
        vmul.f32        q11, q1, q9
        vld1.32         {q2-q3}, [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!
        b               1b
2:      vst1.32         {q12-q13},[r0,:128]!
        bx              lr
endfunc

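@ ff_vector_clipf_neon: clamp each float from src (r1) into [min, max] and
@ store to dst (r0).  min and max come from d0 (hard-float) or r2/r3
@ (soft-float); the element count is then in r2 or on the stack respectively.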
function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1, d0[1]
VFP     vdup.32         q0, d0[0]
NOVFP   vdup.32         q0, r2
NOVFP   vdup.32         q1, r3
NOVFP   ldr             r2, [sp]
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2, q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3, q1
1:      vmax.f32        q8, q10, q0
        vmax.f32        q9, q11, q0
        subs            r2, r2, #8
        beq             2f
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2, q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3, q1
        vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        b               1b
2:      vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        bx              lr
endfunc