annotate arm/dsputil_neon.S @ 11816:7c2369ec6faa libavcodec
ARM: check struct offsets only when they are used
The offsets differ depending on configuration, so only check them when
they will actually be used. Presently, this is when NEON is enabled.
author   mru
date     Wed, 02 Jun 2010 22:05:25 +0000
parents  659f16d04776
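The change described above boils down to a configuration-gated compile-time check. The following is a minimal sketch of that pattern in C; every name in it (example_ctx, CHECK_OFFSET, the expected offset value) is hypothetical and not taken from the FFmpeg sources. It only illustrates how an offset assumed by hand-written asm can be verified at build time solely when the code that relies on it (here, the NEON functions) is compiled in.

/* Sketch only: gate a hard-coded struct offset check on the NEON config. */
#include <stddef.h>

struct example_ctx {
    int width;
    int height;                       /* field the asm would reach via a fixed offset */
};

/* A negative array size makes the build fail when the offset is wrong. */
#define CHECK_OFFSET(type, member, expected) \
    typedef char check_##member[offsetof(struct type, member) == (expected) ? 1 : -1]

#if HAVE_NEON                         /* only check when the NEON asm is built */
CHECK_OFFSET(example_ctx, height, 4); /* 4 == offsetof(struct example_ctx, height) */
#endif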
Source of arm/dsputil_neon.S at this revision:
/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

        preserve8
        .text

function ff_clear_block_neon, export=1
        vmov.i16        q0, #0
        .rept           8
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

function ff_clear_blocks_neon, export=1
        vmov.i16        q0, #0
        .rept           8*6
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

        .macro pixels16 avg=0
        .if \avg
        mov             ip, r0
        .endif
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
        vld1.64         {d4, d5},  [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d6, d7},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
        .if \avg
        vld1.64         {d16,d17}, [ip,:128], r2
        vrhadd.u8       q0, q0, q8
        vld1.64         {d18,d19}, [ip,:128], r2
        vrhadd.u8       q1, q1, q9
        vld1.64         {d20,d21}, [ip,:128], r2
        vrhadd.u8       q2, q2, q10
        vld1.64         {d22,d23}, [ip,:128], r2
        vrhadd.u8       q3, q3, q11
        .endif
        subs            r3, r3, #4
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d2, d3},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels16_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3, r3, #2
        vext.8          q1, q0, q1, #1
        \vhadd          q0, q0, q1
        vext.8          q3, q2, q3, #1
        \vhadd          q2, q2, q3
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels16_y2 vhadd=vrhadd.u8
        vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
1:      subs            r3, r3, #2
        \vhadd          q2, q0, q1
        vld1.64         {d0, d1},  [r1], r2
        \vhadd          q3, q0, q1
        vld1.64         {d2, d3},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
        .if \no_rnd
        vmov.i16        q13, #1
        .endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1, q0, q1, #1
        vext.8          q3, q2, q3, #1
        vaddl.u8        q8,  d0, d2
        vaddl.u8        q10, d1, d3
        vaddl.u8        q9,  d4, d6
        vaddl.u8        q11, d5, d7
1:      subs            r3, r3, #2
        vld1.64         {d0-d2},   [r1], r2
        vadd.u16        q12, q8, q9
        pld             [r1]
        .if \no_rnd
        vadd.u16        q12, q12, q13
        .endif
        vext.8          q15, q0, q1, #1
        vadd.u16        q1, q10, q11
        \vshrn          d28, q12, #2
        .if \no_rnd
        vadd.u16        q1, q1, q13
        .endif
        \vshrn          d29, q1, #2
        vaddl.u8        q8,  d0, d30
        vld1.64         {d2-d4},   [r1], r2
        vaddl.u8        q10, d1, d31
        vst1.64         {d28,d29}, [r0,:128], r2
        vadd.u16        q12, q8, q9
        pld             [r1, r2]
        .if \no_rnd
        vadd.u16        q12, q12, q13
        .endif
        vext.8          q2, q1, q2, #1
        vadd.u16        q0, q10, q11
        \vshrn          d30, q12, #2
        .if \no_rnd
        vadd.u16        q0, q0, q13
        .endif
        \vshrn          d31, q0, #2
        vaddl.u8        q9,  d2, d4
        vaddl.u8        q11, d3, d5
        vst1.64         {d30,d31}, [r0,:128], r2
        bgt             1b
        bx              lr
        .endm

        .macro pixels8 avg=0
1:      vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
        .if \avg
        vld1.64         {d4}, [r0,:64], r2
        vrhadd.u8       d0, d0, d4
        vld1.64         {d5}, [r0,:64], r2
        vrhadd.u8       d1, d1, d5
        vld1.64         {d6}, [r0,:64], r2
        vrhadd.u8       d2, d2, d6
        vld1.64         {d7}, [r0,:64], r2
        vrhadd.u8       d3, d3, d7
        sub             r0, r0, r2, lsl #2
        .endif
        subs            r3, r3, #4
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels8_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0, d1},  [r1], r2
        vext.8          d1, d0, d1, #1
        vld1.64         {d2, d3},  [r1], r2
        vext.8          d3, d2, d3, #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3, r3, #2
        vswp            d1, d2
        \vhadd          q0, q0, q1
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels8_y2 vhadd=vrhadd.u8
        vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
1:      subs            r3, r3, #2
        \vhadd          d4, d0, d1
        vld1.64         {d0}, [r1], r2
        \vhadd          d5, d0, d1
        vld1.64         {d1}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4}, [r0,:64], r2
        vst1.64         {d5}, [r0,:64], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
        .if \no_rnd
        vmov.i16        q11, #1
        .endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4, d0, d1, #1
        vext.8          d6, d2, d3, #1
        vaddl.u8        q8, d0, d4
        vaddl.u8        q9, d2, d6
1:      subs            r3, r3, #2
        vld1.64         {d0, d1},  [r1], r2
        pld             [r1]
        vadd.u16        q10, q8, q9
        vext.8          d4, d0, d1, #1
        .if \no_rnd
        vadd.u16        q10, q10, q11
        .endif
        vaddl.u8        q8, d0, d4
        \vshrn          d5, q10, #2
        vld1.64         {d2, d3},  [r1], r2
        vadd.u16        q10, q8, q9
        pld             [r1, r2]
        .if \no_rnd
        vadd.u16        q10, q10, q11
        .endif
        vst1.64         {d5}, [r0,:64], r2
        \vshrn          d7, q10, #2
        vext.8          d6, d2, d3, #1
        vaddl.u8        q9, d2, d6
        vst1.64         {d7}, [r0,:64], r2
        bgt             1b
        bx              lr
        .endm

        .macro pixfunc pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name \rnd_op \args
endfunc
        .endm

        .macro pixfunc2 pfx name args:vararg
        pixfunc \pfx \name
        pixfunc \pfx \name \args
        .endm

function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3, #16
endfunc

        pixfunc         put_ pixels16
        pixfunc2        put_ pixels16_x2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels16_y2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels16_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3, #16
endfunc

        pixfunc         avg_ pixels16,, 1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3, #8
endfunc

        pixfunc         put_ pixels8
        pixfunc2        put_ pixels8_x2,   _no_rnd, vhadd.u8
        pixfunc2        put_ pixels8_y2,   _no_rnd, vhadd.u8
        pixfunc2        put_ pixels8_xy2,  _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3, #8
endfunc

        pixfunc         avg_ pixels8,, 1

function ff_put_pixels_clamped_neon, export=1
        vld1.64         {d16-d19}, [r0,:128]!
        vqmovun.s16     d0, q8
        vld1.64         {d20-d23}, [r0,:128]!
        vqmovun.s16     d1, q9
        vld1.64         {d24-d27}, [r0,:128]!
        vqmovun.s16     d2, q10
        vld1.64         {d28-d31}, [r0,:128]!
        vqmovun.s16     d3, q11
        vst1.64         {d0}, [r1,:64], r2
        vqmovun.s16     d4, q12
        vst1.64         {d1}, [r1,:64], r2
        vqmovun.s16     d5, q13
        vst1.64         {d2}, [r1,:64], r2
        vqmovun.s16     d6, q14
        vst1.64         {d3}, [r1,:64], r2
        vqmovun.s16     d7, q15
        vst1.64         {d4}, [r1,:64], r2
        vst1.64         {d5}, [r1,:64], r2
        vst1.64         {d6}, [r1,:64], r2
        vst1.64         {d7}, [r1,:64], r2
        bx              lr
endfunc

function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0, q8
        vld1.64         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1, q9
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2, q8
        vld1.64         {d18-d19}, [r0,:128]!
        vadd.u8         d0, d0, d31
        vld1.64         {d20-d21}, [r0,:128]!
        vadd.u8         d1, d1, d31
        vld1.64         {d22-d23}, [r0,:128]!
        vadd.u8         d2, d2, d31
        vst1.64         {d0}, [r1,:64], r2
        vqmovn.s16      d3, q9
        vst1.64         {d1}, [r1,:64], r2
        vqmovn.s16      d4, q10
        vst1.64         {d2}, [r1,:64], r2
        vqmovn.s16      d5, q11
        vld1.64         {d24-d25}, [r0,:128]!
        vadd.u8         d3, d3, d31
        vld1.64         {d26-d27}, [r0,:128]!
        vadd.u8         d4, d4, d31
        vadd.u8         d5, d5, d31
        vst1.64         {d3}, [r1,:64], r2
        vqmovn.s16      d6, q12
        vst1.64         {d4}, [r1,:64], r2
        vqmovn.s16      d7, q13
        vst1.64         {d5}, [r1,:64], r2
        vadd.u8         d6, d6, d31
        vadd.u8         d7, d7, d31
        vst1.64         {d6}, [r1,:64], r2
        vst1.64         {d7}, [r1,:64], r2
        bx              lr
endfunc

function ff_add_pixels_clamped_neon, export=1
        mov             r3, r1
        vld1.64         {d16},   [r1,:64], r2
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vld1.64         {d17},   [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vqmovun.s16     d0, q0
        vld1.64         {d18},   [r1,:64], r2
        vaddw.u8        q1, q1, d17
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.64         {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.64         {d19},   [r1,:64], r2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vqmovun.s16     d4, q2
        vst1.64         {d2},    [r3,:64], r2
        vld1.64         {d16},   [r1,:64], r2
        vqmovun.s16     d6, q3
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vst1.64         {d4},    [r3,:64], r2
        vld1.64         {d17},   [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vaddw.u8        q1, q1, d17
        vst1.64         {d6},    [r3,:64], r2
        vqmovun.s16     d0, q0
        vld1.64         {d18},   [r1,:64], r2
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.64         {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.64         {d19},   [r1,:64], r2
        vqmovun.s16     d4, q2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vst1.64         {d2},    [r3,:64], r2
        vqmovun.s16     d6, q3
        vst1.64         {d4},    [r3,:64], r2
        vst1.64         {d6},    [r3,:64], r2
        bx              lr
endfunc

function ff_float_to_int16_neon, export=1
        subs            r2, r2, #8
        vld1.64         {d0-d1},  [r1,:128]!
        vcvt.s32.f32    q8, q0, #16
        vld1.64         {d2-d3},  [r1,:128]!
        vcvt.s32.f32    q9, q1, #16
        beq             3f
        bics            ip, r2, #15
        beq             2f
1:      subs            ip, ip, #16
        vshrn.s32       d4, q8, #16
        vld1.64         {d0-d1},  [r1,:128]!
        vcvt.s32.f32    q0, q0, #16
        vshrn.s32       d5, q9, #16
        vld1.64         {d2-d3},  [r1,:128]!
        vcvt.s32.f32    q1, q1, #16
        vshrn.s32       d6, q0, #16
        vst1.64         {d4-d5},  [r0,:128]!
        vshrn.s32       d7, q1, #16
        vld1.64         {d16-d17},[r1,:128]!
        vcvt.s32.f32    q8, q8, #16
        vld1.64         {d18-d19},[r1,:128]!
        vcvt.s32.f32    q9, q9, #16
        vst1.64         {d6-d7},  [r0,:128]!
        bne             1b
        ands            r2, r2, #15
        beq             3f
2:      vld1.64         {d0-d1},  [r1,:128]!
        vshrn.s32       d4, q8, #16
        vcvt.s32.f32    q0, q0, #16
        vld1.64         {d2-d3},  [r1,:128]!
        vshrn.s32       d5, q9, #16
        vcvt.s32.f32    q1, q1, #16
        vshrn.s32       d6, q0, #16
        vst1.64         {d4-d5},  [r0,:128]!
        vshrn.s32       d7, q1, #16
        vst1.64         {d6-d7},  [r0,:128]!
        bx              lr
3:      vshrn.s32       d4, q8, #16
        vshrn.s32       d5, q9, #16
        vst1.64         {d4-d5},  [r0,:128]!
        bx              lr
endfunc

function ff_float_to_int16_interleave_neon, export=1
        cmp             r3, #2
        ldrlt           r1, [r1]
        blt             ff_float_to_int16_neon
        bne             4f

        ldr             r3, [r1]
        ldr             r1, [r1, #4]

        subs            r2, r2, #8
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q8, q0, #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q9, q1, #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             3f
        bics            ip, r2, #15
        beq             2f
1:      subs            ip, ip, #16
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q0, q0, #16
        vsri.32         q10, q8, #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q1, q1, #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vld1.64         {d26-d27},[r1,:128]!
        vsri.32         q11, q9, #16
        vst1.64         {d20-d21},[r0,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q12, q0, #16
        vld1.64         {d16-d17},[r3,:128]!
        vsri.32         q13, q1, #16
        vst1.64         {d24-d25},[r0,:128]!
        vcvt.s32.f32    q8, q8, #16
        vld1.64         {d18-d19},[r3,:128]!
        vcvt.s32.f32    q9, q9, #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d26-d27},[r0,:128]!
        bne             1b
        ands            r2, r2, #15
        beq             3f
2:      vsri.32         q10, q8, #16
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q0, q0, #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q1, q1, #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vsri.32         q11, q9, #16
        vld1.64         {d26-d27},[r1,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d20-d21},[r0,:128]!
        vsri.32         q12, q0, #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q13, q1, #16
        vst1.64         {d24-d27},[r0,:128]!
        bx              lr
3:      vsri.32         q10, q8, #16
        vsri.32         q11, q9, #16
        vst1.64         {d20-d23},[r0,:128]!
        bx              lr

4:      push            {r4-r8,lr}
        cmp             r3, #4
        lsl             ip, r3, #1
        blt             4f

        @ 4 channels
5:      ldmia           r1!, {r4-r7}
        mov             lr, r2
        mov             r8, r0
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8, q8, #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9, q9, #16
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr, lr, #8
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0, q0, #16
        vsri.32         q9, q8, #16
        vld1.64         {d2-d3},  [r5,:128]!
        vcvt.s32.f32    q1, q1, #16
        vsri.32         q11, q10, #16
        vld1.64         {d4-d5},  [r6,:128]!
        vcvt.s32.f32    q2, q2, #16
        vzip.32         d18, d22
        vld1.64         {d6-d7},  [r7,:128]!
        vcvt.s32.f32    q3, q3, #16
        vzip.32         d19, d23
        vst1.64         {d18},    [r8], ip
        vsri.32         q1, q0, #16
        vst1.64         {d22},    [r8], ip
        vsri.32         q3, q2, #16
        vst1.64         {d19},    [r8], ip
        vzip.32         d2, d6
        vst1.64         {d23},    [r8], ip
        vzip.32         d3, d7
        beq             7f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8, q8, #16
        vst1.64         {d2},     [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9, q9, #16
        vst1.64         {d6},     [r8], ip
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.64         {d3},     [r8], ip
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d7},     [r8], ip
        b               6b
7:      vst1.64         {d2},     [r8], ip
        vst1.64         {d6},     [r8], ip
        vst1.64         {d3},     [r8], ip
        vst1.64         {d7},     [r8], ip
        subs            r3, r3, #4
        popeq           {r4-r8,pc}
        cmp             r3, #4
        add             r0, r0, #8
        bge             5b

        @ 2 channels
4:      cmp             r3, #2
        blt             4f
        ldmia           r1!, {r4-r5}
        mov             lr, r2
        mov             r8, r0
        tst             lr, #8
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8, q8, #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9, q9, #16
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             6f
        subs            lr, lr, #8
        beq             7f
        vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8, q8, #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9, q9, #16
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr, lr, #16
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0, q0, #16
        vsri.32         d18, d16, #16
        vld1.64         {d2-d3},  [r5,:128]!
        vcvt.s32.f32    q1, q1, #16
        vsri.32         d19, d17, #16
        vld1.64         {d4-d5},  [r4,:128]!
        vcvt.s32.f32    q2, q2, #16
        vld1.64         {d6-d7},  [r5,:128]!
        vcvt.s32.f32    q3, q3, #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vsri.32         d2, d0, #16
        vst1.32         {d19[1]}, [r8], ip
        vsri.32         d3, d1, #16
        vst1.32         {d22[0]}, [r8], ip
        vsri.32         d6, d4, #16
        vst1.32         {d22[1]}, [r8], ip
        vsri.32         d7, d5, #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        beq             6f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8, q8, #16
        vst1.32         {d2[0]},  [r8], ip
        vst1.32         {d2[1]},  [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9, q9, #16
        vst1.32         {d3[0]},  [r8], ip
        vst1.32         {d3[1]},  [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d6[0]},  [r8], ip
        vst1.32         {d6[1]},  [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.32         {d7[0]},  [r8], ip
        vst1.32         {d7[1]},  [r8], ip
        bgt             6b
6:      vst1.32         {d2[0]},  [r8], ip
        vst1.32         {d2[1]},  [r8], ip
        vst1.32         {d3[0]},  [r8], ip
        vst1.32         {d3[1]},  [r8], ip
        vst1.32         {d6[0]},  [r8], ip
        vst1.32         {d6[1]},  [r8], ip
        vst1.32         {d7[0]},  [r8], ip
        vst1.32         {d7[1]},  [r8], ip
        b               8f
7:      vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
8:      subs            r3, r3, #2
        add             r0, r0, #4
        popeq           {r4-r8,pc}

        @ 1 channel
4:      ldr             r4, [r1], #4
        tst             r2, #8
        mov             lr, r2
        mov             r5, r0
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0, q0, #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1, q1, #16
        bne             8f
6:      subs            lr, lr, #16
        vld1.64         {d4-d5},  [r4,:128]!
        vcvt.s32.f32    q2, q2, #16
        vld1.64         {d6-d7},  [r4,:128]!
        vcvt.s32.f32    q3, q3, #16
        vst1.16         {d0[1]},  [r5,:16], ip
        vst1.16         {d0[3]},  [r5,:16], ip
        vst1.16         {d1[1]},  [r5,:16], ip
        vst1.16         {d1[3]},  [r5,:16], ip
        vst1.16         {d2[1]},  [r5,:16], ip
        vst1.16         {d2[3]},  [r5,:16], ip
        vst1.16         {d3[1]},  [r5,:16], ip
        vst1.16         {d3[3]},  [r5,:16], ip
        beq             7f
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0, q0, #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1, q1, #16
7:      vst1.16         {d4[1]},  [r5,:16], ip
        vst1.16         {d4[3]},  [r5,:16], ip
        vst1.16         {d5[1]},  [r5,:16], ip
        vst1.16         {d5[3]},  [r5,:16], ip
        vst1.16         {d6[1]},  [r5,:16], ip
        vst1.16         {d6[3]},  [r5,:16], ip
        vst1.16         {d7[1]},  [r5,:16], ip
        vst1.16         {d7[3]},  [r5,:16], ip
        bgt             6b
        pop             {r4-r8,pc}
8:      subs            lr, lr, #8
        vst1.16         {d0[1]},  [r5,:16], ip
        vst1.16         {d0[3]},  [r5,:16], ip
        vst1.16         {d1[1]},  [r5,:16], ip
        vst1.16         {d1[3]},  [r5,:16], ip
        vst1.16         {d2[1]},  [r5,:16], ip
        vst1.16         {d2[3]},  [r5,:16], ip
        vst1.16         {d3[1]},  [r5,:16], ip
        vst1.16         {d3[3]},  [r5,:16], ip
        popeq           {r4-r8,pc}
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0, q0, #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1, q1, #16
        b               6b
endfunc

function ff_vector_fmul_neon, export=1
        mov             r3, r0
        subs            r2, r2, #8
        vld1.64         {d0-d3},  [r0,:128]!
        vld1.64         {d4-d7},  [r1,:128]!
        vmul.f32        q8, q0, q2
        vmul.f32        q9, q1, q3
        beq             3f
        bics            ip, r2, #15
        beq             2f
1:      subs            ip, ip, #16
        vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vmul.f32        q10, q0, q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vmul.f32        q11, q1, q3
        vst1.64         {d16-d19},[r3,:128]!
        vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vmul.f32        q8, q0, q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vmul.f32        q9, q1, q3
        vst1.64         {d20-d23},[r3,:128]!
        bne             1b
        ands            r2, r2, #15
        beq             3f
2:      vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vst1.64         {d16-d17},[r3,:128]!
        vmul.f32        q8, q0, q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vst1.64         {d18-d19},[r3,:128]!
        vmul.f32        q9, q1, q3
3:      vst1.64         {d16-d19},[r3,:128]!
        bx              lr
endfunc

function ff_vector_fmul_window_neon, export=1
VFP     vdup.32         q8, d0[0]
NOVFP   vld1.32         {d16[],d17[]}, [sp,:32]
        push            {r4,r5,lr}
VFP     ldr             lr, [sp, #12]
NOVFP   ldr             lr, [sp, #16]
        sub             r2, r2, #8
        sub             r5, lr, #2
        add             r2, r2, r5, lsl #2
        add             r4, r3, r5, lsl #3
        add             ip, r0, r5, lsl #3
        mov             r5, #-16
        vld1.64         {d0,d1},  [r1,:128]!
        vld1.64         {d2,d3},  [r2,:128], r5
        vld1.64         {d4,d5},  [r3,:128]!
        vld1.64         {d6,d7},  [r4,:128], r5
1:      subs            lr, lr, #4
        vmov            q11, q8
        vmla.f32        d22, d0, d4
        vmov            q10, q8
        vmla.f32        d23, d1, d5
        vrev64.32       q3, q3
        vmla.f32        d20, d0, d7
        vrev64.32       q1, q1
        vmla.f32        d21, d1, d6
        beq             2f
        vmla.f32        d22, d3, d7
        vld1.64         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2, d6
        vld1.64         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3, d4
        vld1.64         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2, d5
        vld1.64         {d6,d7},  [r4,:128], r5
        vmov            q1, q9
        vrev64.32       q11, q11
        vmov            q2, q12
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3, d7
        vmla.f32        d23, d2, d6
        vmls.f32        d20, d3, d4
        vmls.f32        d21, d2, d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
endfunc

#if CONFIG_VORBIS_DECODER
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32        q10, #1<<31
        subs            r2, r2, #4
        mov             r3, r0
        mov             r12, r1
        beq             3f

        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8, q12, #0
        vand            q9, q11, q10
        veor            q12, q12, q9
        vand            q2, q12, q8
        vbic            q3, q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
1:      vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d0-d1},  [r0,:128]!
        vcle.s32        q8, q1, #0
        vand            q9, q0, q10
        veor            q1, q1, q9
        vst1.32         {d24-d25},[r3, :128]!
        vst1.32         {d22-d23},[r12,:128]!
        vand            q2, q1, q8
        vbic            q3, q1, q8
        vadd.f32        q1, q0, q2
        vsub.f32        q0, q0, q3
        subs            r2, r2, #8
        ble             2f
        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8, q12, #0
        vand            q9, q11, q10
        veor            q12, q12, q9
        vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        vand            q2, q12, q8
        vbic            q3, q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
        b               1b

2:      vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        bxlt            lr

3:      vld1.32         {d2-d3},  [r1,:128]
        vld1.32         {d0-d1},  [r0,:128]
        vcle.s32        q8, q1, #0
        vand            q9, q0, q10
        veor            q1, q1, q9
        vand            q2, q1, q8
        vbic            q3, q1, q8
        vadd.f32        q1, q0, q2
        vsub.f32        q0, q0, q3
        vst1.32         {d2-d3},  [r0,:128]!
        vst1.32         {d0-d1},  [r1,:128]!
        bx              lr
endfunc
#endif

function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8, d0[0]
NOVFP   vdup.32         q8, r2
        bics            r12, len, #15
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q1},[r1,:128]!
1:      vmul.f32        q0, q0, q8
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1, q1, q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2, q2, q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3, q3, q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vst1.32         {q2},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b
2:      vst1.32         {q2},[r0,:128]!
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15
        bxeq            lr
3:      vld1.32         {q0},[r1,:128]!
        vmul.f32        q0, q0, q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc

function ff_vector_fmul_sv_scalar_2_neon, export=1
VFP     vdup.32         d16, d0[0]
NOVFP   vdup.32         d16, r3
NOVFP   ldr             r3, [sp]
        vld1.32         {d0},[r1,:64]!
        vld1.32         {d1},[r1,:64]!
1:      subs            r3, r3, #4
        vmul.f32        d4, d0, d16
        vmul.f32        d5, d1, d16
        ldr             r12, [r2], #4
        vld1.32         {d2},[r12,:64]
        ldr             r12, [r2], #4
        vld1.32         {d3},[r12,:64]
        vmul.f32        d4, d4, d2
        vmul.f32        d5, d5, d3
        beq             2f
        vld1.32         {d0},[r1,:64]!
        vld1.32         {d1},[r1,:64]!
        vst1.32         {d4},[r0,:64]!
        vst1.32         {d5},[r0,:64]!
        b               1b
2:      vst1.32         {d4},[r0,:64]!
        vst1.32         {d5},[r0,:64]!
        bx              lr
endfunc

function ff_vector_fmul_sv_scalar_4_neon, export=1
VFP     vdup.32         q10, d0[0]
NOVFP   vdup.32         q10, r3
NOVFP   ldr             r3, [sp]
        push            {lr}
        bics            lr, r3, #7
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q2},[r1,:128]!
1:      ldr             r12, [r2], #4
        vld1.32         {q1},[r12,:128]
        ldr             r12, [r2], #4
        vld1.32         {q3},[r12,:128]
        vmul.f32        q8, q0, q10
        vmul.f32        q8, q8, q1
        vmul.f32        q9, q2, q10
        vmul.f32        q9, q9, q3
        subs            lr, lr, #8
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q2},[r1,:128]!
        vst1.32         {q8},[r0,:128]!
        vst1.32         {q9},[r0,:128]!
        b               1b
2:      vst1.32         {q8},[r0,:128]!
        vst1.32         {q9},[r0,:128]!
        ands            r3, r3, #7
        popeq           {pc}
3:      vld1.32         {q0},[r1,:128]!
        ldr             r12, [r2], #4
        vld1.32         {q1},[r12,:128]
        vmul.f32        q0, q0, q10
        vmul.f32        q0, q0, q1
        vst1.32         {q0},[r0,:128]!
        subs            r3, r3, #4
        bgt             3b
        pop             {pc}
endfunc

function ff_sv_fmul_scalar_2_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8, d0[0]
NOVFP   vdup.32         q8, r2
        ldr             r12, [r1], #4
        vld1.32         {d0},[r12,:64]
        ldr             r12, [r1], #4
        vld1.32         {d1},[r12,:64]
1:      vmul.f32        q1, q0, q8
        subs            len, len, #4
        beq             2f
        ldr             r12, [r1], #4
        vld1.32         {d0},[r12,:64]
        ldr             r12, [r1], #4
        vld1.32         {d1},[r12,:64]
        vst1.32         {q1},[r0,:128]!
        b               1b
2:      vst1.32         {q1},[r0,:128]!
        bx              lr
        .unreq          len
endfunc

function ff_sv_fmul_scalar_4_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8, d0[0]
NOVFP   vdup.32         q8, r2
1:      ldr             r12, [r1], #4
        vld1.32         {q0},[r12,:128]
        vmul.f32        q0, q0, q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             1b
        bx              lr
        .unreq          len
endfunc

function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0},[r0,:128]
        vld1.32         {q1},[r1,:128]
        vsub.f32        q2, q0, q1
        vadd.f32        q1, q0, q1
        vst1.32         {q2},[r1,:128]!
        vst1.32         {q1},[r0,:128]!
        subs            r2, r2, #4
        bgt             1b
        bx              lr
endfunc

function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2, #0.0
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2, q0, q1
        subs            r2, r2, #4
        bgt             1b
        vadd.f32        d0, d4, d5
        vpadd.f32       d0, d0, d0
NOVFP   vmov.32         r0, d0[0]
        bx              lr
endfunc

function ff_int32_to_float_fmul_scalar_neon, export=1
VFP     vdup.32         q0, d0[0]
VFP     len .req r2
NOVFP   vdup.32         q0, r2
NOVFP   len .req r3

        vld1.32         {q1},[r1,:128]!
        vcvt.f32.s32    q3, q1
        vld1.32         {q2},[r1,:128]!
        vcvt.f32.s32    q8, q2
1:      subs            len, len, #8
        pld             [r1, #16]
        vmul.f32        q9, q3, q0
        vmul.f32        q10, q8, q0
        beq             2f
        vld1.32         {q1},[r1,:128]!
        vcvt.f32.s32    q3, q1
        vld1.32         {q2},[r1,:128]!
        vcvt.f32.s32    q8, q2
        vst1.32         {q9}, [r0,:128]!
        vst1.32         {q10},[r0,:128]!
        b               1b
2:      vst1.32         {q9}, [r0,:128]!
        vst1.32         {q10},[r0,:128]!
        bx              lr
        .unreq          len
endfunc

function ff_vector_fmul_reverse_neon, export=1
        add             r2, r2, r3, lsl #2
        sub             r2, r2, #32
        mov             r12, #-32
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
1:      pld             [r1, #32]
        vrev64.32       q3, q3
        vmul.f32        d16, d0, d7
        vmul.f32        d17, d1, d6
        pld             [r2, #-32]
        vrev64.32       q2, q2
        vmul.f32        d18, d2, d5
        vmul.f32        d19, d3, d4
        subs            r3, r3, #8
        beq             2f
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        vst1.32         {q8-q9},  [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9},  [r0,:128]!
        bx              lr
endfunc

function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q8-q9},  [r2,:128]!
        vld1.32         {q2-q3},  [r3,:128]!
        vmul.f32        q10, q0, q8
        vmul.f32        q11, q1, q9
1:      vadd.f32        q12, q2, q10
        vadd.f32        q13, q3, q11
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [r2,:128]!
        vmul.f32        q10, q0, q8
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [r2,:128]!
        vmul.f32        q11, q1, q9
        vld1.32         {q2-q3},  [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!
        b               1b
2:      vst1.32         {q12-q13},[r0,:128]!
        bx              lr
endfunc

function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1, d0[1]
VFP     vdup.32         q0, d0[0]
NOVFP   vdup.32         q0, r2
NOVFP   vdup.32         q1, r3
NOVFP   ldr             r2, [sp]
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2, q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3, q1
1:      vmax.f32        q8, q10, q0
        vmax.f32        q9, q11, q0
        subs            r2, r2, #8
        beq             2f
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2, q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3, q1
        vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        b               1b
2:      vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        bx              lr
endfunc