Mercurial > libavcodec.hg
annotate arm/dsputil_neon.S @ 11557:53822d92c3f7 libavcodec
Make sure the EC code does not attempt to use inter based concealment if there
is no reference frame available. (this can happen because the EC code will attempt
to use reference frames even for I/IDR frames)
author | michael |
---|---|
date | Tue, 30 Mar 2010 20:46:46 +0000 |
parents | 361a5fcb4393 |
children | 659f16d04776 |
rev | line source |
---|---|
/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
10046 | 22 #include "config.h" |
8334 | 23 #include "asm.S" |
24 | |
25 preserve8 | |
26 .text | |
27 | |
28 .macro pixels16 avg=0 | |
29 .if \avg | |
30 mov ip, r0 | |
31 .endif | |
32 1: vld1.64 {d0, d1}, [r1], r2 | |
33 vld1.64 {d2, d3}, [r1], r2 | |
34 vld1.64 {d4, d5}, [r1], r2 | |
35 pld [r1, r2, lsl #2] | |
36 vld1.64 {d6, d7}, [r1], r2 | |
37 pld [r1] | |
38 pld [r1, r2] | |
39 pld [r1, r2, lsl #1] | |
40 .if \avg | |
9451
93c20dd3da43
Add guaranteed alignment for loading dest pixels in avg_pixels16_neon
conrad
parents:
9345
diff
changeset
|
41 vld1.64 {d16,d17}, [ip,:128], r2 |
8334 | 42 vrhadd.u8 q0, q0, q8 |
9451
93c20dd3da43
Add guaranteed alignment for loading dest pixels in avg_pixels16_neon
conrad
parents:
9345
diff
changeset
|
43 vld1.64 {d18,d19}, [ip,:128], r2 |
8334 | 44 vrhadd.u8 q1, q1, q9 |
9451
93c20dd3da43
Add guaranteed alignment for loading dest pixels in avg_pixels16_neon
conrad
parents:
9345
diff
changeset
|
45 vld1.64 {d20,d21}, [ip,:128], r2 |
8334 | 46 vrhadd.u8 q2, q2, q10 |
9451
93c20dd3da43
Add guaranteed alignment for loading dest pixels in avg_pixels16_neon
conrad
parents:
9345
diff
changeset
|
47 vld1.64 {d22,d23}, [ip,:128], r2 |
8334 | 48 vrhadd.u8 q3, q3, q11 |
49 .endif | |
50 subs r3, r3, #4 | |
51 vst1.64 {d0, d1}, [r0,:128], r2 | |
52 vst1.64 {d2, d3}, [r0,:128], r2 | |
53 vst1.64 {d4, d5}, [r0,:128], r2 | |
54 vst1.64 {d6, d7}, [r0,:128], r2 | |
55 bne 1b | |
56 bx lr | |
57 .endm | |
58 | |
59 .macro pixels16_x2 vhadd=vrhadd.u8 | |
60 1: vld1.64 {d0-d2}, [r1], r2 | |
61 vld1.64 {d4-d6}, [r1], r2 | |
62 pld [r1] | |
63 pld [r1, r2] | |
64 subs r3, r3, #2 | |
65 vext.8 q1, q0, q1, #1 | |
66 \vhadd q0, q0, q1 | |
67 vext.8 q3, q2, q3, #1 | |
68 \vhadd q2, q2, q3 | |
69 vst1.64 {d0, d1}, [r0,:128], r2 | |
70 vst1.64 {d4, d5}, [r0,:128], r2 | |
71 bne 1b | |
72 bx lr | |
73 .endm | |
74 | |
75 .macro pixels16_y2 vhadd=vrhadd.u8 | |
9581
2b3b9358bee7
ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents:
9580
diff
changeset
|
76 vld1.64 {d0, d1}, [r1], r2 |
2b3b9358bee7
ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents:
9580
diff
changeset
|
77 vld1.64 {d2, d3}, [r1], r2 |
8334 | 78 1: subs r3, r3, #2 |
79 \vhadd q2, q0, q1 | |
9581
2b3b9358bee7
ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents:
9580
diff
changeset
|
80 vld1.64 {d0, d1}, [r1], r2 |
8334 | 81 \vhadd q3, q0, q1 |
9581
2b3b9358bee7
ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents:
9580
diff
changeset
|
82 vld1.64 {d2, d3}, [r1], r2 |
8334 | 83 pld [r1] |
9581
2b3b9358bee7
ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents:
9580
diff
changeset
|
84 pld [r1, r2] |
8334 | 85 vst1.64 {d4, d5}, [r0,:128], r2 |
86 vst1.64 {d6, d7}, [r0,:128], r2 | |
87 bne 1b | |
9581
2b3b9358bee7
ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents:
9580
diff
changeset
|
88 bx lr |
8334 | 89 .endm |
90 | |
91 .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0 | |
9581
2b3b9358bee7
ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents:
9580
diff
changeset
|
92 vld1.64 {d0-d2}, [r1], r2 |
2b3b9358bee7
ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents:
9580
diff
changeset
|
93 vld1.64 {d4-d6}, [r1], r2 |
8334 | 94 .if \no_rnd |
95 vmov.i16 q13, #1 | |
96 .endif | |
97 pld [r1] | |
9581
2b3b9358bee7
ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents:
9580
diff
changeset
|
98 pld [r1, r2] |
8334 | 99 vext.8 q1, q0, q1, #1 |
100 vext.8 q3, q2, q3, #1 | |
101 vaddl.u8 q8, d0, d2 | |
102 vaddl.u8 q10, d1, d3 | |
103 vaddl.u8 q9, d4, d6 | |
104 vaddl.u8 q11, d5, d7 | |
105 1: subs r3, r3, #2 | |
9581
2b3b9358bee7
ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents:
9580
diff
changeset
|
106 vld1.64 {d0-d2}, [r1], r2 |
8334 | 107 vadd.u16 q12, q8, q9 |
108 pld [r1] | |
109 .if \no_rnd | |
110 vadd.u16 q12, q12, q13 | |
111 .endif | |
112 vext.8 q15, q0, q1, #1 | |
113 vadd.u16 q1 , q10, q11 | |
114 \vshrn d28, q12, #2 | |
115 .if \no_rnd | |
116 vadd.u16 q1, q1, q13 | |
117 .endif | |
118 \vshrn d29, q1, #2 | |
119 vaddl.u8 q8, d0, d30 | |
9581
2b3b9358bee7
ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents:
9580
diff
changeset
|
120 vld1.64 {d2-d4}, [r1], r2 |
8334 | 121 vaddl.u8 q10, d1, d31 |
122 vst1.64 {d28,d29}, [r0,:128], r2 | |
123 vadd.u16 q12, q8, q9 | |
9581
2b3b9358bee7
ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents:
9580
diff
changeset
|
124 pld [r1, r2] |
8334 | 125 .if \no_rnd |
126 vadd.u16 q12, q12, q13 | |
127 .endif | |
128 vext.8 q2, q1, q2, #1 | |
129 vadd.u16 q0, q10, q11 | |
130 \vshrn d30, q12, #2 | |
131 .if \no_rnd | |
132 vadd.u16 q0, q0, q13 | |
133 .endif | |
134 \vshrn d31, q0, #2 | |
135 vaddl.u8 q9, d2, d4 | |
136 vaddl.u8 q11, d3, d5 | |
137 vst1.64 {d30,d31}, [r0,:128], r2 | |
138 bgt 1b | |
9581
2b3b9358bee7
ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents:
9580
diff
changeset
|
139 bx lr |
8334 | 140 .endm |
141 | |
10375 | 142 .macro pixels8 avg=0 |
8334 | 143 1: vld1.64 {d0}, [r1], r2 |
144 vld1.64 {d1}, [r1], r2 | |
145 vld1.64 {d2}, [r1], r2 | |
146 pld [r1, r2, lsl #2] | |
147 vld1.64 {d3}, [r1], r2 | |
148 pld [r1] | |
149 pld [r1, r2] | |
150 pld [r1, r2, lsl #1] | |
10375 | 151 .if \avg |
152 vld1.64 {d4}, [r0,:64], r2 | |
153 vrhadd.u8 d0, d0, d4 | |
154 vld1.64 {d5}, [r0,:64], r2 | |
155 vrhadd.u8 d1, d1, d5 | |
156 vld1.64 {d6}, [r0,:64], r2 | |
157 vrhadd.u8 d2, d2, d6 | |
158 vld1.64 {d7}, [r0,:64], r2 | |
159 vrhadd.u8 d3, d3, d7 | |
160 sub r0, r0, r2, lsl #2 | |
161 .endif | |
8334 | 162 subs r3, r3, #4 |
163 vst1.64 {d0}, [r0,:64], r2 | |
164 vst1.64 {d1}, [r0,:64], r2 | |
165 vst1.64 {d2}, [r0,:64], r2 | |
166 vst1.64 {d3}, [r0,:64], r2 | |
167 bne 1b | |
168 bx lr | |
169 .endm | |
170 | |
171 .macro pixels8_x2 vhadd=vrhadd.u8 | |
172 1: vld1.64 {d0, d1}, [r1], r2 | |
173 vext.8 d1, d0, d1, #1 | |
174 vld1.64 {d2, d3}, [r1], r2 | |
175 vext.8 d3, d2, d3, #1 | |
176 pld [r1] | |
177 pld [r1, r2] | |
178 subs r3, r3, #2 | |
179 vswp d1, d2 | |
180 \vhadd q0, q0, q1 | |
181 vst1.64 {d0}, [r0,:64], r2 | |
182 vst1.64 {d1}, [r0,:64], r2 | |
183 bne 1b | |
184 bx lr | |
185 .endm | |
186 | |
187 .macro pixels8_y2 vhadd=vrhadd.u8 | |
9581
2b3b9358bee7
ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents:
9580
diff
changeset
|
188 vld1.64 {d0}, [r1], r2 |
2b3b9358bee7
ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents:
9580
diff
changeset
|
189 vld1.64 {d1}, [r1], r2 |
8334 | 190 1: subs r3, r3, #2 |
191 \vhadd d4, d0, d1 | |
9581
2b3b9358bee7
ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents:
9580
diff
changeset
|
192 vld1.64 {d0}, [r1], r2 |
8334 | 193 \vhadd d5, d0, d1 |
9581
2b3b9358bee7
ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents:
9580
diff
changeset
|
194 vld1.64 {d1}, [r1], r2 |
8334 | 195 pld [r1] |
9581
2b3b9358bee7
ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents:
9580
diff
changeset
|
196 pld [r1, r2] |
8334 | 197 vst1.64 {d4}, [r0,:64], r2 |
198 vst1.64 {d5}, [r0,:64], r2 | |
199 bne 1b | |
9581
2b3b9358bee7
ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents:
9580
diff
changeset
|
200 bx lr |
8334 | 201 .endm |
202 | |
203 .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0 | |
9581
2b3b9358bee7
ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents:
9580
diff
changeset
|
204 vld1.64 {d0, d1}, [r1], r2 |
2b3b9358bee7
ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents:
9580
diff
changeset
|
205 vld1.64 {d2, d3}, [r1], r2 |
8334 | 206 .if \no_rnd |
207 vmov.i16 q11, #1 | |
208 .endif | |
209 pld [r1] | |
9581
2b3b9358bee7
ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents:
9580
diff
changeset
|
210 pld [r1, r2] |
8334 | 211 vext.8 d4, d0, d1, #1 |
212 vext.8 d6, d2, d3, #1 | |
213 vaddl.u8 q8, d0, d4 | |
214 vaddl.u8 q9, d2, d6 | |
215 1: subs r3, r3, #2 | |
9581
2b3b9358bee7
ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents:
9580
diff
changeset
|
216 vld1.64 {d0, d1}, [r1], r2 |
8334 | 217 pld [r1] |
218 vadd.u16 q10, q8, q9 | |
219 vext.8 d4, d0, d1, #1 | |
220 .if \no_rnd | |
221 vadd.u16 q10, q10, q11 | |
222 .endif | |
223 vaddl.u8 q8, d0, d4 | |
224 \vshrn d5, q10, #2 | |
9581
2b3b9358bee7
ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents:
9580
diff
changeset
|
225 vld1.64 {d2, d3}, [r1], r2 |
8334 | 226 vadd.u16 q10, q8, q9 |
9581
2b3b9358bee7
ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents:
9580
diff
changeset
|
227 pld [r1, r2] |
8334 | 228 .if \no_rnd |
229 vadd.u16 q10, q10, q11 | |
230 .endif | |
231 vst1.64 {d5}, [r0,:64], r2 | |
232 \vshrn d7, q10, #2 | |
233 vext.8 d6, d2, d3, #1 | |
234 vaddl.u8 q9, d2, d6 | |
235 vst1.64 {d7}, [r0,:64], r2 | |
236 bgt 1b | |
9581
2b3b9358bee7
ARM: Use fewer register in NEON put_pixels _y2 and _xy2
conrad
parents:
9580
diff
changeset
|
237 bx lr |
8334 | 238 .endm |
239 | |
240 .macro pixfunc pfx name suf rnd_op args:vararg | |
241 function ff_\pfx\name\suf\()_neon, export=1 | |
242 \name \rnd_op \args | |
11443 | 243 endfunc |
8334 | 244 .endm |
245 | |
246 .macro pixfunc2 pfx name args:vararg | |
247 pixfunc \pfx \name | |
248 pixfunc \pfx \name \args | |
249 .endm | |
250 | |
251 function ff_put_h264_qpel16_mc00_neon, export=1 | |
10376 | 252 mov r3, #16 |
11443 | 253 endfunc |
8334 | 254 |
255 pixfunc put_ pixels16 | |
256 pixfunc2 put_ pixels16_x2, _no_rnd, vhadd.u8 | |
257 pixfunc2 put_ pixels16_y2, _no_rnd, vhadd.u8 | |
258 pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1 | |
259 | |
260 function ff_avg_h264_qpel16_mc00_neon, export=1 | |
10376 | 261 mov r3, #16 |
11443 | 262 endfunc |
8334 | 263 |
264 pixfunc avg_ pixels16,, 1 | |
265 | |
266 function ff_put_h264_qpel8_mc00_neon, export=1 | |
10376 | 267 mov r3, #8 |
11443 | 268 endfunc |
8334 | 269 |
270 pixfunc put_ pixels8 | |
271 pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8 | |
272 pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8 | |
273 pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1 | |
8492 | 274 |
10375 | 275 function ff_avg_h264_qpel8_mc00_neon, export=1 |
276 mov r3, #8 | |
11443 | 277 endfunc |
10375 | 278 |
279 pixfunc avg_ pixels8,, 1 | |
280 | |
9580 | 281 function ff_put_pixels_clamped_neon, export=1 |
282 vld1.64 {d16-d19}, [r0,:128]! | |
283 vqmovun.s16 d0, q8 | |
284 vld1.64 {d20-d23}, [r0,:128]! | |
285 vqmovun.s16 d1, q9 | |
286 vld1.64 {d24-d27}, [r0,:128]! | |
287 vqmovun.s16 d2, q10 | |
288 vld1.64 {d28-d31}, [r0,:128]! | |
289 vqmovun.s16 d3, q11 | |
290 vst1.64 {d0}, [r1,:64], r2 | |
291 vqmovun.s16 d4, q12 | |
292 vst1.64 {d1}, [r1,:64], r2 | |
293 vqmovun.s16 d5, q13 | |
294 vst1.64 {d2}, [r1,:64], r2 | |
295 vqmovun.s16 d6, q14 | |
296 vst1.64 {d3}, [r1,:64], r2 | |
297 vqmovun.s16 d7, q15 | |
298 vst1.64 {d4}, [r1,:64], r2 | |
299 vst1.64 {d5}, [r1,:64], r2 | |
300 vst1.64 {d6}, [r1,:64], r2 | |
301 vst1.64 {d7}, [r1,:64], r2 | |
302 bx lr | |
11443 | 303 endfunc |
9580 | 304 |
9345 | 305 function ff_put_signed_pixels_clamped_neon, export=1 |
306 vmov.u8 d31, #128 | |
307 vld1.64 {d16-d17}, [r0,:128]! | |
308 vqmovn.s16 d0, q8 | |
309 vld1.64 {d18-d19}, [r0,:128]! | |
310 vqmovn.s16 d1, q9 | |
311 vld1.64 {d16-d17}, [r0,:128]! | |
312 vqmovn.s16 d2, q8 | |
313 vld1.64 {d18-d19}, [r0,:128]! | |
314 vadd.u8 d0, d0, d31 | |
315 vld1.64 {d20-d21}, [r0,:128]! | |
316 vadd.u8 d1, d1, d31 | |
317 vld1.64 {d22-d23}, [r0,:128]! | |
318 vadd.u8 d2, d2, d31 | |
319 vst1.64 {d0}, [r1,:64], r2 | |
320 vqmovn.s16 d3, q9 | |
321 vst1.64 {d1}, [r1,:64], r2 | |
322 vqmovn.s16 d4, q10 | |
323 vst1.64 {d2}, [r1,:64], r2 | |
324 vqmovn.s16 d5, q11 | |
325 vld1.64 {d24-d25}, [r0,:128]! | |
326 vadd.u8 d3, d3, d31 | |
327 vld1.64 {d26-d27}, [r0,:128]! | |
328 vadd.u8 d4, d4, d31 | |
329 vadd.u8 d5, d5, d31 | |
330 vst1.64 {d3}, [r1,:64], r2 | |
331 vqmovn.s16 d6, q12 | |
332 vst1.64 {d4}, [r1,:64], r2 | |
333 vqmovn.s16 d7, q13 | |
334 vst1.64 {d5}, [r1,:64], r2 | |
335 vadd.u8 d6, d6, d31 | |
336 vadd.u8 d7, d7, d31 | |
337 vst1.64 {d6}, [r1,:64], r2 | |
338 vst1.64 {d7}, [r1,:64], r2 | |
339 bx lr | |
11443 | 340 endfunc |
9345 | 341 |
9344 | 342 function ff_add_pixels_clamped_neon, export=1 |
343 mov r3, r1 | |
344 vld1.64 {d16}, [r1,:64], r2 | |
345 vld1.64 {d0-d1}, [r0,:128]! | |
346 vaddw.u8 q0, q0, d16 | |
347 vld1.64 {d17}, [r1,:64], r2 | |
348 vld1.64 {d2-d3}, [r0,:128]! | |
349 vqmovun.s16 d0, q0 | |
350 vld1.64 {d18}, [r1,:64], r2 | |
351 vaddw.u8 q1, q1, d17 | |
352 vld1.64 {d4-d5}, [r0,:128]! | |
353 vaddw.u8 q2, q2, d18 | |
354 vst1.64 {d0}, [r3,:64], r2 | |
355 vqmovun.s16 d2, q1 | |
356 vld1.64 {d19}, [r1,:64], r2 | |
357 vld1.64 {d6-d7}, [r0,:128]! | |
358 vaddw.u8 q3, q3, d19 | |
359 vqmovun.s16 d4, q2 | |
360 vst1.64 {d2}, [r3,:64], r2 | |
361 vld1.64 {d16}, [r1,:64], r2 | |
362 vqmovun.s16 d6, q3 | |
363 vld1.64 {d0-d1}, [r0,:128]! | |
364 vaddw.u8 q0, q0, d16 | |
365 vst1.64 {d4}, [r3,:64], r2 | |
366 vld1.64 {d17}, [r1,:64], r2 | |
367 vld1.64 {d2-d3}, [r0,:128]! | |
368 vaddw.u8 q1, q1, d17 | |
369 vst1.64 {d6}, [r3,:64], r2 | |
370 vqmovun.s16 d0, q0 | |
371 vld1.64 {d18}, [r1,:64], r2 | |
372 vld1.64 {d4-d5}, [r0,:128]! | |
373 vaddw.u8 q2, q2, d18 | |
374 vst1.64 {d0}, [r3,:64], r2 | |
375 vqmovun.s16 d2, q1 | |
376 vld1.64 {d19}, [r1,:64], r2 | |
377 vqmovun.s16 d4, q2 | |
378 vld1.64 {d6-d7}, [r0,:128]! | |
379 vaddw.u8 q3, q3, d19 | |
380 vst1.64 {d2}, [r3,:64], r2 | |
381 vqmovun.s16 d6, q3 | |
382 vst1.64 {d4}, [r3,:64], r2 | |
383 vst1.64 {d6}, [r3,:64], r2 | |
384 bx lr | |
11443 | 385 endfunc |
9344 | 386 |
8492 | 387 function ff_float_to_int16_neon, export=1 |
388 subs r2, r2, #8 | |
389 vld1.64 {d0-d1}, [r1,:128]! | |
390 vcvt.s32.f32 q8, q0, #16 | |
391 vld1.64 {d2-d3}, [r1,:128]! | |
392 vcvt.s32.f32 q9, q1, #16 | |
393 beq 3f | |
394 bics ip, r2, #15 | |
395 beq 2f | |
396 1: subs ip, ip, #16 | |
397 vshrn.s32 d4, q8, #16 | |
398 vld1.64 {d0-d1}, [r1,:128]! | |
399 vcvt.s32.f32 q0, q0, #16 | |
400 vshrn.s32 d5, q9, #16 | |
401 vld1.64 {d2-d3}, [r1,:128]! | |
402 vcvt.s32.f32 q1, q1, #16 | |
403 vshrn.s32 d6, q0, #16 | |
404 vst1.64 {d4-d5}, [r0,:128]! | |
405 vshrn.s32 d7, q1, #16 | |
406 vld1.64 {d16-d17},[r1,:128]! | |
407 vcvt.s32.f32 q8, q8, #16 | |
408 vld1.64 {d18-d19},[r1,:128]! | |
409 vcvt.s32.f32 q9, q9, #16 | |
410 vst1.64 {d6-d7}, [r0,:128]! | |
411 bne 1b | |
412 ands r2, r2, #15 | |
413 beq 3f | |
414 2: vld1.64 {d0-d1}, [r1,:128]! | |
415 vshrn.s32 d4, q8, #16 | |
416 vcvt.s32.f32 q0, q0, #16 | |
417 vld1.64 {d2-d3}, [r1,:128]! | |
418 vshrn.s32 d5, q9, #16 | |
419 vcvt.s32.f32 q1, q1, #16 | |
420 vshrn.s32 d6, q0, #16 | |
421 vst1.64 {d4-d5}, [r0,:128]! | |
422 vshrn.s32 d7, q1, #16 | |
423 vst1.64 {d6-d7}, [r0,:128]! | |
424 bx lr | |
425 3: vshrn.s32 d4, q8, #16 | |
426 vshrn.s32 d5, q9, #16 | |
427 vst1.64 {d4-d5}, [r0,:128]! | |
428 bx lr | |
11443 | 429 endfunc |
8492 | 430 |
431 function ff_float_to_int16_interleave_neon, export=1 | |
432 cmp r3, #2 | |
433 ldrlt r1, [r1] | |
434 blt ff_float_to_int16_neon | |
435 bne 4f | |
436 | |
437 ldr r3, [r1] | |
438 ldr r1, [r1, #4] | |
439 | |
440 subs r2, r2, #8 | |
441 vld1.64 {d0-d1}, [r3,:128]! | |
442 vcvt.s32.f32 q8, q0, #16 | |
443 vld1.64 {d2-d3}, [r3,:128]! | |
444 vcvt.s32.f32 q9, q1, #16 | |
445 vld1.64 {d20-d21},[r1,:128]! | |
446 vcvt.s32.f32 q10, q10, #16 | |
447 vld1.64 {d22-d23},[r1,:128]! | |
448 vcvt.s32.f32 q11, q11, #16 | |
449 beq 3f | |
450 bics ip, r2, #15 | |
451 beq 2f | |
452 1: subs ip, ip, #16 | |
453 vld1.64 {d0-d1}, [r3,:128]! | |
454 vcvt.s32.f32 q0, q0, #16 | |
455 vsri.32 q10, q8, #16 | |
456 vld1.64 {d2-d3}, [r3,:128]! | |
457 vcvt.s32.f32 q1, q1, #16 | |
458 vld1.64 {d24-d25},[r1,:128]! | |
459 vcvt.s32.f32 q12, q12, #16 | |
460 vld1.64 {d26-d27},[r1,:128]! | |
461 vsri.32 q11, q9, #16 | |
462 vst1.64 {d20-d21},[r0,:128]! | |
463 vcvt.s32.f32 q13, q13, #16 | |
464 vst1.64 {d22-d23},[r0,:128]! | |
465 vsri.32 q12, q0, #16 | |
466 vld1.64 {d16-d17},[r3,:128]! | |
467 vsri.32 q13, q1, #16 | |
468 vst1.64 {d24-d25},[r0,:128]! | |
469 vcvt.s32.f32 q8, q8, #16 | |
470 vld1.64 {d18-d19},[r3,:128]! | |
471 vcvt.s32.f32 q9, q9, #16 | |
472 vld1.64 {d20-d21},[r1,:128]! | |
473 vcvt.s32.f32 q10, q10, #16 | |
474 vld1.64 {d22-d23},[r1,:128]! | |
475 vcvt.s32.f32 q11, q11, #16 | |
476 vst1.64 {d26-d27},[r0,:128]! | |
477 bne 1b | |
478 ands r2, r2, #15 | |
479 beq 3f | |
480 2: vsri.32 q10, q8, #16 | |
481 vld1.64 {d0-d1}, [r3,:128]! | |
482 vcvt.s32.f32 q0, q0, #16 | |
483 vld1.64 {d2-d3}, [r3,:128]! | |
484 vcvt.s32.f32 q1, q1, #16 | |
485 vld1.64 {d24-d25},[r1,:128]! | |
486 vcvt.s32.f32 q12, q12, #16 | |
487 vsri.32 q11, q9, #16 | |
488 vld1.64 {d26-d27},[r1,:128]! | |
489 vcvt.s32.f32 q13, q13, #16 | |
490 vst1.64 {d20-d21},[r0,:128]! | |
491 vsri.32 q12, q0, #16 | |
492 vst1.64 {d22-d23},[r0,:128]! | |
493 vsri.32 q13, q1, #16 | |
494 vst1.64 {d24-d27},[r0,:128]! | |
495 bx lr | |
496 3: vsri.32 q10, q8, #16 | |
497 vsri.32 q11, q9, #16 | |
498 vst1.64 {d20-d23},[r0,:128]! | |
499 bx lr | |
500 | |
501 4: push {r4-r8,lr} | |
502 cmp r3, #4 | |
503 lsl ip, r3, #1 | |
504 blt 4f | |
505 | |
506 @ 4 channels | |
507 5: ldmia r1!, {r4-r7} | |
508 mov lr, r2 | |
509 mov r8, r0 | |
510 vld1.64 {d16-d17},[r4,:128]! | |
511 vcvt.s32.f32 q8, q8, #16 | |
512 vld1.64 {d18-d19},[r5,:128]! | |
513 vcvt.s32.f32 q9, q9, #16 | |
514 vld1.64 {d20-d21},[r6,:128]! | |
515 vcvt.s32.f32 q10, q10, #16 | |
516 vld1.64 {d22-d23},[r7,:128]! | |
517 vcvt.s32.f32 q11, q11, #16 | |
518 6: subs lr, lr, #8 | |
519 vld1.64 {d0-d1}, [r4,:128]! | |
520 vcvt.s32.f32 q0, q0, #16 | |
521 vsri.32 q9, q8, #16 | |
522 vld1.64 {d2-d3}, [r5,:128]! | |
523 vcvt.s32.f32 q1, q1, #16 | |
524 vsri.32 q11, q10, #16 | |
525 vld1.64 {d4-d5}, [r6,:128]! | |
526 vcvt.s32.f32 q2, q2, #16 | |
527 vzip.32 d18, d22 | |
528 vld1.64 {d6-d7}, [r7,:128]! | |
529 vcvt.s32.f32 q3, q3, #16 | |
530 vzip.32 d19, d23 | |
531 vst1.64 {d18}, [r8], ip | |
532 vsri.32 q1, q0, #16 | |
533 vst1.64 {d22}, [r8], ip | |
534 vsri.32 q3, q2, #16 | |
535 vst1.64 {d19}, [r8], ip | |
536 vzip.32 d2, d6 | |
537 vst1.64 {d23}, [r8], ip | |
538 vzip.32 d3, d7 | |
539 beq 7f | |
540 vld1.64 {d16-d17},[r4,:128]! | |
541 vcvt.s32.f32 q8, q8, #16 | |
542 vst1.64 {d2}, [r8], ip | |
543 vld1.64 {d18-d19},[r5,:128]! | |
544 vcvt.s32.f32 q9, q9, #16 | |
545 vst1.64 {d6}, [r8], ip | |
546 vld1.64 {d20-d21},[r6,:128]! | |
547 vcvt.s32.f32 q10, q10, #16 | |
548 vst1.64 {d3}, [r8], ip | |
549 vld1.64 {d22-d23},[r7,:128]! | |
550 vcvt.s32.f32 q11, q11, #16 | |
551 vst1.64 {d7}, [r8], ip | |
552 b 6b | |
553 7: vst1.64 {d2}, [r8], ip | |
554 vst1.64 {d6}, [r8], ip | |
555 vst1.64 {d3}, [r8], ip | |
556 vst1.64 {d7}, [r8], ip | |
557 subs r3, r3, #4 | |
558 popeq {r4-r8,pc} | |
559 cmp r3, #4 | |
560 add r0, r0, #8 | |
561 bge 5b | |
562 | |
563 @ 2 channels | |
564 4: cmp r3, #2 | |
565 blt 4f | |
566 ldmia r1!, {r4-r5} | |
567 mov lr, r2 | |
568 mov r8, r0 | |
569 tst lr, #8 | |
570 vld1.64 {d16-d17},[r4,:128]! | |
571 vcvt.s32.f32 q8, q8, #16 | |
572 vld1.64 {d18-d19},[r5,:128]! | |
573 vcvt.s32.f32 q9, q9, #16 | |
574 vld1.64 {d20-d21},[r4,:128]! | |
575 vcvt.s32.f32 q10, q10, #16 | |
576 vld1.64 {d22-d23},[r5,:128]! | |
577 vcvt.s32.f32 q11, q11, #16 | |
578 beq 6f | |
579 subs lr, lr, #8 | |
580 beq 7f | |
581 vsri.32 d18, d16, #16 | |
582 vsri.32 d19, d17, #16 | |
583 vld1.64 {d16-d17},[r4,:128]! | |
584 vcvt.s32.f32 q8, q8, #16 | |
585 vst1.32 {d18[0]}, [r8], ip | |
586 vsri.32 d22, d20, #16 | |
587 vst1.32 {d18[1]}, [r8], ip | |
588 vsri.32 d23, d21, #16 | |
589 vst1.32 {d19[0]}, [r8], ip | |
590 vst1.32 {d19[1]}, [r8], ip | |
591 vld1.64 {d18-d19},[r5,:128]! | |
592 vcvt.s32.f32 q9, q9, #16 | |
593 vst1.32 {d22[0]}, [r8], ip | |
594 vst1.32 {d22[1]}, [r8], ip | |
595 vld1.64 {d20-d21},[r4,:128]! | |
596 vcvt.s32.f32 q10, q10, #16 | |
597 vst1.32 {d23[0]}, [r8], ip | |
598 vst1.32 {d23[1]}, [r8], ip | |
599 vld1.64 {d22-d23},[r5,:128]! | |
600 vcvt.s32.f32 q11, q11, #16 | |
601 6: subs lr, lr, #16 | |
602 vld1.64 {d0-d1}, [r4,:128]! | |
603 vcvt.s32.f32 q0, q0, #16 | |
604 vsri.32 d18, d16, #16 | |
605 vld1.64 {d2-d3}, [r5,:128]! | |
606 vcvt.s32.f32 q1, q1, #16 | |
607 vsri.32 d19, d17, #16 | |
608 vld1.64 {d4-d5}, [r4,:128]! | |
609 vcvt.s32.f32 q2, q2, #16 | |
610 vld1.64 {d6-d7}, [r5,:128]! | |
611 vcvt.s32.f32 q3, q3, #16 | |
612 vst1.32 {d18[0]}, [r8], ip | |
613 vsri.32 d22, d20, #16 | |
614 vst1.32 {d18[1]}, [r8], ip | |
615 vsri.32 d23, d21, #16 | |
616 vst1.32 {d19[0]}, [r8], ip | |
617 vsri.32 d2, d0, #16 | |
618 vst1.32 {d19[1]}, [r8], ip | |
619 vsri.32 d3, d1, #16 | |
620 vst1.32 {d22[0]}, [r8], ip | |
621 vsri.32 d6, d4, #16 | |
622 vst1.32 {d22[1]}, [r8], ip | |
623 vsri.32 d7, d5, #16 | |
624 vst1.32 {d23[0]}, [r8], ip | |
625 vst1.32 {d23[1]}, [r8], ip | |
626 beq 6f | |
627 vld1.64 {d16-d17},[r4,:128]! | |
628 vcvt.s32.f32 q8, q8, #16 | |
629 vst1.32 {d2[0]}, [r8], ip | |
630 vst1.32 {d2[1]}, [r8], ip | |
631 vld1.64 {d18-d19},[r5,:128]! | |
632 vcvt.s32.f32 q9, q9, #16 | |
633 vst1.32 {d3[0]}, [r8], ip | |
634 vst1.32 {d3[1]}, [r8], ip | |
635 vld1.64 {d20-d21},[r4,:128]! | |
636 vcvt.s32.f32 q10, q10, #16 | |
637 vst1.32 {d6[0]}, [r8], ip | |
638 vst1.32 {d6[1]}, [r8], ip | |
639 vld1.64 {d22-d23},[r5,:128]! | |
640 vcvt.s32.f32 q11, q11, #16 | |
641 vst1.32 {d7[0]}, [r8], ip | |
642 vst1.32 {d7[1]}, [r8], ip | |
643 bgt 6b | |
644 6: vst1.32 {d2[0]}, [r8], ip | |
645 vst1.32 {d2[1]}, [r8], ip | |
646 vst1.32 {d3[0]}, [r8], ip | |
647 vst1.32 {d3[1]}, [r8], ip | |
648 vst1.32 {d6[0]}, [r8], ip | |
649 vst1.32 {d6[1]}, [r8], ip | |
650 vst1.32 {d7[0]}, [r8], ip | |
651 vst1.32 {d7[1]}, [r8], ip | |
652 b 8f | |
653 7: vsri.32 d18, d16, #16 | |
654 vsri.32 d19, d17, #16 | |
655 vst1.32 {d18[0]}, [r8], ip | |
656 vsri.32 d22, d20, #16 | |
657 vst1.32 {d18[1]}, [r8], ip | |
658 vsri.32 d23, d21, #16 | |
659 vst1.32 {d19[0]}, [r8], ip | |
660 vst1.32 {d19[1]}, [r8], ip | |
661 vst1.32 {d22[0]}, [r8], ip | |
662 vst1.32 {d22[1]}, [r8], ip | |
663 vst1.32 {d23[0]}, [r8], ip | |
664 vst1.32 {d23[1]}, [r8], ip | |
665 8: subs r3, r3, #2 | |
666 add r0, r0, #4 | |
667 popeq {r4-r8,pc} | |
668 | |
669 @ 1 channel | |
670 4: ldr r4, [r1],#4 | |
671 tst r2, #8 | |
672 mov lr, r2 | |
673 mov r5, r0 | |
674 vld1.64 {d0-d1}, [r4,:128]! | |
675 vcvt.s32.f32 q0, q0, #16 | |
676 vld1.64 {d2-d3}, [r4,:128]! | |
677 vcvt.s32.f32 q1, q1, #16 | |
678 bne 8f | |
679 6: subs lr, lr, #16 | |
680 vld1.64 {d4-d5}, [r4,:128]! | |
681 vcvt.s32.f32 q2, q2, #16 | |
682 vld1.64 {d6-d7}, [r4,:128]! | |
683 vcvt.s32.f32 q3, q3, #16 | |
684 vst1.16 {d0[1]}, [r5,:16], ip | |
685 vst1.16 {d0[3]}, [r5,:16], ip | |
686 vst1.16 {d1[1]}, [r5,:16], ip | |
687 vst1.16 {d1[3]}, [r5,:16], ip | |
688 vst1.16 {d2[1]}, [r5,:16], ip | |
689 vst1.16 {d2[3]}, [r5,:16], ip | |
690 vst1.16 {d3[1]}, [r5,:16], ip | |
691 vst1.16 {d3[3]}, [r5,:16], ip | |
692 beq 7f | |
693 vld1.64 {d0-d1}, [r4,:128]! | |
694 vcvt.s32.f32 q0, q0, #16 | |
695 vld1.64 {d2-d3}, [r4,:128]! | |
696 vcvt.s32.f32 q1, q1, #16 | |
697 7: vst1.16 {d4[1]}, [r5,:16], ip | |
698 vst1.16 {d4[3]}, [r5,:16], ip | |
699 vst1.16 {d5[1]}, [r5,:16], ip | |
700 vst1.16 {d5[3]}, [r5,:16], ip | |
701 vst1.16 {d6[1]}, [r5,:16], ip | |
702 vst1.16 {d6[3]}, [r5,:16], ip | |
703 vst1.16 {d7[1]}, [r5,:16], ip | |
704 vst1.16 {d7[3]}, [r5,:16], ip | |
705 bgt 6b | |
706 pop {r4-r8,pc} | |
707 8: subs lr, lr, #8 | |
708 vst1.16 {d0[1]}, [r5,:16], ip | |
709 vst1.16 {d0[3]}, [r5,:16], ip | |
710 vst1.16 {d1[1]}, [r5,:16], ip | |
711 vst1.16 {d1[3]}, [r5,:16], ip | |
712 vst1.16 {d2[1]}, [r5,:16], ip | |
713 vst1.16 {d2[3]}, [r5,:16], ip | |
714 vst1.16 {d3[1]}, [r5,:16], ip | |
715 vst1.16 {d3[3]}, [r5,:16], ip | |
716 popeq {r4-r8,pc} | |
717 vld1.64 {d0-d1}, [r4,:128]! | |
718 vcvt.s32.f32 q0, q0, #16 | |
719 vld1.64 {d2-d3}, [r4,:128]! | |
720 vcvt.s32.f32 q1, q1, #16 | |
721 b 6b | |
11443 | 722 endfunc |
8697 | 723 |
724 function ff_vector_fmul_neon, export=1 | |
725 mov r3, r0 | |
726 subs r2, r2, #8 | |
727 vld1.64 {d0-d3}, [r0,:128]! | |
728 vld1.64 {d4-d7}, [r1,:128]! | |
729 vmul.f32 q8, q0, q2 | |
730 vmul.f32 q9, q1, q3 | |
731 beq 3f | |
732 bics ip, r2, #15 | |
733 beq 2f | |
734 1: subs ip, ip, #16 | |
735 vld1.64 {d0-d1}, [r0,:128]! | |
736 vld1.64 {d4-d5}, [r1,:128]! | |
737 vmul.f32 q10, q0, q2 | |
738 vld1.64 {d2-d3}, [r0,:128]! | |
739 vld1.64 {d6-d7}, [r1,:128]! | |
740 vmul.f32 q11, q1, q3 | |
741 vst1.64 {d16-d19},[r3,:128]! | |
742 vld1.64 {d0-d1}, [r0,:128]! | |
743 vld1.64 {d4-d5}, [r1,:128]! | |
744 vmul.f32 q8, q0, q2 | |
745 vld1.64 {d2-d3}, [r0,:128]! | |
746 vld1.64 {d6-d7}, [r1,:128]! | |
747 vmul.f32 q9, q1, q3 | |
748 vst1.64 {d20-d23},[r3,:128]! | |
749 bne 1b | |
750 ands r2, r2, #15 | |
751 beq 3f | |
752 2: vld1.64 {d0-d1}, [r0,:128]! | |
753 vld1.64 {d4-d5}, [r1,:128]! | |
754 vst1.64 {d16-d17},[r3,:128]! | |
755 vmul.f32 q8, q0, q2 | |
756 vld1.64 {d2-d3}, [r0,:128]! | |
757 vld1.64 {d6-d7}, [r1,:128]! | |
758 vst1.64 {d18-d19},[r3,:128]! | |
759 vmul.f32 q9, q1, q3 | |
760 3: vst1.64 {d16-d19},[r3,:128]! | |
761 bx lr | |
11443 | 762 endfunc |
8698 | 763 |
764 function ff_vector_fmul_window_neon, export=1 | |
9969
5cca2790d582
ARM: handle VFP register arguments in ff_vector_fmul_window_neon()
mru
parents:
9581
diff
changeset
|
765 VFP vdup.32 q8, d0[0] |
5cca2790d582
ARM: handle VFP register arguments in ff_vector_fmul_window_neon()
mru
parents:
9581
diff
changeset
|
766 NOVFP vld1.32 {d16[],d17[]}, [sp,:32] |
8698 | 767 push {r4,r5,lr} |
9969
5cca2790d582
ARM: handle VFP register arguments in ff_vector_fmul_window_neon()
mru
parents:
9581
diff
changeset
|
768 VFP ldr lr, [sp, #12] |
5cca2790d582
ARM: handle VFP register arguments in ff_vector_fmul_window_neon()
mru
parents:
9581
diff
changeset
|
769 NOVFP ldr lr, [sp, #16] |
8698 | 770 sub r2, r2, #8 |
771 sub r5, lr, #2 | |
772 add r2, r2, r5, lsl #2 | |
773 add r4, r3, r5, lsl #3 | |
774 add ip, r0, r5, lsl #3 | |
775 mov r5, #-16 | |
776 vld1.64 {d0,d1}, [r1,:128]! | |
777 vld1.64 {d2,d3}, [r2,:128], r5 | |
778 vld1.64 {d4,d5}, [r3,:128]! | |
779 vld1.64 {d6,d7}, [r4,:128], r5 | |
780 1: subs lr, lr, #4 | |
781 vmov q11, q8 | |
782 vmla.f32 d22, d0, d4 | |
783 vmov q10, q8 | |
784 vmla.f32 d23, d1, d5 | |
785 vrev64.32 q3, q3 | |
786 vmla.f32 d20, d0, d7 | |
787 vrev64.32 q1, q1 | |
788 vmla.f32 d21, d1, d6 | |
789 beq 2f | |
790 vmla.f32 d22, d3, d7 | |
791 vld1.64 {d0,d1}, [r1,:128]! | |
792 vmla.f32 d23, d2, d6 | |
793 vld1.64 {d18,d19},[r2,:128], r5 | |
794 vmls.f32 d20, d3, d4 | |
795 vld1.64 {d24,d25},[r3,:128]! | |
796 vmls.f32 d21, d2, d5 | |
797 vld1.64 {d6,d7}, [r4,:128], r5 | |
798 vmov q1, q9 | |
799 vrev64.32 q11, q11 | |
800 vmov q2, q12 | |
801 vswp d22, d23 | |
802 vst1.64 {d20,d21},[r0,:128]! | |
803 vst1.64 {d22,d23},[ip,:128], r5 | |
804 b 1b | |
805 2: vmla.f32 d22, d3, d7 | |
806 vmla.f32 d23, d2, d6 | |
807 vmls.f32 d20, d3, d4 | |
808 vmls.f32 d21, d2, d5 | |
809 vrev64.32 q11, q11 | |
810 vswp d22, d23 | |
811 vst1.64 {d20,d21},[r0,:128]! | |
812 vst1.64 {d22,d23},[ip,:128], r5 | |
813 pop {r4,r5,pc} | |
11443 | 814 endfunc |
10046 | 815 |
816 #if CONFIG_VORBIS_DECODER | |
817 function ff_vorbis_inverse_coupling_neon, export=1 | |
818 vmov.i32 q10, #1<<31 | |
819 subs r2, r2, #4 | |
820 mov r3, r0 | |
821 mov r12, r1 | |
822 beq 3f | |
823 | |
824 vld1.32 {d24-d25},[r1,:128]! | |
825 vld1.32 {d22-d23},[r0,:128]! | |
826 vcle.s32 q8, q12, #0 | |
827 vand q9, q11, q10 | |
828 veor q12, q12, q9 | |
829 vand q2, q12, q8 | |
830 vbic q3, q12, q8 | |
831 vadd.f32 q12, q11, q2 | |
832 vsub.f32 q11, q11, q3 | |
833 1: vld1.32 {d2-d3}, [r1,:128]! | |
834 vld1.32 {d0-d1}, [r0,:128]! | |
835 vcle.s32 q8, q1, #0 | |
836 vand q9, q0, q10 | |
837 veor q1, q1, q9 | |
838 vst1.32 {d24-d25},[r3, :128]! | |
839 vst1.32 {d22-d23},[r12,:128]! | |
840 vand q2, q1, q8 | |
841 vbic q3, q1, q8 | |
842 vadd.f32 q1, q0, q2 | |
843 vsub.f32 q0, q0, q3 | |
844 subs r2, r2, #8 | |
845 ble 2f | |
846 vld1.32 {d24-d25},[r1,:128]! | |
847 vld1.32 {d22-d23},[r0,:128]! | |
848 vcle.s32 q8, q12, #0 | |
849 vand q9, q11, q10 | |
850 veor q12, q12, q9 | |
851 vst1.32 {d2-d3}, [r3, :128]! | |
852 vst1.32 {d0-d1}, [r12,:128]! | |
853 vand q2, q12, q8 | |
854 vbic q3, q12, q8 | |
855 vadd.f32 q12, q11, q2 | |
856 vsub.f32 q11, q11, q3 | |
857 b 1b | |
858 | |
859 2: vst1.32 {d2-d3}, [r3, :128]! | |
860 vst1.32 {d0-d1}, [r12,:128]! | |
861 bxlt lr | |
862 | |
863 3: vld1.32 {d2-d3}, [r1,:128] | |
864 vld1.32 {d0-d1}, [r0,:128] | |
865 vcle.s32 q8, q1, #0 | |
866 vand q9, q0, q10 | |
867 veor q1, q1, q9 | |
868 vand q2, q1, q8 | |
869 vbic q3, q1, q8 | |
870 vadd.f32 q1, q0, q2 | |
871 vsub.f32 q0, q0, q3 | |
872 vst1.32 {d2-d3}, [r0,:128]! | |
873 vst1.32 {d0-d1}, [r1,:128]! | |
874 bx lr | |
11443 | 875 endfunc |
10046 | 876 #endif |
10221 | 877 |
878 function ff_vector_fmul_scalar_neon, export=1 | |
879 VFP len .req r2 | |
880 NOVFP len .req r3 | |
881 VFP vdup.32 q8, d0[0] | |
882 NOVFP vdup.32 q8, r2 | |
883 bics r12, len, #15 | |
884 beq 3f | |
885 vld1.32 {q0},[r1,:128]! | |
886 vld1.32 {q1},[r1,:128]! | |
887 1: vmul.f32 q0, q0, q8 | |
888 vld1.32 {q2},[r1,:128]! | |
889 vmul.f32 q1, q1, q8 | |
890 vld1.32 {q3},[r1,:128]! | |
891 vmul.f32 q2, q2, q8 | |
892 vst1.32 {q0},[r0,:128]! | |
893 vmul.f32 q3, q3, q8 | |
894 vst1.32 {q1},[r0,:128]! | |
895 subs r12, r12, #16 | |
896 beq 2f | |
897 vld1.32 {q0},[r1,:128]! | |
898 vst1.32 {q2},[r0,:128]! | |
899 vld1.32 {q1},[r1,:128]! | |
900 vst1.32 {q3},[r0,:128]! | |
901 b 1b | |
902 2: vst1.32 {q2},[r0,:128]! | |
903 vst1.32 {q3},[r0,:128]! | |
904 ands len, len, #15 | |
905 bxeq lr | |
906 3: vld1.32 {q0},[r1,:128]! | |
907 vmul.f32 q0, q0, q8 | |
908 vst1.32 {q0},[r0,:128]! | |
909 subs len, len, #4 | |
910 bgt 3b | |
911 bx lr | |
912 .unreq len | |
11443 | 913 endfunc |
10221 | 914 |
@ ff_vector_fmul_sv_scalar_2_neon(dst, src, sv, scalar/len...):
@ For each pair of output floats: dst = src * scalar * (*sv[j]), where r2 is
@ an array of pointers, each pointing at a :64-aligned 2-float vector.
@ r0 = dst, r1 = contiguous src. VFP: scalar in d0[0], len in r3;
@ NOVFP: scalar bits in r3, len fetched from the stack into r3.
@ Loop retires 4 floats (two pointer fetches) per pass, so len is assumed
@ to be a multiple of 4 — TODO confirm with callers.
915 function ff_vector_fmul_sv_scalar_2_neon, export=1 | |
@ d16 = scalar broadcast to both lanes.
916 VFP vdup.32 d16, d0[0] | |
917 NOVFP vdup.32 d16, r3 | |
918 NOVFP ldr r3, [sp] | |
@ Prime the pipeline with the first 4 src floats.
919 vld1.32 {d0},[r1,:64]! | |
920 vld1.32 {d1},[r1,:64]! | |
921 1: subs r3, r3, #4 | |
@ src * scalar for both 2-float groups.
922 vmul.f32 d4, d0, d16 | |
923 vmul.f32 d5, d1, d16 | |
@ Fetch the next two vector pointers and their 2-float contents.
924 ldr r12, [r2], #4 | |
925 vld1.32 {d2},[r12,:64] | |
926 ldr r12, [r2], #4 | |
927 vld1.32 {d3},[r12,:64] | |
@ Multiply in the pointed-to vectors.
928 vmul.f32 d4, d4, d2 | |
929 vmul.f32 d5, d5, d3 | |
930 beq 2f | |
@ Pipelined reload of next src data before storing current results.
931 vld1.32 {d0},[r1,:64]! | |
932 vld1.32 {d1},[r1,:64]! | |
933 vst1.32 {d4},[r0,:64]! | |
934 vst1.32 {d5},[r0,:64]! | |
935 b 1b | |
@ Store the final pair of results.
936 2: vst1.32 {d4},[r0,:64]! | |
937 vst1.32 {d5},[r0,:64]! | |
938 bx lr | |
11443 | 939 endfunc |
10221 | 940 |
@ ff_vector_fmul_sv_scalar_4_neon: like the _2_ variant but each pointer in
@ the r2 array addresses a :128-aligned 4-float vector.
@ dst = src * scalar * (*sv[j]) per 4-float group.
@ r0 = dst, r1 = contiguous src. VFP: scalar in d0[0] (s0), len in r3;
@ NOVFP: scalar bits in r3, len loaded from the stack.
@ Main loop does 8 floats (two pointers) per pass; a 4-float tail at label 3
@ handles len & 7, so len is assumed to be a multiple of 4 — TODO confirm.
941 function ff_vector_fmul_sv_scalar_4_neon, export=1 | |
@ q10 = scalar broadcast to all four lanes.
942 VFP vdup.32 q10, d0[0] | |
943 NOVFP vdup.32 q10, r3 | |
944 NOVFP ldr r3, [sp] | |
@ lr is used as the unrolled-loop counter, so save it.
945 push {lr} | |
@ lr = len rounded down to a multiple of 8; if zero, only the tail runs.
946 bics lr, r3, #7 | |
947 beq 3f | |
@ Prime the pipeline with 8 src floats.
948 vld1.32 {q0},[r1,:128]! | |
949 vld1.32 {q2},[r1,:128]! | |
@ Main loop: two pointer fetches + 8 floats per iteration.
950 1: ldr r12, [r2], #4 | |
951 vld1.32 {q1},[r12,:128] | |
952 ldr r12, [r2], #4 | |
953 vld1.32 {q3},[r12,:128] | |
954 vmul.f32 q8, q0, q10 | |
955 vmul.f32 q8, q8, q1 | |
956 vmul.f32 q9, q2, q10 | |
957 vmul.f32 q9, q9, q3 | |
958 subs lr, lr, #8 | |
959 beq 2f | |
960 vld1.32 {q0},[r1,:128]! | |
961 vld1.32 {q2},[r1,:128]! | |
962 vst1.32 {q8},[r0,:128]! | |
963 vst1.32 {q9},[r0,:128]! | |
964 b 1b | |
@ Drain the final two stores, then return if no 4-float tail remains.
965 2: vst1.32 {q8},[r0,:128]! | |
966 vst1.32 {q9},[r0,:128]! | |
967 ands r3, r3, #7 | |
968 popeq {pc} | |
@ Tail: one pointer / 4 floats per iteration.
969 3: vld1.32 {q0},[r1,:128]! | |
970 ldr r12, [r2], #4 | |
971 vld1.32 {q1},[r12,:128] | |
972 vmul.f32 q0, q0, q10 | |
973 vmul.f32 q0, q0, q1 | |
974 vst1.32 {q0},[r0,:128]! | |
975 subs r3, r3, #4 | |
976 bgt 3b | |
977 pop {pc} | |
11443 | 978 endfunc |
10221 | 979 |
@ ff_sv_fmul_scalar_2_neon: dst = (*sv[j]) * scalar, where r1 is an array of
@ pointers, each to a :64-aligned 2-float vector; results are stored
@ contiguously at r0 (:128-aligned).
@ VFP: scalar in d0[0] (s0), len in r2; NOVFP: scalar bits in r2, len in r3.
@ Loop retires 4 floats (two pointers) per pass, so len is assumed a
@ multiple of 4 — TODO confirm with callers.
980 function ff_sv_fmul_scalar_2_neon, export=1 | |
981 VFP len .req r2 | |
982 NOVFP len .req r3 | |
@ q8 = scalar broadcast to all lanes.
983 VFP vdup.32 q8, d0[0] | |
984 NOVFP vdup.32 q8, r2 | |
@ Prime: fetch two pointers and their 2-float vectors into q0 (d0,d1).
985 ldr r12, [r1], #4 | |
986 vld1.32 {d0},[r12,:64] | |
987 ldr r12, [r1], #4 | |
988 vld1.32 {d1},[r12,:64] | |
@ Multiply current 4 floats; reload next data before storing (pipelined).
989 1: vmul.f32 q1, q0, q8 | |
990 subs len, len, #4 | |
991 beq 2f | |
992 ldr r12, [r1], #4 | |
993 vld1.32 {d0},[r12,:64] | |
994 ldr r12, [r1], #4 | |
995 vld1.32 {d1},[r12,:64] | |
996 vst1.32 {q1},[r0,:128]! | |
997 b 1b | |
@ Final store.
998 2: vst1.32 {q1},[r0,:128]! | |
999 bx lr | |
1000 .unreq len | |
11443 | 1001 endfunc |
10221 | 1002 |
@ ff_sv_fmul_scalar_4_neon: dst = (*sv[j]) * scalar, where r1 is an array of
@ pointers, each to a :128-aligned 4-float vector; results stored
@ contiguously at r0. Simple (non-pipelined) loop: one pointer / 4 floats
@ per iteration.
@ VFP: scalar in d0[0] (s0), len in r2; NOVFP: scalar bits in r2, len in r3.
1003 function ff_sv_fmul_scalar_4_neon, export=1 | |
1004 VFP len .req r2 | |
1005 NOVFP len .req r3 | |
@ q8 = scalar broadcast to all lanes.
1006 VFP vdup.32 q8, d0[0] | |
1007 NOVFP vdup.32 q8, r2 | |
1008 1: ldr r12, [r1], #4 | |
1009 vld1.32 {q0},[r12,:128] | |
1010 vmul.f32 q0, q0, q8 | |
1011 vst1.32 {q0},[r0,:128]! | |
1012 subs len, len, #4 | |
1013 bgt 1b | |
1014 bx lr | |
1015 .unreq len | |
11443 | 1016 endfunc |
10221 | 1017 |
@ void ff_butterflies_float_neon(float *v1, float *v2, int len)
@ In-place butterfly: for each i, (v1[i], v2[i]) <- (v1[i]+v2[i], v1[i]-v2[i]).
@ r0 = v1, r1 = v2, r2 = len; both pointers :128-aligned. Processes 4 floats
@ per iteration, so len is assumed a multiple of 4 — TODO confirm.
1018 function ff_butterflies_float_neon, export=1 | |
1019 1: vld1.32 {q0},[r0,:128] | |
1020 vld1.32 {q1},[r1,:128] | |
@ q2 = v1 - v2 (goes back to v2), q1 = v1 + v2 (goes back to v1).
1021 vsub.f32 q2, q0, q1 | |
1022 vadd.f32 q1, q0, q1 | |
1023 vst1.32 {q2},[r1,:128]! | |
1024 vst1.32 {q1},[r0,:128]! | |
1025 subs r2, r2, #4 | |
1026 bgt 1b | |
1027 bx lr | |
11443 | 1028 endfunc |
10228 | 1029 |
@ float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len)
@ Returns sum(v1[i] * v2[i]). r0 = v1, r1 = v2, r2 = len (multiple of 4
@ assumed — 4 floats accumulated per iteration; TODO confirm with callers).
@ Result is left in s0 (d0[0]) for the hard-float ABI; under NOVFP it is
@ additionally moved to r0 for the soft-float return convention.
1030 function ff_scalarproduct_float_neon, export=1 | |
@ q2 = running 4-lane partial sums, starts at zero.
1031 vmov.f32 q2, #0.0 | |
1032 1: vld1.32 {q0},[r0,:128]! | |
1033 vld1.32 {q1},[r1,:128]! | |
1034 vmla.f32 q2, q0, q1 | |
1035 subs r2, r2, #4 | |
1036 bgt 1b | |
@ Horizontal reduction: fold 4 lanes -> 2 -> 1 into d0[0].
1037 vadd.f32 d0, d4, d5 | |
1038 vpadd.f32 d0, d0, d0 | |
1039 NOVFP vmov.32 r0, d0[0] | |
1040 bx lr | |
11443 | 1041 endfunc |
10253 | 1042 |
@ ff_int32_to_float_fmul_scalar_neon(float *dst, const int32_t *src, mul, len)
@ dst[i] = (float)src[i] * mul.  r0 = dst, r1 = src (both :128-aligned).
@ VFP: mul in d0[0] (s0), len in r2; NOVFP: mul bits in r2, len in r3.
@ The loop only exits when the counter hits exactly zero (beq), so len must
@ be a multiple of 8 — TODO confirm with callers.
1043 function ff_int32_to_float_fmul_scalar_neon, export=1 | |
@ q0 = mul broadcast to all four lanes.
1044 VFP vdup.32 q0, d0[0] | |
1045 VFP len .req r2 | |
1046 NOVFP vdup.32 q0, r2 | |
1047 NOVFP len .req r3 | |
1048 | |
@ Prime the pipeline: load and convert the first 8 int32s.
1049 vld1.32 {q1},[r1,:128]! | |
1050 vcvt.f32.s32 q3, q1 | |
1051 vld1.32 {q2},[r1,:128]! | |
1052 vcvt.f32.s32 q8, q2 | |
@ Main loop: 8 elements per pass; convert/multiply of the current batch is
@ overlapped with loading the next one.
1053 1: subs len, len, #8 | |
1054 pld [r1, #16] | |
1055 vmul.f32 q9, q3, q0 | |
1056 vmul.f32 q10, q8, q0 | |
1057 beq 2f | |
1058 vld1.32 {q1},[r1,:128]! | |
1059 vcvt.f32.s32 q3, q1 | |
1060 vld1.32 {q2},[r1,:128]! | |
1061 vcvt.f32.s32 q8, q2 | |
1062 vst1.32 {q9}, [r0,:128]! | |
1063 vst1.32 {q10},[r0,:128]! | |
1064 b 1b | |
@ Store the final 8 results.
1065 2: vst1.32 {q9}, [r0,:128]! | |
1066 vst1.32 {q10},[r0,:128]! | |
1067 bx lr | |
1068 .unreq len | |
11443 | 1069 endfunc |
10274 | 1070 |
@ ff_vector_fmul_reverse_neon(float *dst, const float *src0,
@                             const float *src1, int len)
@ dst[i] = src0[i] * src1[len-1-i].  r0 = dst, r1 = src0, r2 = src1,
@ r3 = len. src1 is walked backwards: r2 is moved to the last 8-float block
@ and then decremented by 32 bytes (r12 = -32) per load. Reversal within a
@ block is done by vrev64.32 (swap within each d reg) plus using the d regs
@ in swapped order (d7,d6 / d5,d4) in the multiplies. 8 floats per
@ iteration with beq-only exit, so len must be a multiple of 8 — TODO confirm.
1071 function ff_vector_fmul_reverse_neon, export=1 | |
@ r2 = src1 + len*4 - 32: start of the last 8-float block.
1072 add r2, r2, r3, lsl #2 | |
1073 sub r2, r2, #32 | |
1074 mov r12, #-32 | |
@ Prime the pipeline: 8 forward floats and 8 backward floats.
1075 vld1.32 {q0-q1}, [r1,:128]! | |
1076 vld1.32 {q2-q3}, [r2,:128], r12 | |
1077 1: pld [r1, #32] | |
1078 vrev64.32 q3, q3 | |
1079 vmul.f32 d16, d0, d7 | |
1080 vmul.f32 d17, d1, d6 | |
1081 pld [r2, #-32] | |
1082 vrev64.32 q2, q2 | |
1083 vmul.f32 d18, d2, d5 | |
1084 vmul.f32 d19, d3, d4 | |
1085 subs r3, r3, #8 | |
1086 beq 2f | |
1087 vld1.32 {q0-q1}, [r1,:128]! | |
1088 vld1.32 {q2-q3}, [r2,:128], r12 | |
1089 vst1.32 {q8-q9}, [r0,:128]! | |
1090 b 1b | |
@ Store the final 8 results.
1091 2: vst1.32 {q8-q9}, [r0,:128]! | |
1092 bx lr | |
11443 | 1093 endfunc |
10276 | 1094 |
@ ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
@                         const float *src2, int len)
@ dst[i] = src0[i] * src1[i] + src2[i].  r0 = dst, r1 = src0, r2 = src1,
@ r3 = src2, len taken from the stack (5th arg). All pointers :128-aligned.
@ 8 floats per iteration with beq-only exit, so len must be a multiple of
@ 8 — TODO confirm with callers.
10302 | 1095 function ff_vector_fmul_add_neon, export=1 |
1096 ldr r12, [sp] | |
@ Prime the pipeline: first 8 floats of each input, products in q10/q11.
1097 vld1.32 {q0-q1}, [r1,:128]! | |
1098 vld1.32 {q8-q9}, [r2,:128]! | |
1099 vld1.32 {q2-q3}, [r3,:128]! | |
1100 vmul.f32 q10, q0, q8 | |
1101 vmul.f32 q11, q1, q9 | |
@ Main loop: add src2 to the products, then (unless done) overlap the next
@ batch's loads/multiplies with storing the current results.
1102 1: vadd.f32 q12, q2, q10 | |
1103 vadd.f32 q13, q3, q11 | |
1104 pld [r1, #16] | |
1105 pld [r2, #16] | |
1106 pld [r3, #16] | |
1107 subs r12, r12, #8 | |
1108 beq 2f | |
1109 vld1.32 {q0}, [r1,:128]! | |
1110 vld1.32 {q8}, [r2,:128]! | |
1111 vmul.f32 q10, q0, q8 | |
1112 vld1.32 {q1}, [r1,:128]! | |
1113 vld1.32 {q9}, [r2,:128]! | |
1114 vmul.f32 q11, q1, q9 | |
1115 vld1.32 {q2-q3}, [r3,:128]! | |
1116 vst1.32 {q12-q13},[r0,:128]! | |
1117 b 1b | |
@ Store the final 8 results.
1118 2: vst1.32 {q12-q13},[r0,:128]! | |
1119 bx lr | |
11443 | 1120 endfunc |
10302 | 1121 |
@ ff_vector_clipf_neon(float *dst, const float *src, float min, float max,
@                      int len)
@ dst[i] = clip(src[i], min, max), via vmin against max then vmax against min.
@ r0 = dst, r1 = src (both :128-aligned).
@ VFP (hard-float): min in s0 = d0[0], max in s1 = d0[1], len in r2.
@ NOVFP (soft-float): min bits in r2, max bits in r3, len from the stack
@ (reloaded into r2 after the min has been broadcast).
@ 8 floats per iteration with beq-only exit, so len must be a multiple of
@ 8 — TODO confirm with callers.
10276 | 1122 function ff_vector_clipf_neon, export=1 |
@ q0 = min broadcast, q1 = max broadcast.
1123 VFP vdup.32 q1, d0[1] | |
1124 VFP vdup.32 q0, d0[0] | |
1125 NOVFP vdup.32 q0, r2 | |
1126 NOVFP vdup.32 q1, r3 | |
1127 NOVFP ldr r2, [sp] | |
@ Prime the pipeline: load first 8 floats and apply the upper clamp.
1128 vld1.f32 {q2},[r1,:128]! | |
1129 vmin.f32 q10, q2, q1 | |
1130 vld1.f32 {q3},[r1,:128]! | |
1131 vmin.f32 q11, q3, q1 | |
@ Main loop: lower clamp on the current batch, overlapped with loading and
@ upper-clamping the next batch before storing.
1132 1: vmax.f32 q8, q10, q0 | |
1133 vmax.f32 q9, q11, q0 | |
1134 subs r2, r2, #8 | |
1135 beq 2f | |
1136 vld1.f32 {q2},[r1,:128]! | |
1137 vmin.f32 q10, q2, q1 | |
1138 vld1.f32 {q3},[r1,:128]! | |
1139 vmin.f32 q11, q3, q1 | |
1140 vst1.f32 {q8},[r0,:128]! | |
1141 vst1.f32 {q9},[r0,:128]! | |
1142 b 1b | |
@ Store the final 8 results.
1143 2: vst1.f32 {q8},[r0,:128]! | |
1144 vst1.f32 {q9},[r0,:128]! | |
1145 bx lr | |
11443 | 1146 endfunc |