@ arm/dsputil_neon.S (libavcodec, annotate of changeset 10975:cec4a174365c)
/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

        preserve8
        .text

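@ pixels16/pixels8 and their _x2/_y2/_xy2 variants implement the put/avg
@ pixel copy and half-pel interpolation loops: r0 = dst, r1 = src,
@ r2 = line stride in bytes, r3 = number of rows (register roles as used below).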
        .macro pixels16 avg=0
        .if \avg
        mov ip, r0
        .endif
1:      vld1.64 {d0, d1}, [r1], r2
        vld1.64 {d2, d3}, [r1], r2
        vld1.64 {d4, d5}, [r1], r2
        pld [r1, r2, lsl #2]
        vld1.64 {d6, d7}, [r1], r2
        pld [r1]
        pld [r1, r2]
        pld [r1, r2, lsl #1]
        .if \avg
        vld1.64 {d16,d17}, [ip,:128], r2
        vrhadd.u8 q0, q0, q8
        vld1.64 {d18,d19}, [ip,:128], r2
        vrhadd.u8 q1, q1, q9
        vld1.64 {d20,d21}, [ip,:128], r2
        vrhadd.u8 q2, q2, q10
        vld1.64 {d22,d23}, [ip,:128], r2
        vrhadd.u8 q3, q3, q11
        .endif
        subs r3, r3, #4
        vst1.64 {d0, d1}, [r0,:128], r2
        vst1.64 {d2, d3}, [r0,:128], r2
        vst1.64 {d4, d5}, [r0,:128], r2
        vst1.64 {d6, d7}, [r0,:128], r2
        bne 1b
        bx lr
        .endm

        .macro pixels16_x2 vhadd=vrhadd.u8
1:      vld1.64 {d0-d2}, [r1], r2
        vld1.64 {d4-d6}, [r1], r2
        pld [r1]
        pld [r1, r2]
        subs r3, r3, #2
        vext.8 q1, q0, q1, #1
        \vhadd q0, q0, q1
        vext.8 q3, q2, q3, #1
        \vhadd q2, q2, q3
        vst1.64 {d0, d1}, [r0,:128], r2
        vst1.64 {d4, d5}, [r0,:128], r2
        bne 1b
        bx lr
        .endm

        .macro pixels16_y2 vhadd=vrhadd.u8
        vld1.64 {d0, d1}, [r1], r2
        vld1.64 {d2, d3}, [r1], r2
1:      subs r3, r3, #2
        \vhadd q2, q0, q1
        vld1.64 {d0, d1}, [r1], r2
        \vhadd q3, q0, q1
        vld1.64 {d2, d3}, [r1], r2
        pld [r1]
        pld [r1, r2]
        vst1.64 {d4, d5}, [r0,:128], r2
        vst1.64 {d6, d7}, [r0,:128], r2
        bne 1b
        bx lr
        .endm

        .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64 {d0-d2}, [r1], r2
        vld1.64 {d4-d6}, [r1], r2
        .if \no_rnd
        vmov.i16 q13, #1
        .endif
        pld [r1]
        pld [r1, r2]
        vext.8 q1, q0, q1, #1
        vext.8 q3, q2, q3, #1
        vaddl.u8 q8, d0, d2
        vaddl.u8 q10, d1, d3
        vaddl.u8 q9, d4, d6
        vaddl.u8 q11, d5, d7
1:      subs r3, r3, #2
        vld1.64 {d0-d2}, [r1], r2
        vadd.u16 q12, q8, q9
        pld [r1]
        .if \no_rnd
        vadd.u16 q12, q12, q13
        .endif
        vext.8 q15, q0, q1, #1
        vadd.u16 q1, q10, q11
        \vshrn d28, q12, #2
        .if \no_rnd
        vadd.u16 q1, q1, q13
        .endif
        \vshrn d29, q1, #2
        vaddl.u8 q8, d0, d30
        vld1.64 {d2-d4}, [r1], r2
        vaddl.u8 q10, d1, d31
        vst1.64 {d28,d29}, [r0,:128], r2
        vadd.u16 q12, q8, q9
        pld [r1, r2]
        .if \no_rnd
        vadd.u16 q12, q12, q13
        .endif
        vext.8 q2, q1, q2, #1
        vadd.u16 q0, q10, q11
        \vshrn d30, q12, #2
        .if \no_rnd
        vadd.u16 q0, q0, q13
        .endif
        \vshrn d31, q0, #2
        vaddl.u8 q9, d2, d4
        vaddl.u8 q11, d3, d5
        vst1.64 {d30,d31}, [r0,:128], r2
        bgt 1b
        bx lr
        .endm

        .macro pixels8 avg=0
1:      vld1.64 {d0}, [r1], r2
        vld1.64 {d1}, [r1], r2
        vld1.64 {d2}, [r1], r2
        pld [r1, r2, lsl #2]
        vld1.64 {d3}, [r1], r2
        pld [r1]
        pld [r1, r2]
        pld [r1, r2, lsl #1]
        .if \avg
        vld1.64 {d4}, [r0,:64], r2
        vrhadd.u8 d0, d0, d4
        vld1.64 {d5}, [r0,:64], r2
        vrhadd.u8 d1, d1, d5
        vld1.64 {d6}, [r0,:64], r2
        vrhadd.u8 d2, d2, d6
        vld1.64 {d7}, [r0,:64], r2
        vrhadd.u8 d3, d3, d7
        sub r0, r0, r2, lsl #2
        .endif
        subs r3, r3, #4
        vst1.64 {d0}, [r0,:64], r2
        vst1.64 {d1}, [r0,:64], r2
        vst1.64 {d2}, [r0,:64], r2
        vst1.64 {d3}, [r0,:64], r2
        bne 1b
        bx lr
        .endm

        .macro pixels8_x2 vhadd=vrhadd.u8
1:      vld1.64 {d0, d1}, [r1], r2
        vext.8 d1, d0, d1, #1
        vld1.64 {d2, d3}, [r1], r2
        vext.8 d3, d2, d3, #1
        pld [r1]
        pld [r1, r2]
        subs r3, r3, #2
        vswp d1, d2
        \vhadd q0, q0, q1
        vst1.64 {d0}, [r0,:64], r2
        vst1.64 {d1}, [r0,:64], r2
        bne 1b
        bx lr
        .endm

        .macro pixels8_y2 vhadd=vrhadd.u8
        vld1.64 {d0}, [r1], r2
        vld1.64 {d1}, [r1], r2
1:      subs r3, r3, #2
        \vhadd d4, d0, d1
        vld1.64 {d0}, [r1], r2
        \vhadd d5, d0, d1
        vld1.64 {d1}, [r1], r2
        pld [r1]
        pld [r1, r2]
        vst1.64 {d4}, [r0,:64], r2
        vst1.64 {d5}, [r0,:64], r2
        bne 1b
        bx lr
        .endm

        .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64 {d0, d1}, [r1], r2
        vld1.64 {d2, d3}, [r1], r2
        .if \no_rnd
        vmov.i16 q11, #1
        .endif
        pld [r1]
        pld [r1, r2]
        vext.8 d4, d0, d1, #1
        vext.8 d6, d2, d3, #1
        vaddl.u8 q8, d0, d4
        vaddl.u8 q9, d2, d6
1:      subs r3, r3, #2
        vld1.64 {d0, d1}, [r1], r2
        pld [r1]
        vadd.u16 q10, q8, q9
        vext.8 d4, d0, d1, #1
        .if \no_rnd
        vadd.u16 q10, q10, q11
        .endif
        vaddl.u8 q8, d0, d4
        \vshrn d5, q10, #2
        vld1.64 {d2, d3}, [r1], r2
        vadd.u16 q10, q8, q9
        pld [r1, r2]
        .if \no_rnd
        vadd.u16 q10, q10, q11
        .endif
        vst1.64 {d5}, [r0,:64], r2
        \vshrn d7, q10, #2
        vext.8 d6, d2, d3, #1
        vaddl.u8 q9, d2, d6
        vst1.64 {d7}, [r0,:64], r2
        bgt 1b
        bx lr
        .endm

        .macro pixfunc pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name \rnd_op \args
.endfunc
        .endm

        .macro pixfunc2 pfx name args:vararg
        pixfunc \pfx \name
        pixfunc \pfx \name \args
        .endm

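@ The h264 qpel mc00 entry points only set the row count in r3 and then fall
@ through into the pixel copy function that pixfunc emits directly below them.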
function ff_put_h264_qpel16_mc00_neon, export=1
        mov r3, #16
.endfunc

        pixfunc put_ pixels16
        pixfunc2 put_ pixels16_x2, _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_y2, _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov r3, #16
.endfunc

        pixfunc avg_ pixels16,, 1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov r3, #8
.endfunc

        pixfunc put_ pixels8
        pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov r3, #8
.endfunc

        pixfunc avg_ pixels8,, 1

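@ Store a block of 64 signed 16-bit coefficients (r0) as saturated unsigned
@ bytes in an 8x8 block at r1 with line stride r2.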
function ff_put_pixels_clamped_neon, export=1
        vld1.64 {d16-d19}, [r0,:128]!
        vqmovun.s16 d0, q8
        vld1.64 {d20-d23}, [r0,:128]!
        vqmovun.s16 d1, q9
        vld1.64 {d24-d27}, [r0,:128]!
        vqmovun.s16 d2, q10
        vld1.64 {d28-d31}, [r0,:128]!
        vqmovun.s16 d3, q11
        vst1.64 {d0}, [r1,:64], r2
        vqmovun.s16 d4, q12
        vst1.64 {d1}, [r1,:64], r2
        vqmovun.s16 d5, q13
        vst1.64 {d2}, [r1,:64], r2
        vqmovun.s16 d6, q14
        vst1.64 {d3}, [r1,:64], r2
        vqmovun.s16 d7, q15
        vst1.64 {d4}, [r1,:64], r2
        vst1.64 {d5}, [r1,:64], r2
        vst1.64 {d6}, [r1,:64], r2
        vst1.64 {d7}, [r1,:64], r2
        bx lr
.endfunc

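@ As above, but the coefficients are narrowed with signed saturation and then
@ offset by +128 before being stored.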
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8 d31, #128
        vld1.64 {d16-d17}, [r0,:128]!
        vqmovn.s16 d0, q8
        vld1.64 {d18-d19}, [r0,:128]!
        vqmovn.s16 d1, q9
        vld1.64 {d16-d17}, [r0,:128]!
        vqmovn.s16 d2, q8
        vld1.64 {d18-d19}, [r0,:128]!
        vadd.u8 d0, d0, d31
        vld1.64 {d20-d21}, [r0,:128]!
        vadd.u8 d1, d1, d31
        vld1.64 {d22-d23}, [r0,:128]!
        vadd.u8 d2, d2, d31
        vst1.64 {d0}, [r1,:64], r2
        vqmovn.s16 d3, q9
        vst1.64 {d1}, [r1,:64], r2
        vqmovn.s16 d4, q10
        vst1.64 {d2}, [r1,:64], r2
        vqmovn.s16 d5, q11
        vld1.64 {d24-d25}, [r0,:128]!
        vadd.u8 d3, d3, d31
        vld1.64 {d26-d27}, [r0,:128]!
        vadd.u8 d4, d4, d31
        vadd.u8 d5, d5, d31
        vst1.64 {d3}, [r1,:64], r2
        vqmovn.s16 d6, q12
        vst1.64 {d4}, [r1,:64], r2
        vqmovn.s16 d7, q13
        vst1.64 {d5}, [r1,:64], r2
        vadd.u8 d6, d6, d31
        vadd.u8 d7, d7, d31
        vst1.64 {d6}, [r1,:64], r2
        vst1.64 {d7}, [r1,:64], r2
        bx lr
.endfunc

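@ Add the 16-bit residual block at r0 to the 8x8 pixel block at r1 (line
@ stride r2), saturating the result to unsigned bytes.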
function ff_add_pixels_clamped_neon, export=1
        mov r3, r1
        vld1.64 {d16}, [r1,:64], r2
        vld1.64 {d0-d1}, [r0,:128]!
        vaddw.u8 q0, q0, d16
        vld1.64 {d17}, [r1,:64], r2
        vld1.64 {d2-d3}, [r0,:128]!
        vqmovun.s16 d0, q0
        vld1.64 {d18}, [r1,:64], r2
        vaddw.u8 q1, q1, d17
        vld1.64 {d4-d5}, [r0,:128]!
        vaddw.u8 q2, q2, d18
        vst1.64 {d0}, [r3,:64], r2
        vqmovun.s16 d2, q1
        vld1.64 {d19}, [r1,:64], r2
        vld1.64 {d6-d7}, [r0,:128]!
        vaddw.u8 q3, q3, d19
        vqmovun.s16 d4, q2
        vst1.64 {d2}, [r3,:64], r2
        vld1.64 {d16}, [r1,:64], r2
        vqmovun.s16 d6, q3
        vld1.64 {d0-d1}, [r0,:128]!
        vaddw.u8 q0, q0, d16
        vst1.64 {d4}, [r3,:64], r2
        vld1.64 {d17}, [r1,:64], r2
        vld1.64 {d2-d3}, [r0,:128]!
        vaddw.u8 q1, q1, d17
        vst1.64 {d6}, [r3,:64], r2
        vqmovun.s16 d0, q0
        vld1.64 {d18}, [r1,:64], r2
        vld1.64 {d4-d5}, [r0,:128]!
        vaddw.u8 q2, q2, d18
        vst1.64 {d0}, [r3,:64], r2
        vqmovun.s16 d2, q1
        vld1.64 {d19}, [r1,:64], r2
        vqmovun.s16 d4, q2
        vld1.64 {d6-d7}, [r0,:128]!
        vaddw.u8 q3, q3, d19
        vst1.64 {d2}, [r3,:64], r2
        vqmovun.s16 d6, q3
        vst1.64 {d4}, [r3,:64], r2
        vst1.64 {d6}, [r3,:64], r2
        bx lr
.endfunc

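@ Convert floats at r1 to signed 16-bit integers at r0; r2 = sample count.
@ The conversion goes through 16.16 fixed point (vcvt with #16 fraction bits,
@ then a narrowing right shift by 16), which gives saturation for free.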
function ff_float_to_int16_neon, export=1
        subs r2, r2, #8
        vld1.64 {d0-d1}, [r1,:128]!
        vcvt.s32.f32 q8, q0, #16
        vld1.64 {d2-d3}, [r1,:128]!
        vcvt.s32.f32 q9, q1, #16
        beq 3f
        bics ip, r2, #15
        beq 2f
1:      subs ip, ip, #16
        vshrn.s32 d4, q8, #16
        vld1.64 {d0-d1}, [r1,:128]!
        vcvt.s32.f32 q0, q0, #16
        vshrn.s32 d5, q9, #16
        vld1.64 {d2-d3}, [r1,:128]!
        vcvt.s32.f32 q1, q1, #16
        vshrn.s32 d6, q0, #16
        vst1.64 {d4-d5}, [r0,:128]!
        vshrn.s32 d7, q1, #16
        vld1.64 {d16-d17},[r1,:128]!
        vcvt.s32.f32 q8, q8, #16
        vld1.64 {d18-d19},[r1,:128]!
        vcvt.s32.f32 q9, q9, #16
        vst1.64 {d6-d7}, [r0,:128]!
        bne 1b
        ands r2, r2, #15
        beq 3f
2:      vld1.64 {d0-d1}, [r1,:128]!
        vshrn.s32 d4, q8, #16
        vcvt.s32.f32 q0, q0, #16
        vld1.64 {d2-d3}, [r1,:128]!
        vshrn.s32 d5, q9, #16
        vcvt.s32.f32 q1, q1, #16
        vshrn.s32 d6, q0, #16
        vst1.64 {d4-d5}, [r0,:128]!
        vshrn.s32 d7, q1, #16
        vst1.64 {d6-d7}, [r0,:128]!
        bx lr
3:      vshrn.s32 d4, q8, #16
        vshrn.s32 d5, q9, #16
        vst1.64 {d4-d5}, [r0,:128]!
        bx lr
.endfunc

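@ Interleaved variant: r0 = dst, r1 = array of channel pointers, r2 = samples
@ per channel, r3 = channel count. Mono is forwarded to ff_float_to_int16_neon,
@ stereo has a dedicated path, and higher channel counts are processed in
@ groups of four, two and one channel (ip = output stride in bytes).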
function ff_float_to_int16_interleave_neon, export=1
        cmp r3, #2
        ldrlt r1, [r1]
        blt ff_float_to_int16_neon
        bne 4f

        ldr r3, [r1]
        ldr r1, [r1, #4]

        subs r2, r2, #8
        vld1.64 {d0-d1}, [r3,:128]!
        vcvt.s32.f32 q8, q0, #16
        vld1.64 {d2-d3}, [r3,:128]!
        vcvt.s32.f32 q9, q1, #16
        vld1.64 {d20-d21},[r1,:128]!
        vcvt.s32.f32 q10, q10, #16
        vld1.64 {d22-d23},[r1,:128]!
        vcvt.s32.f32 q11, q11, #16
        beq 3f
        bics ip, r2, #15
        beq 2f
1:      subs ip, ip, #16
        vld1.64 {d0-d1}, [r3,:128]!
        vcvt.s32.f32 q0, q0, #16
        vsri.32 q10, q8, #16
        vld1.64 {d2-d3}, [r3,:128]!
        vcvt.s32.f32 q1, q1, #16
        vld1.64 {d24-d25},[r1,:128]!
        vcvt.s32.f32 q12, q12, #16
        vld1.64 {d26-d27},[r1,:128]!
        vsri.32 q11, q9, #16
        vst1.64 {d20-d21},[r0,:128]!
        vcvt.s32.f32 q13, q13, #16
        vst1.64 {d22-d23},[r0,:128]!
        vsri.32 q12, q0, #16
        vld1.64 {d16-d17},[r3,:128]!
        vsri.32 q13, q1, #16
        vst1.64 {d24-d25},[r0,:128]!
        vcvt.s32.f32 q8, q8, #16
        vld1.64 {d18-d19},[r3,:128]!
        vcvt.s32.f32 q9, q9, #16
        vld1.64 {d20-d21},[r1,:128]!
        vcvt.s32.f32 q10, q10, #16
        vld1.64 {d22-d23},[r1,:128]!
        vcvt.s32.f32 q11, q11, #16
        vst1.64 {d26-d27},[r0,:128]!
        bne 1b
        ands r2, r2, #15
        beq 3f
2:      vsri.32 q10, q8, #16
        vld1.64 {d0-d1}, [r3,:128]!
        vcvt.s32.f32 q0, q0, #16
        vld1.64 {d2-d3}, [r3,:128]!
        vcvt.s32.f32 q1, q1, #16
        vld1.64 {d24-d25},[r1,:128]!
        vcvt.s32.f32 q12, q12, #16
        vsri.32 q11, q9, #16
        vld1.64 {d26-d27},[r1,:128]!
        vcvt.s32.f32 q13, q13, #16
        vst1.64 {d20-d21},[r0,:128]!
        vsri.32 q12, q0, #16
        vst1.64 {d22-d23},[r0,:128]!
        vsri.32 q13, q1, #16
        vst1.64 {d24-d27},[r0,:128]!
        bx lr
3:      vsri.32 q10, q8, #16
        vsri.32 q11, q9, #16
        vst1.64 {d20-d23},[r0,:128]!
        bx lr

4:      push {r4-r8,lr}
        cmp r3, #4
        lsl ip, r3, #1
        blt 4f

        @ 4 channels
5:      ldmia r1!, {r4-r7}
        mov lr, r2
        mov r8, r0
        vld1.64 {d16-d17},[r4,:128]!
        vcvt.s32.f32 q8, q8, #16
        vld1.64 {d18-d19},[r5,:128]!
        vcvt.s32.f32 q9, q9, #16
        vld1.64 {d20-d21},[r6,:128]!
        vcvt.s32.f32 q10, q10, #16
        vld1.64 {d22-d23},[r7,:128]!
        vcvt.s32.f32 q11, q11, #16
6:      subs lr, lr, #8
        vld1.64 {d0-d1}, [r4,:128]!
        vcvt.s32.f32 q0, q0, #16
        vsri.32 q9, q8, #16
        vld1.64 {d2-d3}, [r5,:128]!
        vcvt.s32.f32 q1, q1, #16
        vsri.32 q11, q10, #16
        vld1.64 {d4-d5}, [r6,:128]!
        vcvt.s32.f32 q2, q2, #16
        vzip.32 d18, d22
        vld1.64 {d6-d7}, [r7,:128]!
        vcvt.s32.f32 q3, q3, #16
        vzip.32 d19, d23
        vst1.64 {d18}, [r8], ip
        vsri.32 q1, q0, #16
        vst1.64 {d22}, [r8], ip
        vsri.32 q3, q2, #16
        vst1.64 {d19}, [r8], ip
        vzip.32 d2, d6
        vst1.64 {d23}, [r8], ip
        vzip.32 d3, d7
        beq 7f
        vld1.64 {d16-d17},[r4,:128]!
        vcvt.s32.f32 q8, q8, #16
        vst1.64 {d2}, [r8], ip
        vld1.64 {d18-d19},[r5,:128]!
        vcvt.s32.f32 q9, q9, #16
        vst1.64 {d6}, [r8], ip
        vld1.64 {d20-d21},[r6,:128]!
        vcvt.s32.f32 q10, q10, #16
        vst1.64 {d3}, [r8], ip
        vld1.64 {d22-d23},[r7,:128]!
        vcvt.s32.f32 q11, q11, #16
        vst1.64 {d7}, [r8], ip
        b 6b
7:      vst1.64 {d2}, [r8], ip
        vst1.64 {d6}, [r8], ip
        vst1.64 {d3}, [r8], ip
        vst1.64 {d7}, [r8], ip
        subs r3, r3, #4
        popeq {r4-r8,pc}
        cmp r3, #4
        add r0, r0, #8
        bge 5b

        @ 2 channels
4:      cmp r3, #2
        blt 4f
        ldmia r1!, {r4-r5}
        mov lr, r2
        mov r8, r0
        tst lr, #8
        vld1.64 {d16-d17},[r4,:128]!
        vcvt.s32.f32 q8, q8, #16
        vld1.64 {d18-d19},[r5,:128]!
        vcvt.s32.f32 q9, q9, #16
        vld1.64 {d20-d21},[r4,:128]!
        vcvt.s32.f32 q10, q10, #16
        vld1.64 {d22-d23},[r5,:128]!
        vcvt.s32.f32 q11, q11, #16
        beq 6f
        subs lr, lr, #8
        beq 7f
        vsri.32 d18, d16, #16
        vsri.32 d19, d17, #16
        vld1.64 {d16-d17},[r4,:128]!
        vcvt.s32.f32 q8, q8, #16
        vst1.32 {d18[0]}, [r8], ip
        vsri.32 d22, d20, #16
        vst1.32 {d18[1]}, [r8], ip
        vsri.32 d23, d21, #16
        vst1.32 {d19[0]}, [r8], ip
        vst1.32 {d19[1]}, [r8], ip
        vld1.64 {d18-d19},[r5,:128]!
        vcvt.s32.f32 q9, q9, #16
        vst1.32 {d22[0]}, [r8], ip
        vst1.32 {d22[1]}, [r8], ip
        vld1.64 {d20-d21},[r4,:128]!
        vcvt.s32.f32 q10, q10, #16
        vst1.32 {d23[0]}, [r8], ip
        vst1.32 {d23[1]}, [r8], ip
        vld1.64 {d22-d23},[r5,:128]!
        vcvt.s32.f32 q11, q11, #16
6:      subs lr, lr, #16
        vld1.64 {d0-d1}, [r4,:128]!
        vcvt.s32.f32 q0, q0, #16
        vsri.32 d18, d16, #16
        vld1.64 {d2-d3}, [r5,:128]!
        vcvt.s32.f32 q1, q1, #16
        vsri.32 d19, d17, #16
        vld1.64 {d4-d5}, [r4,:128]!
        vcvt.s32.f32 q2, q2, #16
        vld1.64 {d6-d7}, [r5,:128]!
        vcvt.s32.f32 q3, q3, #16
        vst1.32 {d18[0]}, [r8], ip
        vsri.32 d22, d20, #16
        vst1.32 {d18[1]}, [r8], ip
        vsri.32 d23, d21, #16
        vst1.32 {d19[0]}, [r8], ip
        vsri.32 d2, d0, #16
        vst1.32 {d19[1]}, [r8], ip
        vsri.32 d3, d1, #16
        vst1.32 {d22[0]}, [r8], ip
        vsri.32 d6, d4, #16
        vst1.32 {d22[1]}, [r8], ip
        vsri.32 d7, d5, #16
        vst1.32 {d23[0]}, [r8], ip
        vst1.32 {d23[1]}, [r8], ip
        beq 6f
        vld1.64 {d16-d17},[r4,:128]!
        vcvt.s32.f32 q8, q8, #16
        vst1.32 {d2[0]}, [r8], ip
        vst1.32 {d2[1]}, [r8], ip
        vld1.64 {d18-d19},[r5,:128]!
        vcvt.s32.f32 q9, q9, #16
        vst1.32 {d3[0]}, [r8], ip
        vst1.32 {d3[1]}, [r8], ip
        vld1.64 {d20-d21},[r4,:128]!
        vcvt.s32.f32 q10, q10, #16
        vst1.32 {d6[0]}, [r8], ip
        vst1.32 {d6[1]}, [r8], ip
        vld1.64 {d22-d23},[r5,:128]!
        vcvt.s32.f32 q11, q11, #16
        vst1.32 {d7[0]}, [r8], ip
        vst1.32 {d7[1]}, [r8], ip
        bgt 6b
6:      vst1.32 {d2[0]}, [r8], ip
        vst1.32 {d2[1]}, [r8], ip
        vst1.32 {d3[0]}, [r8], ip
        vst1.32 {d3[1]}, [r8], ip
        vst1.32 {d6[0]}, [r8], ip
        vst1.32 {d6[1]}, [r8], ip
        vst1.32 {d7[0]}, [r8], ip
        vst1.32 {d7[1]}, [r8], ip
        b 8f
7:      vsri.32 d18, d16, #16
        vsri.32 d19, d17, #16
        vst1.32 {d18[0]}, [r8], ip
        vsri.32 d22, d20, #16
        vst1.32 {d18[1]}, [r8], ip
        vsri.32 d23, d21, #16
        vst1.32 {d19[0]}, [r8], ip
        vst1.32 {d19[1]}, [r8], ip
        vst1.32 {d22[0]}, [r8], ip
        vst1.32 {d22[1]}, [r8], ip
        vst1.32 {d23[0]}, [r8], ip
        vst1.32 {d23[1]}, [r8], ip
8:      subs r3, r3, #2
        add r0, r0, #4
        popeq {r4-r8,pc}

        @ 1 channel
4:      ldr r4, [r1],#4
        tst r2, #8
        mov lr, r2
        mov r5, r0
        vld1.64 {d0-d1}, [r4,:128]!
        vcvt.s32.f32 q0, q0, #16
        vld1.64 {d2-d3}, [r4,:128]!
        vcvt.s32.f32 q1, q1, #16
        bne 8f
6:      subs lr, lr, #16
        vld1.64 {d4-d5}, [r4,:128]!
        vcvt.s32.f32 q2, q2, #16
        vld1.64 {d6-d7}, [r4,:128]!
        vcvt.s32.f32 q3, q3, #16
        vst1.16 {d0[1]}, [r5,:16], ip
        vst1.16 {d0[3]}, [r5,:16], ip
        vst1.16 {d1[1]}, [r5,:16], ip
        vst1.16 {d1[3]}, [r5,:16], ip
        vst1.16 {d2[1]}, [r5,:16], ip
        vst1.16 {d2[3]}, [r5,:16], ip
        vst1.16 {d3[1]}, [r5,:16], ip
        vst1.16 {d3[3]}, [r5,:16], ip
        beq 7f
        vld1.64 {d0-d1}, [r4,:128]!
        vcvt.s32.f32 q0, q0, #16
        vld1.64 {d2-d3}, [r4,:128]!
        vcvt.s32.f32 q1, q1, #16
7:      vst1.16 {d4[1]}, [r5,:16], ip
        vst1.16 {d4[3]}, [r5,:16], ip
        vst1.16 {d5[1]}, [r5,:16], ip
        vst1.16 {d5[3]}, [r5,:16], ip
        vst1.16 {d6[1]}, [r5,:16], ip
        vst1.16 {d6[3]}, [r5,:16], ip
        vst1.16 {d7[1]}, [r5,:16], ip
        vst1.16 {d7[3]}, [r5,:16], ip
        bgt 6b
        pop {r4-r8,pc}
8:      subs lr, lr, #8
        vst1.16 {d0[1]}, [r5,:16], ip
        vst1.16 {d0[3]}, [r5,:16], ip
        vst1.16 {d1[1]}, [r5,:16], ip
        vst1.16 {d1[3]}, [r5,:16], ip
        vst1.16 {d2[1]}, [r5,:16], ip
        vst1.16 {d2[3]}, [r5,:16], ip
        vst1.16 {d3[1]}, [r5,:16], ip
        vst1.16 {d3[3]}, [r5,:16], ip
        popeq {r4-r8,pc}
        vld1.64 {d0-d1}, [r4,:128]!
        vcvt.s32.f32 q0, q0, #16
        vld1.64 {d2-d3}, [r4,:128]!
        vcvt.s32.f32 q1, q1, #16
        b 6b
.endfunc

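@ Multiply the float vector at r0 element-wise by the one at r1, writing the
@ result back over r0; r2 = length (the code assumes a multiple of 8).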
function ff_vector_fmul_neon, export=1
        mov r3, r0
        subs r2, r2, #8
        vld1.64 {d0-d3}, [r0,:128]!
        vld1.64 {d4-d7}, [r1,:128]!
        vmul.f32 q8, q0, q2
        vmul.f32 q9, q1, q3
        beq 3f
        bics ip, r2, #15
        beq 2f
1:      subs ip, ip, #16
        vld1.64 {d0-d1}, [r0,:128]!
        vld1.64 {d4-d5}, [r1,:128]!
        vmul.f32 q10, q0, q2
        vld1.64 {d2-d3}, [r0,:128]!
        vld1.64 {d6-d7}, [r1,:128]!
        vmul.f32 q11, q1, q3
        vst1.64 {d16-d19},[r3,:128]!
        vld1.64 {d0-d1}, [r0,:128]!
        vld1.64 {d4-d5}, [r1,:128]!
        vmul.f32 q8, q0, q2
        vld1.64 {d2-d3}, [r0,:128]!
        vld1.64 {d6-d7}, [r1,:128]!
        vmul.f32 q9, q1, q3
        vst1.64 {d20-d23},[r3,:128]!
        bne 1b
        ands r2, r2, #15
        beq 3f
2:      vld1.64 {d0-d1}, [r0,:128]!
        vld1.64 {d4-d5}, [r1,:128]!
        vst1.64 {d16-d17},[r3,:128]!
        vmul.f32 q8, q0, q2
        vld1.64 {d2-d3}, [r0,:128]!
        vld1.64 {d6-d7}, [r1,:128]!
        vst1.64 {d18-d19},[r3,:128]!
        vmul.f32 q9, q1, q3
3:      vst1.64 {d16-d19},[r3,:128]!
        bx lr
.endfunc

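@ Overlap-add windowing: r0 = dst, r1 and r2 = the two source vectors, r3 =
@ window (register mapping assumed from the usual C prototype); the bias and
@ length come from s0 and the stack on VFP builds, or both from the stack on
@ softfloat builds (the VFP/NOVFP lines below).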
function ff_vector_fmul_window_neon, export=1
VFP     vdup.32 q8, d0[0]
NOVFP   vld1.32 {d16[],d17[]}, [sp,:32]
        push {r4,r5,lr}
VFP     ldr lr, [sp, #12]
NOVFP   ldr lr, [sp, #16]
        sub r2, r2, #8
        sub r5, lr, #2
        add r2, r2, r5, lsl #2
        add r4, r3, r5, lsl #3
        add ip, r0, r5, lsl #3
        mov r5, #-16
        vld1.64 {d0,d1}, [r1,:128]!
        vld1.64 {d2,d3}, [r2,:128], r5
        vld1.64 {d4,d5}, [r3,:128]!
        vld1.64 {d6,d7}, [r4,:128], r5
1:      subs lr, lr, #4
        vmov q11, q8
        vmla.f32 d22, d0, d4
        vmov q10, q8
        vmla.f32 d23, d1, d5
        vrev64.32 q3, q3
        vmla.f32 d20, d0, d7
        vrev64.32 q1, q1
        vmla.f32 d21, d1, d6
        beq 2f
        vmla.f32 d22, d3, d7
        vld1.64 {d0,d1}, [r1,:128]!
        vmla.f32 d23, d2, d6
        vld1.64 {d18,d19},[r2,:128], r5
        vmls.f32 d20, d3, d4
        vld1.64 {d24,d25},[r3,:128]!
        vmls.f32 d21, d2, d5
        vld1.64 {d6,d7}, [r4,:128], r5
        vmov q1, q9
        vrev64.32 q11, q11
        vmov q2, q12
        vswp d22, d23
        vst1.64 {d20,d21},[r0,:128]!
        vst1.64 {d22,d23},[ip,:128], r5
        b 1b
2:      vmla.f32 d22, d3, d7
        vmla.f32 d23, d2, d6
        vmls.f32 d20, d3, d4
        vmls.f32 d21, d2, d5
        vrev64.32 q11, q11
        vswp d22, d23
        vst1.64 {d20,d21},[r0,:128]!
        vst1.64 {d22,d23},[ip,:128], r5
        pop {r4,r5,pc}
.endfunc

#if CONFIG_VORBIS_DECODER
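@ Vorbis inverse coupling of the two channel vectors at r0 and r1 (r2 =
@ length); both vectors are rewritten in place.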
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32 q10, #1<<31
        subs r2, r2, #4
        mov r3, r0
        mov r12, r1
        beq 3f

        vld1.32 {d24-d25},[r1,:128]!
        vld1.32 {d22-d23},[r0,:128]!
        vcle.s32 q8, q12, #0
        vand q9, q11, q10
        veor q12, q12, q9
        vand q2, q12, q8
        vbic q3, q12, q8
        vadd.f32 q12, q11, q2
        vsub.f32 q11, q11, q3
1:      vld1.32 {d2-d3}, [r1,:128]!
        vld1.32 {d0-d1}, [r0,:128]!
        vcle.s32 q8, q1, #0
        vand q9, q0, q10
        veor q1, q1, q9
        vst1.32 {d24-d25},[r3, :128]!
        vst1.32 {d22-d23},[r12,:128]!
        vand q2, q1, q8
        vbic q3, q1, q8
        vadd.f32 q1, q0, q2
        vsub.f32 q0, q0, q3
        subs r2, r2, #8
        ble 2f
        vld1.32 {d24-d25},[r1,:128]!
        vld1.32 {d22-d23},[r0,:128]!
        vcle.s32 q8, q12, #0
        vand q9, q11, q10
        veor q12, q12, q9
        vst1.32 {d2-d3}, [r3, :128]!
        vst1.32 {d0-d1}, [r12,:128]!
        vand q2, q12, q8
        vbic q3, q12, q8
        vadd.f32 q12, q11, q2
        vsub.f32 q11, q11, q3
        b 1b

2:      vst1.32 {d2-d3}, [r3, :128]!
        vst1.32 {d0-d1}, [r12,:128]!
        bxlt lr

3:      vld1.32 {d2-d3}, [r1,:128]
        vld1.32 {d0-d1}, [r0,:128]
        vcle.s32 q8, q1, #0
        vand q9, q0, q10
        veor q1, q1, q9
        vand q2, q1, q8
        vbic q3, q1, q8
        vadd.f32 q1, q0, q2
        vsub.f32 q0, q0, q3
        vst1.32 {d2-d3}, [r0,:128]!
        vst1.32 {d0-d1}, [r1,:128]!
        bx lr
.endfunc
#endif

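@ dst (r0) = src (r1) * scalar; the scalar and length arrive in s0/r2 on VFP
@ builds or r2/r3 on softfloat builds.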
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32 q8, d0[0]
NOVFP   vdup.32 q8, r2
        bics r12, len, #15
        beq 3f
        vld1.32 {q0},[r1,:128]!
        vld1.32 {q1},[r1,:128]!
1:      vmul.f32 q0, q0, q8
        vld1.32 {q2},[r1,:128]!
        vmul.f32 q1, q1, q8
        vld1.32 {q3},[r1,:128]!
        vmul.f32 q2, q2, q8
        vst1.32 {q0},[r0,:128]!
        vmul.f32 q3, q3, q8
        vst1.32 {q1},[r0,:128]!
        subs r12, r12, #16
        beq 2f
        vld1.32 {q0},[r1,:128]!
        vst1.32 {q2},[r0,:128]!
        vld1.32 {q1},[r1,:128]!
        vst1.32 {q3},[r0,:128]!
        b 1b
2:      vst1.32 {q2},[r0,:128]!
        vst1.32 {q3},[r0,:128]!
        ands len, len, #15
        bxeq lr
3:      vld1.32 {q0},[r1,:128]!
        vmul.f32 q0, q0, q8
        vst1.32 {q0},[r0,:128]!
        subs len, len, #4
        bgt 3b
        bx lr
        .unreq len
.endfunc

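@ The _sv_scalar variants multiply src (r1) by a scalar and by short vectors
@ addressed through the pointer array in r2, two or four floats at a time,
@ writing the products contiguously to dst (r0).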
function ff_vector_fmul_sv_scalar_2_neon, export=1
VFP     vdup.32 d16, d0[0]
NOVFP   vdup.32 d16, r3
NOVFP   ldr r3, [sp]
        vld1.32 {d0},[r1,:64]!
        vld1.32 {d1},[r1,:64]!
1:      subs r3, r3, #4
        vmul.f32 d4, d0, d16
        vmul.f32 d5, d1, d16
        ldr r12, [r2], #4
        vld1.32 {d2},[r12,:64]
        ldr r12, [r2], #4
        vld1.32 {d3},[r12,:64]
        vmul.f32 d4, d4, d2
        vmul.f32 d5, d5, d3
        beq 2f
        vld1.32 {d0},[r1,:64]!
        vld1.32 {d1},[r1,:64]!
        vst1.32 {d4},[r0,:64]!
        vst1.32 {d5},[r0,:64]!
        b 1b
2:      vst1.32 {d4},[r0,:64]!
        vst1.32 {d5},[r0,:64]!
        bx lr
.endfunc

function ff_vector_fmul_sv_scalar_4_neon, export=1
VFP     vdup.32 q10, d0[0]
NOVFP   vdup.32 q10, r3
NOVFP   ldr r3, [sp]
        push {lr}
        bics lr, r3, #7
        beq 3f
        vld1.32 {q0},[r1,:128]!
        vld1.32 {q2},[r1,:128]!
1:      ldr r12, [r2], #4
        vld1.32 {q1},[r12,:128]
        ldr r12, [r2], #4
        vld1.32 {q3},[r12,:128]
        vmul.f32 q8, q0, q10
        vmul.f32 q8, q8, q1
        vmul.f32 q9, q2, q10
        vmul.f32 q9, q9, q3
        subs lr, lr, #8
        beq 2f
        vld1.32 {q0},[r1,:128]!
        vld1.32 {q2},[r1,:128]!
        vst1.32 {q8},[r0,:128]!
        vst1.32 {q9},[r0,:128]!
        b 1b
2:      vst1.32 {q8},[r0,:128]!
        vst1.32 {q9},[r0,:128]!
        ands r3, r3, #7
        popeq {pc}
3:      vld1.32 {q0},[r1,:128]!
        ldr r12, [r2], #4
        vld1.32 {q1},[r12,:128]
        vmul.f32 q0, q0, q10
        vmul.f32 q0, q0, q1
        vst1.32 {q0},[r0,:128]!
        subs r3, r3, #4
        bgt 3b
        pop {pc}
.endfunc

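@ The sv_fmul_scalar variants gather two- or four-float vectors through the
@ pointer array at r1, scale them by the scalar, and store contiguously at r0.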
function ff_sv_fmul_scalar_2_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32 q8, d0[0]
NOVFP   vdup.32 q8, r2
        ldr r12, [r1], #4
        vld1.32 {d0},[r12,:64]
        ldr r12, [r1], #4
        vld1.32 {d1},[r12,:64]
1:      vmul.f32 q1, q0, q8
        subs len, len, #4
        beq 2f
        ldr r12, [r1], #4
        vld1.32 {d0},[r12,:64]
        ldr r12, [r1], #4
        vld1.32 {d1},[r12,:64]
        vst1.32 {q1},[r0,:128]!
        b 1b
2:      vst1.32 {q1},[r0,:128]!
        bx lr
        .unreq len
.endfunc

function ff_sv_fmul_scalar_4_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32 q8, d0[0]
NOVFP   vdup.32 q8, r2
1:      ldr r12, [r1], #4
        vld1.32 {q0},[r12,:128]
        vmul.f32 q0, q0, q8
        vst1.32 {q0},[r0,:128]!
        subs len, len, #4
        bgt 1b
        bx lr
        .unreq len
.endfunc

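@ Butterflies: for each element, r0[i] and r1[i] become r0[i]+r1[i] and
@ r0[i]-r1[i], in place; r2 = length.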
function ff_butterflies_float_neon, export=1
1:      vld1.32 {q0},[r0,:128]
        vld1.32 {q1},[r1,:128]
        vsub.f32 q2, q0, q1
        vadd.f32 q1, q0, q1
        vst1.32 {q2},[r1,:128]!
        vst1.32 {q1},[r0,:128]!
        subs r2, r2, #4
        bgt 1b
        bx lr
.endfunc

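@ Dot product of the float vectors at r0 and r1 (r2 = length); the result ends
@ up in s0, and is moved to r0 on softfloat builds (NOVFP).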
function ff_scalarproduct_float_neon, export=1
        vmov.f32 q2, #0.0
1:      vld1.32 {q0},[r0,:128]!
        vld1.32 {q1},[r1,:128]!
        vmla.f32 q2, q0, q1
        subs r2, r2, #4
        bgt 1b
        vadd.f32 d0, d4, d5
        vpadd.f32 d0, d0, d0
NOVFP   vmov.32 r0, d0[0]
        bx lr
.endfunc

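@ Convert 32-bit integers at r1 to floats scaled by a scalar and store at r0;
@ the scalar and length come in s0/r2 (VFP) or r2/r3 (softfloat).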
function ff_int32_to_float_fmul_scalar_neon, export=1
VFP     vdup.32 q0, d0[0]
VFP     len .req r2
NOVFP   vdup.32 q0, r2
NOVFP   len .req r3

        vld1.32 {q1},[r1,:128]!
        vcvt.f32.s32 q3, q1
        vld1.32 {q2},[r1,:128]!
        vcvt.f32.s32 q8, q2
1:      subs len, len, #8
        pld [r1, #16]
        vmul.f32 q9, q3, q0
        vmul.f32 q10, q8, q0
        beq 2f
        vld1.32 {q1},[r1,:128]!
        vcvt.f32.s32 q3, q1
        vld1.32 {q2},[r1,:128]!
        vcvt.f32.s32 q8, q2
        vst1.32 {q9}, [r0,:128]!
        vst1.32 {q10},[r0,:128]!
        b 1b
2:      vst1.32 {q9}, [r0,:128]!
        vst1.32 {q10},[r0,:128]!
        bx lr
        .unreq len
.endfunc

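@ dst[i] = src0[i] * src1[len-1-i]: r0 = dst, r1 = src0, r2 = src1, r3 = len.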
function ff_vector_fmul_reverse_neon, export=1
        add r2, r2, r3, lsl #2
        sub r2, r2, #32
        mov r12, #-32
        vld1.32 {q0-q1}, [r1,:128]!
        vld1.32 {q2-q3}, [r2,:128], r12
1:      pld [r1, #32]
        vrev64.32 q3, q3
        vmul.f32 d16, d0, d7
        vmul.f32 d17, d1, d6
        pld [r2, #-32]
        vrev64.32 q2, q2
        vmul.f32 d18, d2, d5
        vmul.f32 d19, d3, d4
        subs r3, r3, #8
        beq 2f
        vld1.32 {q0-q1}, [r1,:128]!
        vld1.32 {q2-q3}, [r2,:128], r12
        vst1.32 {q8-q9}, [r0,:128]!
        b 1b
2:      vst1.32 {q8-q9}, [r0,:128]!
        bx lr
.endfunc

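@ dst[i] = src0[i]*src1[i] + src2[i]: r0 = dst, r1/r2/r3 = the three sources,
@ length on the stack.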
function ff_vector_fmul_add_neon, export=1
        ldr r12, [sp]
        vld1.32 {q0-q1}, [r1,:128]!
        vld1.32 {q8-q9}, [r2,:128]!
        vld1.32 {q2-q3}, [r3,:128]!
        vmul.f32 q10, q0, q8
        vmul.f32 q11, q1, q9
1:      vadd.f32 q12, q2, q10
        vadd.f32 q13, q3, q11
        pld [r1, #16]
        pld [r2, #16]
        pld [r3, #16]
        subs r12, r12, #8
        beq 2f
        vld1.32 {q0}, [r1,:128]!
        vld1.32 {q8}, [r2,:128]!
        vmul.f32 q10, q0, q8
        vld1.32 {q1}, [r1,:128]!
        vld1.32 {q9}, [r2,:128]!
        vmul.f32 q11, q1, q9
        vld1.32 {q2-q3}, [r3,:128]!
        vst1.32 {q12-q13},[r0,:128]!
        b 1b
2:      vst1.32 {q12-q13},[r0,:128]!
        bx lr
.endfunc

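@ Clamp each float from src (r1) to [min, max] and store at dst (r0); min, max
@ and the length are passed in s0, s1 and r2 on VFP builds, or in r2, r3 and
@ on the stack on softfloat builds.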
function ff_vector_clipf_neon, export=1
VFP     vdup.32 q1, d0[1]
VFP     vdup.32 q0, d0[0]
NOVFP   vdup.32 q0, r2
NOVFP   vdup.32 q1, r3
NOVFP   ldr r2, [sp]
        vld1.f32 {q2},[r1,:128]!
        vmin.f32 q10, q2, q1
        vld1.f32 {q3},[r1,:128]!
        vmin.f32 q11, q3, q1
1:      vmax.f32 q8, q10, q0
        vmax.f32 q9, q11, q0
        subs r2, r2, #8
        beq 2f
        vld1.f32 {q2},[r1,:128]!
        vmin.f32 q10, q2, q1
        vld1.f32 {q3},[r1,:128]!
        vmin.f32 q11, q3, q1
        vst1.f32 {q8},[r0,:128]!
        vst1.f32 {q9},[r0,:128]!
        b 1b
2:      vst1.f32 {q8},[r0,:128]!
        vst1.f32 {q9},[r0,:128]!
        bx lr
.endfunc